Asmcodes: Noekeon Block cipher

Introduction

Noekeon is a 128-bit block cipher designed by Joan Daemen, Michaël Peeters, Gilles Van Assche, Vincent Rijmen and submitted to the NESSIE project in September 2000.

NOEKEON is a family of two block ciphers: “direct mode” NOEKEON, to be used for maximum efficiency where related-key attacks are not possible, and “indirect mode” NOEKEON, to be used where they are.

Cryptanalysis by Lars Knudsen and Håvard Raddum in April 2001 showed that “indirect mode” NOEKEON was still vulnerable to certain peculiar kinds of related-key cryptanalysis, and showed weaknesses in NOEKEON-variant ciphers which cast doubt on the design strategy behind NOEKEON and thus on its security. As a result, it was not a NESSIE selected algorithm.

The authors of NOEKEON contend in On NOEKEON, no! that the related-key attacks required to break “indirect mode” NOEKEON are not a practical concern, and that it is as a result of deliberate design that NOEKEON is not vulnerable to the attacks that break the variant ciphers; they assert that NOEKEON is still a good and useful cipher.

Noekeon, The Return, presented in January 2010, argues that hardened versions are still suitable for resource-constrained environments.

About the code

The C code is derived from the reference sources submitted to NESSIE by the authors. Note that the initial snippets of code shown here do not convert the endianness of the data and key before and after the encryption/decryption process; only the reduced version shown at the end of the post performs the conversion.

Omitting the endian conversion on x86 obviously invalidates ciphertext results when comparing with test vectors, but it does not affect the security of the cipher itself.

Direct and Indirect Mode

The only difference between the two is that with indirect mode we encrypt 16 null bytes using the master key and the resulting ciphertext is used as key for encryption and decryption.

Because it’s such a simple step, I’ve not included it here.

Gamma

Gamma is an involutive (inverse of itself) non-linear mapping that operates on the state.

It can alternatively be specified as a 16-entry S-box (a 16-byte lookup table) applied to each of the 4-bit boxes of the state, and is essentially the function that provides the cipher's non-linearity. I have not investigated whether using an S-box would result in smaller code.

// Gamma: non-linear layer, applied in place to the four state words a[0..3].
// The transform is its own inverse: calling it twice restores the input.
void Gamma(uint32_t *a)
{
  uint32_t w0 = a[0];
  uint32_t w1 = a[1];
  uint32_t w2 = a[2];
  uint32_t w3 = a[3];
  uint32_t held;

  // first non-linear step
  w1 ^= ~(w2 | w3);
  w0 ^= w1 & w2;

  // linear step: exchange words 0 and 3, then fold the others into word 2
  held = w0;
  w0   = w3;
  w3   = held;
  w2  ^= w3 ^ w1 ^ w0;

  // second non-linear step (same operations as the first)
  w1 ^= ~(w2 | w3);
  w0 ^= w1 & w2;

  a[0] = w0;
  a[1] = w1;
  a[2] = w2;
  a[3] = w3;
}

Theta

Theta is a linear mapping that takes the Working Key k and operates on the state.

// Theta: linear layer that also mixes the key into the state.
//   k - four 32-bit key words, read only
//   a - four 32-bit state words, updated in place
void Theta(const uint32_t *k, uint32_t *a)
{
  uint32_t mix;

  // fold the even words and spread the result into the odd words
  mix  = a[2] ^ a[0];
  mix ^= ROTL32(mix, 8) ^ ROTR32(mix, 8);
  a[3] ^= mix;
  a[1] ^= mix;

  // key addition, one word at a time
  a[0] ^= k[0];
  a[1] ^= k[1];
  a[2] ^= k[2];
  a[3] ^= k[3];

  // fold the odd words and spread the result into the even words
  mix  = a[3] ^ a[1];
  mix ^= ROTL32(mix, 8) ^ ROTR32(mix, 8);
  a[2] ^= mix;
  a[0] ^= mix;
}

Pi1 and Pi2

The shift operations Pi1 and Pi2 perform cyclic shifts of three of the four words over different offsets.

// Pi1: rotate words 1, 2 and 3 of the state left by 1, 5 and 2 bits.
// Word 0 is left untouched. Pi2 applies the opposite rotations.
void Pi1(uint32_t *a)
{
  const uint32_t w1 = a[1];
  const uint32_t w2 = a[2];
  const uint32_t w3 = a[3];

  a[3] = ROTL32(w3, 2);
  a[2] = ROTL32(w2, 5);
  a[1] = ROTL32(w1, 1);
}

// Pi2: rotate words 1, 2 and 3 of the state right by 1, 5 and 2 bits.
// Word 0 is left untouched. Pi1 applies the opposite rotations.
void Pi2(uint32_t *a)
{
  const uint32_t w1 = a[1];
  const uint32_t w2 = a[2];
  const uint32_t w3 = a[3];

  a[3] = ROTR32(w3, 2);
  a[2] = ROTR32(w2, 5);
  a[1] = ROTR32(w1, 1);
}

Round Function

// One full Noekeon round on State, in place.
//   Key       - four key words handed to Theta
//   State     - four state words
//   Constant1 - round constant XORed into State[0] before Theta
//   Constant2 - round constant XORed into State[0] after Theta
void Round(
    uint32_t *Key,
    uint32_t *State,
    uint8_t Constant1,
    uint8_t Constant2)
{
  // constant / key-dependent linear step / constant
  *State ^= Constant1;
  Theta(Key, State);
  *State ^= Constant2;

  // non-linear step, bracketed by the two word-rotation steps
  Pi1(State);
  Gamma(State);
  Pi2(State);
}

Encryption and Decryption

// Encrypt or decrypt a single 16-byte block in place.
// No endian conversion is performed on the data or the key.
//   buf - 16-byte block, overwritten with the result
//   k   - 16-byte key; only a local copy is ever modified
//   enc - NOEKEON_ENCRYPT encrypts, any other value decrypts
void Noekeon(void *buf, const void *k, int enc)
{
  const uint8_t rc_tab[] = {
    0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, 0x2F,
    0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, 0xD4
  };
  uint32_t State[4], Key[4];
  int8_t i;

  memcpy(Key, k, 16);
  memcpy(State, buf, 16);

  if (enc != NOEKEON_ENCRYPT) {
    // decryption: the working key is first passed through Theta
    // with an all-zero key
    uint32_t nv[4];

    memset(nv, 0, sizeof nv);
    Theta(nv, Key);

    // walk the constants backwards, applied after Theta in each round
    for (i = Nr - 1; i >= 0; i--) {
      Round(Key, State, 0, rc_tab[i]);
    }
    Theta(Key, State);
    State[0] ^= 0x80;
  } else {
    // encryption: the first round uses 0x80, later rounds use the table
    uint8_t rc = 0x80;

    for (i = 0; i < Nr; i++) {
      Round(Key, State, rc, 0);
      rc = rc_tab[i];
    }
    // final half round: constant and Theta only
    State[0] ^= rc;
    Theta(Key, State);
  }

  memcpy(buf, State, sizeof State);
}

Reduced version

Implementing the above code in assembly was slightly disappointing because of the need to apply the Theta function to the key for decryption, which meant keeping Theta as a separate function.

After some tweaking of the code, the following is what eventually gets converted into assembly.

// Theta: linear layer that also mixes the key into the state.
// Note the argument order: state first, key second.
//   a - four 32-bit state words, updated in place
//   k - four 32-bit key words, read only
void Theta(
    uint32_t *a,
    const uint32_t *k)
{
  uint32_t fold;

  // even words -> odd words
  fold  = a[2] ^ a[0];
  fold ^= ROTL32(fold, 8) ^ ROTR32(fold, 8);
  a[3] ^= fold;
  a[1] ^= fold;

  // key addition
  a[0] ^= k[0];
  a[1] ^= k[1];
  a[2] ^= k[2];
  a[3] ^= k[3];

  // odd words -> even words
  fold  = a[3] ^ a[1];
  fold ^= ROTL32(fold, 8) ^ ROTR32(fold, 8);
  a[2] ^= fold;
  a[0] ^= fold;
}

// One Noekeon round with Pi1, Gamma and Pi2 inlined.
//   s   - four state words, updated in place
//   Key - four key words handed to Theta
//   enc - NOEKEON_DECRYPT applies the round constant after Theta;
//         any other value applies it before Theta
//   rnd - index into the 17-entry round constant table
//   end - non-zero stops after the constant/Theta step (final half round)
void Round(
    uint32_t *s,
    uint32_t *Key,
    int enc,
    int rnd,
    int end)
{
  const uint8_t rc_tab[] = {
    0x80,
    0x1B, 0x36, 0x6C, 0xD8,
    0xAB, 0x4D, 0x9A, 0x2F,
    0x5E, 0xBC, 0x63, 0xC6,
    0x97, 0x35, 0x6A, 0xD4
  };
  uint32_t before = rc_tab[rnd];
  uint32_t after  = 0;
  uint32_t hold;

  if (enc == NOEKEON_DECRYPT) {
    // decryption uses the constant on the other side of Theta
    after  = before;
    before = 0;
  }

  s[0] ^= before;
  Theta(s, Key);
  s[0] ^= after;

  if (!end) {
    // Pi1
    s[3] = ROTL32(s[3], 2);
    s[2] = ROTL32(s[2], 5);
    s[1] = ROTL32(s[1], 1);

    // Gamma, first non-linear step
    s[1] ^= ~(s[2] | s[3]);
    s[0] ^= s[1] & s[2];

    // Gamma, linear step: swap words 0 and 3, fold into word 2
    hold = s[0];
    s[0] = s[3];
    s[3] = hold;
    s[2] ^= s[3] ^ s[1] ^ s[0];

    // Gamma, second non-linear step
    s[1] ^= ~(s[2] | s[3]);
    s[0] ^= s[1] & s[2];

    // Pi2
    s[3] = ROTR32(s[3], 2);
    s[2] = ROTR32(s[2], 5);
    s[1] = ROTR32(s[1], 1);
  }
}

// Copy 16 bytes (four 32-bit words) from src to dst, reversing the byte
// order inside each word. This is the same per-word byte swap the assembly
// version performs with bswap.
//
// The copy is done byte by byte, so neither buffer needs to be 32-bit
// aligned, src stays const, and no object is accessed through an
// incompatible pointer type.
//   dst - destination, at least 16 bytes
//   src - source, at least 16 bytes; may be the same buffer as dst
void swapcpy(
    void *dst,
    const void *src)
{
  uint8_t *out = dst;
  const uint8_t *in = src;
  int i;

  for (i = 0; i < 16; i += 4) {
    // read the whole word before writing so that dst == src works
    const uint8_t b0 = in[i];
    const uint8_t b1 = in[i + 1];
    const uint8_t b2 = in[i + 2];
    const uint8_t b3 = in[i + 3];

    out[i]     = b3;
    out[i + 1] = b2;
    out[i + 2] = b1;
    out[i + 3] = b0;
  }
}

// Encrypt or decrypt a single 16-byte block in place, byte-swapping each
// 32-bit word of the key and data on the way in and of the result on the
// way out.
//   k   - 16-byte key, read only
//   buf - 16-byte block, overwritten with the result
//   enc - NOEKEON_ENCRYPT encrypts, any other value decrypts
void Noekeon(
    const void *k,
    void *buf,
    int enc)
{
  uint32_t Key[4], State[4];
  int i;

  swapcpy(Key, k);
  swapcpy(State, buf);

  if (enc != NOEKEON_ENCRYPT) {
    // decryption: run the working key through Theta with a null key,
    // then count the rounds down; round 0 is the final half round
    uint32_t NullVector[4];

    memset(NullVector, 0, sizeof NullVector);
    Theta(Key, NullVector);

    i = Nr;
    while (i >= 0) {
      Round(State, Key, enc, i, i == 0);
      i--;
    }
  } else {
    // encryption: count the rounds up; round Nr is the final half round
    i = 0;
    while (i <= Nr) {
      Round(State, Key, enc, i, i == Nr);
      i++;
    }
  }

  swapcpy(buf, State);
}

The assembly code could be optimized further.

; Layout of the register block written by pushad, lowest address first.
; Lets code reach a saved register as [esp+_reg] (see swapcpy).
struc pushad_t
  _edi resd 1
  _esi resd 1
  _ebp resd 1
  _esp resd 1
  _ebx resd 1
  _edx resd 1
  _ecx resd 1
  _eax resd 1
  .size:
endstruc
    
; loop bound used by Noekeonx: round index i runs over 0..Nr
%define Nr 16
    
; registers holding the four state words a[0]..a[3]
%define a0 eax 
%define a1 ebx 
%define a2 ecx 
%define a3 edx

; scratch registers used by Thetax
%define t0 ebp
%define t1 esi

; Thetax: assembly counterpart of the C routine Theta(a, k).
; in:  esi = a (four dwords, transformed in place)
;      edi = k (four dwords, only read)
; out: all general registers restored by popad; EFLAGS are modified
Thetax:
_Thetax:
    pushad
    push   esi               ; keep a for the store sequence at the end
    ; load a[0..3]; lodsd always targets eax, so a[0] is parked in a3
    ; until the last xchg puts everything in its final register
    lodsd
    xchg   a3, eax
    lodsd
    xchg   a1, eax
    lodsd
    xchg   a2, eax
    lodsd
    xchg   a3, eax           ; a3 = a[3], a0 = a[0]
    ; t = a[0] ^ a[2];
    mov    t0, a0
    xor    t0, a2
    ; t ^= ROTR32(t, 8) ^ ROTL32(t, 8);
    mov    t1, t0
    rol    t1, 8             ; t1 = ROTL32(t, 8)
    xor    t0, t1    
    ror    t1, 16            ; t1 = ROTR32(t, 8)
    xor    t0, t1
    ; a[1] ^= t;    
    xor    a1, t0
    ; a[3] ^= t;
    xor    a3, t0
    ; a[0] ^= k[0];
    xor    a0, [edi]
    ; a[1] ^= k[1];
    xor    a1, [edi+ 4]
    ; a[2] ^= k[2];
    xor    a2, [edi+ 8]
    ; a[3] ^= k[3];
    xor    a3, [edi+12]
    ; t = a[1] ^ a[3];
    mov    t0, a1
    xor    t0, a3
    ; t ^= ROTR32(t, 8) ^ ROTL32(t, 8);
    mov    t1, t0
    rol    t1, 8             ; t1 = ROTL32(t, 8)
    xor    t0, t1    
    ror    t1, 16            ; t1 = ROTR32(t, 8)
    xor    t0, t1
    ; a[0] ^= t;    
    xor    a0, t0
    ; a[2] ^= t;
    xor    a2, t0
    pop    edi               ; edi = a (the esi pushed on entry)
    ; store a[0..3]; stosd always writes eax, so rotate each word through it
    stosd
    xchg   eax, a1
    stosd
    xchg   eax, a2
    stosd
    xchg   eax, a3
    stosd
    popad
    ret
   
; Round: assembly counterpart of the C routine Round(s, Key, enc, rnd, end).
; esi = State
; edi = Key
; ecx = enc (zero: constant applied before Theta; non-zero: after Theta)
; eax = rnd (index into the constant table below; ah must be zero)
; ZF  = end (ZF set by the caller means: stop after the Theta step)
; All general registers and EFLAGS are restored before returning.
; NOTE: the end test uses jecxz after setne cl, so the bits of enc above
; the low byte must be zero.
Round:
    pushad
    pushfd
    ; the call pushes the address of the table that follows it,
    ; which nx_rc then pops into ebx
    call   nx_rc
    db     0x80
    db     0x1B, 0x36, 0x6C, 0xD8 
    db     0xAB, 0x4D, 0x9A, 0x2F 
    db     0x5E, 0xBC, 0x63, 0xC6 
    db     0x97, 0x35, 0x6A, 0xD4
nx_rc:
    pop    ebx               ; ebx = address of the constant table
    xlatb                    ; al = table[rnd]
    jecxz  nxr_enc           ; enc == 0: keep al = constant, ah = 0
    xchg   al, ah            ; otherwise al = 0, ah = constant
nxr_enc:
    ; none of the instructions above change ZF, so this still sees the
    ; caller's flag: cl = 1 for a full round, cl = 0 for the final one
    setne  cl
    ; State[0] ^= Constant1;
    xor    byte[esi], al
    ; Theta(State, Key);
    call   Thetax
    ; State[0] ^= Constant2;
    xor    byte[esi], ah
    jecxz  nxr_end           ; final round: skip Pi1/Gamma/Pi2
    mov    edi, esi          ; edi = State, for the stores after Pi2
    ; load State[0..3] into a0..a3 (same xchg scheme as Thetax)
    lodsd
    xchg   a3, eax
    lodsd
    xchg   a1, eax
    lodsd
    xchg   a2, eax
    lodsd
    xchg   a3, eax
Pi1:
    rol    a1, 1
    rol    a2, 5
    rol    a3, 2
Gamma:
    ; a[1] ^= ~((a[3]) | (a[2]));
    mov    ebp, a3
    or     ebp, a2
    not    ebp
    xor    a1, ebp
    ; a[0] ^=   a[2] & a[1];
    mov    ebp, a2
    and    ebp, a1
    xor    a0, ebp
    ; XCHG(a[0], a[3], t);
    xchg   a0, a3
    ; a[2] ^= a[0] ^ a[1] ^ a[3];
    xor    a2, a0
    xor    a2, a1
    xor    a2, a3
    ; a[1] ^= ~((a[3]) | (a[2]));
    mov    ebp, a3
    or     ebp, a2
    not    ebp
    xor    a1, ebp  
    ; a[0] ^=   a[2] & a[1];
    mov    ebp, a2
    and    ebp, a1
    xor    a0, ebp    
Pi2:
    ror    a1, 1
    ror    a2, 5
    ror    a3, 2
    ; store a0..a3 back to State through eax
    stosd
    xchg   eax, a1
    stosd
    xchg   eax, a2
    stosd
    xchg   eax, a3
    stosd
nxr_end: 
    popfd                    ; hand the caller's flags back unchanged
    popad
    ret
    
; swapcpy: copy four dwords from [esi] to [edi], byte-swapping each one.
; in:  esi = source, edi = destination
; out: edi advanced by 16 bytes; every other general register restored
; NOTE(review): relies on the direction flag being clear so that
; lodsd/stosd step forward - confirm for the calling environment.
swapcpy:
    pushad
    push   4
    pop    ecx               ; ecx = 4 dwords to copy
swp_l0:
    lodsd
    bswap  eax               ; reverse the byte order of the dword
    stosd
    loop   swp_l0
    ; overwrite the saved edi so popad returns the advanced pointer
    mov    [esp+_edi], edi    
    popad
    ret
    
; Noekeonx: assembly counterpart of the C routine Noekeon(k, buf, enc).
; The three arguments are read from the stack above the return address,
; in the order k, buf, enc. enc == 0 takes the encryption path.
; All general registers are restored before returning.
Noekeonx:
_Noekeonx:
    pushad
    lea    esi, [esp+32+4]   ; skip the pushad block and return address
    pushad     ; allocate 32-bytes for state + key
    mov    edi, esp
    lodsd                    ; eax = k
    ; copy key to local buffer
    xchg   eax, esi          ; esi = k, eax = argument pointer
    call   swapcpy           ; leaves edi pointing just past the key copy
    xchg   eax, esi          ; esi = argument pointer
    lodsd                    ; eax = buf
    ; copy data to local buffer
    xchg   eax, esi          ; esi = buf, eax = argument pointer
    call   swapcpy
    xchg   eax, esi          ; esi = argument pointer, eax = buf
    mov    ebp, eax          ; ebp = buf, kept for the final copy
    lodsd                    ; eax = enc
    cdq                      ; edx = 0 (as long as enc is not negative)
    xchg   ecx, eax          ; ecx = enc
    xchg   eax, edx          ; eax = 0
    mov    edi, esp          ; edi = Key
    lea    esi, [edi+16]     ; esi = State
    jecxz  nx_enc
    
    ; allocate a 16 null byte key
    pushad    
    push   eax
    push   eax
    push   eax
    push   eax
    mov    esi, edi          ; esi = Key
    mov    edi, esp          ; edi = NullVector 
    call   Thetax            ; Theta(Key, NullVector);
    add    esp, 16           ; release NullVector
    popad
    mov    al, Nr            ; i   = Nr    
nx_dec:
    test   eax, eax          ; ZF = (i == 0), the end flag for Round
    call   Round             ; Round(State, Key, enc, i, i==0);
    dec    eax
    jns    nx_dec            ; loop while i >= 0
    jmp    nx_xit    
nx_enc:
    cmp    al, Nr            ; ZF = (i == Nr), the end flag for Round
    call   Round       ; Round(State, Key, enc, i, i==Nr);
    lea    eax, [eax+1]      ; i++ without changing the flags
    jnz    nx_enc            ; ZF is still the cmp result (Round restores EFLAGS)
nx_xit:
    mov    edi, ebp          ; edi = buf, esi is still State
    call   swapcpy    
    popad                    ; release the 32-byte key + state area
    popad
    ret

Summary

The C code compiled with MSVC using /O2 /Os flags resulted in 431 bytes but could be significantly smaller without endian conversion. The assembly code is currently 292 bytes, although it may shrink in future.

See here for sources and any future updates.

Advertisements
This entry was posted in assembly, cryptography, encryption, programming, security and tagged , , , . Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s