Noekeon Block cipher

Introduction

Noekeon is a 128-bit block cipher designed by Joan Daemen, Michaël Peeters, Gilles Van Assche, Vincent Rijmen and submitted to the NESSIE project in September 2000.

The two ciphers are “direct mode” Noekeon, to be used for maximum efficiency where related-key attacks are not possible, and “indirect mode” Noekeon where they are.

Cryptanalysis by Lars Knudsen and Håvard Raddum in April 2001 showed that “indirect mode” Noekeon was still vulnerable to certain peculiar kinds of related-key cryptanalysis, and showed weaknesses in Noekeon-variant ciphers which cast doubt on the design strategy behind Noekeon and thus on its security. As a result, it was not a NESSIE selected algorithm.

The authors of Noekeon contend in On Noekeon, no! that the related-key attacks required to break “indirect mode” Noekeon are not a practical concern, and that it is as a result of deliberate design that Noekeon is not vulnerable to the attacks that break the variant ciphers; they assert that Noekeon is still a good and useful cipher.

Noekeon, The Return presented in Jan 2010 argues hardened versions are still suitable for resource constrained environments.

About the code

The C code is derived from the reference sources submitted to NESSIE by the authors. For the initial snippets of code shown here, It should be clear that the endianess of data and key are not converted before and after the encryption/decryption process. Only the reduced version shown at end of post performs conversion.

Omitting the endian conversion on x86 obviously invalidates ciphertext results when comparing with test vectors but it does not, would not affect security of the cipher itself.

Direct and Indirect Mode

The only difference between the two is that with indirect mode we encrypt 16 null bytes using the master key and the resulting ciphertext is used as key for encryption and decryption.

Because it’s such a simple step, I’ve not included it here.

Gamma

Gamma is an involutive (inverse of itself) non-linear mapping that operates on the state.

It can be specified alternatively as a 16-byte S-box applied to each of the boxes of the state and is essentially a function for creating diffusion. I have not investigated if using an sbox would result in smaller code.

void Gamma(uint32_t *a)
{
  uint32_t t;
  
  a[1] ^= ~((a[3]) | (a[2]));
  a[0] ^=   a[2] & a[1];  
  
  t     = a[3]; 
  a[3]  = a[0]; 
  a[0]  = t;
  a[2] ^= a[0] ^ a[1] ^ a[3];
  
  a[1] ^= ~((a[3]) | (a[2]));
  a[0] ^=   a[2] & a[1];  
}

Theta

Theta is a linear mapping that takes the Working Key k and operates on the state.

void Theta(const uint32_t *k, uint32_t *a)
{
  uint32_t t, i;

  t = a[0] ^ a[2]; 
  t ^= ROTR32(t, 8) ^ ROTL32(t, 8);
  
  a[1] ^= t;
  a[3] ^= t;
  
  for (i=0; i<4; i++) {
    a[i] ^= k[i];
  }

  t = a[1] ^ a[3]; 
  t ^= ROTR32(t, 8) ^ ROTL32(t, 8);
  
  a[0] ^= t;
  a[2] ^= t;  

}

Pi1 and Pi2

The shift operations Pi1 and Pi2 perform cyclic shifts of three of the four words over different offsets.

void Pi1(uint32_t *a){
  a[1] = ROTL32(a[1], 1);
  a[2] = ROTL32(a[2], 5);
  a[3] = ROTL32(a[3], 2);
}

void Pi2(uint32_t *a){
  a[1] = ROTR32(a[1], 1);
  a[2] = ROTR32(a[2], 5);
  a[3] = ROTR32(a[3], 2);
}

Round Function

void Round(
    uint32_t *Key, 
    uint32_t *State, 
    uint8_t Constant1, 
    uint8_t Constant2)
{
  State[0] ^= Constant1;
  Theta(Key, State);
  State[0] ^= Constant2;
  Pi1(State);
  Gamma(State);
  Pi2(State);
}

Encryption and Decryption

void Noekeon(void *buf, const void *k, int enc)
{
  int8_t i;
  uint8_t rc=0x80;

  uint32_t nv[4], State[4], Key[4];
  const uint8_t rc_tab[]= 
{ 0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, 0x2F, 
  0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, 0xD4 };
  
  memcpy(State, buf, 16);
  memcpy(Key, k, 16);
  
  if (enc==NOEKEON_ENCRYPT)
  {
    for (i=0; i<Nr; i++) {
      Round(Key, State, rc, 0);
      rc = rc_tab[i];
    }
    State[0] ^= rc;
    Theta(Key, State);
  } else {
    memset(nv, 0, 16);
    Theta(nv, Key);

    for (i=Nr-1; i>=0; --i) {
      rc = rc_tab[i];
      Round(Key, State, 0, rc);
    }
    Theta(Key, State);
    State[0] ^= 0x80;
  }  
  memcpy(buf, State, 16);
}

Reduced version

To implement the above code in assembly was slightly disappointing because of the need to apply the Theta function to the key for decryption. That meant keeping Theta as a separate function.

After some tweaking of the code, the following is what eventually gets converted into assembly.

void Theta(
    uint32_t *a, 
    const uint32_t *k)
{
  uint32_t t, i;

  t = a[0] ^ a[2]; 
  
  t ^= ROTR32(t, 8) ^ ROTL32(t, 8);
  
  a[1] ^= t;
  a[3] ^= t;
  
  for (i=0; i<4; i++) {
    a[i] ^= k[i];
  }

  t = a[1] ^ a[3]; 
  t ^= ROTR32(t, 8) ^ ROTL32(t, 8);
  
  a[0] ^= t;
  a[2] ^= t;  

}

void Round(
    uint32_t *s, 
    uint32_t *Key,
    int enc, 
    int rnd, 
    int end)
{
  uint32_t t;
  uint32_t rc1, rc2;
  const uint8_t rc_tab[]=   
{ 0x80,
  0x1B, 0x36, 0x6C, 0xD8, 
  0xAB, 0x4D, 0x9A, 0x2F, 
  0x5E, 0xBC, 0x63, 0xC6, 
  0x97, 0x35, 0x6A, 0xD4 };
  
  rc1 = rc_tab[rnd];
  rc2 = 0;
  if (enc==NOEKEON_DECRYPT) {
    XCHG(rc1, rc2, t);
  }
  
  s[0] ^= rc1;
  Theta(s, Key);
  s[0] ^= rc2;
  
  if (end) return;
  
  //Pi1
  s[1] = ROTL32(s[1], 1);
  s[2] = ROTL32(s[2], 5);
  s[3] = ROTL32(s[3], 2);
  
  // Gamma
  s[1] ^= ~((s[3]) | (s[2]));
  s[0] ^=   s[2] & s[1];  
  
  XCHG(s[0], s[3], t);
  
  s[2] ^= s[0] ^ s[1] ^ s[3];
  
  s[1] ^= ~((s[3]) | (s[2]));
  s[0] ^=   s[2] & s[1];  
  
  // Pi2
  s[1] = ROTR32(s[1], 1);
  s[2] = ROTR32(s[2], 5);
  s[3] = ROTR32(s[3], 2);
}

void swapcpy(
    void *dst, 
    const void *src)
{
  int i;
  for (i=0; i<4; i++) {
    ((uint32_t*)dst)[i] = SWAP32(((uint32_t*)src)[i]);
  }
}

void Noekeon(
    const void *k, 
    void *buf, 
    int enc)
{
  int i;

  uint32_t NullVector[4], State[4], Key[4];
  
  swapcpy(Key, k);
  swapcpy(State, buf);

  if (enc==NOEKEON_ENCRYPT)
  {
    for (i=0; i<=Nr; i++) {
      Round(State, Key, enc, i, i==Nr);
    }
  } else {
    memset(NullVector, 0, 16);
    Theta(Key, NullVector);

    for (i=Nr; i>=0; --i) {
      Round(State, Key, enc, i, i==0);
    }
  }
  swapcpy(buf, State);
}

The assembly code could be optimized further.

struc pushad_t
  _edi resd 1
  _esi resd 1
  _ebp resd 1
  _esp resd 1
  _ebx resd 1
  _edx resd 1
  _ecx resd 1
  _eax resd 1
  .size:
endstruc
    
%define Nr 16
    
%define a0 eax 
%define a1 ebx 
%define a2 ecx 
%define a3 edx

%define t0 ebp
%define t1 esi

Thetax:
_Thetax:
    pushad
    push   esi
    lodsd
    xchg   a3, eax
    lodsd
    xchg   a1, eax
    lodsd
    xchg   a2, eax
    lodsd
    xchg   a3, eax
    ; t = a[0] ^ a[2];
    mov    t0, a0
    xor    t0, a2
    ; t ^= ROTR32(t, 8) ^ ROTL32(t, 8);
    mov    t1, t0
    rol    t1, 8
    xor    t0, t1    
    ror    t1, 16
    xor    t0, t1
    ; a[1] ^= t;    
    xor    a1, t0
    ; a[3] ^= t;
    xor    a3, t0
    ; a[0] ^= k[0];
    xor    a0, [edi]
    ; a[1] ^= k[1];
    xor    a1, [edi+ 4]
    ; a[2] ^= k[2];
    xor    a2, [edi+ 8]
    ; a[3] ^= k[3];
    xor    a3, [edi+12]
    ; t = a[1] ^ a[3];
    mov    t0, a1
    xor    t0, a3
    ; t ^= ROTR32(t, 8) ^ ROTL32(t, 8);
    mov    t1, t0
    rol    t1, 8
    xor    t0, t1    
    ror    t1, 16
    xor    t0, t1
    ; a[0] ^= t;    
    xor    a0, t0
    ; a[2] ^= t;
    xor    a2, t0
    pop    edi
    stosd
    xchg   eax, a1
    stosd
    xchg   eax, a2
    stosd
    xchg   eax, a3
    stosd
    popad
    ret
   
; esi = State
; edi = Key
; ecx = enc
; eax = rnd   
; CF  = end
Round:
    pushad
    pushfd
    call   nx_rc
    db     0x80
    db     0x1B, 0x36, 0x6C, 0xD8 
    db     0xAB, 0x4D, 0x9A, 0x2F 
    db     0x5E, 0xBC, 0x63, 0xC6 
    db     0x97, 0x35, 0x6A, 0xD4
nx_rc:
    pop    ebx
    xlatb
    jecxz  nxr_enc
    xchg   al, ah
nxr_enc:
    setne  cl
    ; State[0] ^= Constant1;
    xor    byte[esi], al
    ; Theta(State, Key);
    call   Thetax
    ; State[0] ^= Constant2;
    xor    byte[esi], ah
    jecxz  nxr_end
    mov    edi, esi    
    lodsd
    xchg   a3, eax
    lodsd
    xchg   a1, eax
    lodsd
    xchg   a2, eax
    lodsd
    xchg   a3, eax
Pi1:
    rol    a1, 1
    rol    a2, 5
    rol    a3, 2
Gamma:
    ; a[1] ^= ~((a[3]) | (a[2]));
    mov    ebp, a3
    or     ebp, a2
    not    ebp
    xor    a1, ebp
    ; a[0] ^=   a[2] & a[1];
    mov    ebp, a2
    and    ebp, a1
    xor    a0, ebp
    ; XCHG(a[0], a[3], t);
    xchg   a0, a3
    ; a[2] ^= a[0] ^ a[1] ^ a[3];
    xor    a2, a0
    xor    a2, a1
    xor    a2, a3
    ; a[1] ^= ~((a[3]) | (a[2]));
    mov    ebp, a3
    or     ebp, a2
    not    ebp
    xor    a1, ebp  
    ; a[0] ^=   a[2] & a[1];
    mov    ebp, a2
    and    ebp, a1
    xor    a0, ebp    
Pi2:
    ror    a1, 1
    ror    a2, 5
    ror    a3, 2
    stosd
    xchg   eax, a1
    stosd
    xchg   eax, a2
    stosd
    xchg   eax, a3
    stosd
nxr_end: 
    popfd   
    popad
    ret
    
swapcpy:
    pushad
    push   4
    pop    ecx
swp_l0:
    lodsd
    bswap  eax
    stosd
    loop   swp_l0
    mov    [esp+_edi], edi    
    popad
    ret
    
Noekeonx:
_Noekeonx:
    pushad
    lea    esi, [esp+32+4]
    pushad     ; allocate 32-bytes for state + key
    mov    edi, esp
    lodsd
    ; copy key to local buffer
    xchg   eax, esi
    call   swapcpy
    xchg   eax, esi
    lodsd
    ; copy data to local buffer
    xchg   eax, esi
    call   swapcpy
    xchg   eax, esi
    mov    ebp, eax
    lodsd
    cdq                      ; edx = 0
    xchg   ecx, eax          ; ecx = enc
    xchg   eax, edx          ; eax = 0
    mov    edi, esp          ; edi = Key
    lea    esi, [edi+16]     ; esi = State
    jecxz  nx_enc
    
    ; allocate a 16 null byte key
    pushad    
    push   eax
    push   eax
    push   eax
    push   eax
    mov    esi, edi          ; esi = Key
    mov    edi, esp          ; edi = NullVector 
    call   Thetax            ; Theta(Key, NullVector);
    add    esp, 16           ; release NullVector
    popad
    mov    al, Nr            ; i   = Nr    
nx_dec:
    test   eax, eax    
    call   Round
    dec    eax
    jns    nx_dec
    jmp    nx_xit    
nx_enc:
    cmp    al, Nr
    call   Round       ; Round(State, Key, enc, i, i==0);
    lea    eax, [eax+1]
    jnz    nx_enc
nx_xit:
    mov    edi, ebp
    call   swapcpy    
    popad
    popad
    ret

Summary

The C code compiled with MSVC using /O2 /Os flags resulted in 431 bytes but could be significantly smaller without endian conversion. The assembly code is currently 292 bytes although may shrink in future.

See here for sources and any future updates.

Advertisements
Posted in assembly, cryptography, encryption, programming, security | Tagged , , , | Leave a comment

Chaskey Message Authentication Code

Introduction

Chaskey is a lightweight MAC algorithm optimised for 32-bit micro-controllers designed by Nicky Mouha, Bart Mennink, Anthony Van Herrewege, Dai Watanabe, Bart Preneel and Ingrid Verbauwhede.

It is based on a 128-bit block cipher, the Chaskey cipher, which uses ARX operations and an Even-Mansour structure.

What follows is an implementation in C and x86 assembly optimized for size.

State structure

typedef union state_t {
  uint8_t  b[16];
  uint32_t w[4];
} state;

Key scheduling

void chas_setkey(void *out, void *in) 
{
  int      i;
  uint32_t *k=(uint32_t*)out;
  
  memcpy (out, in, 16);
  
  for (i=0; i<2; i++)
  {
    k[4] = (k[0] << 1);     
    k[4] ^= 0x87 * (k[3] >> 31);    
    k[5] = (k[1] << 1) | (k[0] >> 31); 
    k[6] = (k[2] << 1) | (k[1] >> 31); 
    k[7] = (k[3] << 1) | (k[2] >> 31);
    
    k += 4;    
  }
}

Assembly could possibly do with better optimization.

%define k0 ebp
%define k1 ebx
%define k2 ecx
%define k3 edx
    
chas_setkeyx:
_chas_setkeyx:
    pushad
    mov    edi, [esp+32+4]   ; edi = out
    mov    esi, [esp+32+8]   ; esi = in
    push   edi
    movsd
    movsd
    movsd
    movsd
    pop    esi
    clc
sk_l0:
    pushfd

    lodsd
    xchg   eax, k0
    lodsd
    xchg   eax, k1
    lodsd
    xchg   eax, k2
    lodsd
    xchg   eax, k3
 
    push   k3
    lea    eax, [k0+k0]
    add    k3, k3
    sbb    dl, dl
    and    dl, 0x87
    xor    al, dl   
    pop    k3
    stosd
    
    lea    eax, [k1+k1]
    shr    k0, 31
    or     eax, k0
    stosd
    
    lea    eax, [k2+k2]
    shr    k1, 31
    or     eax, k1
    stosd
    
    lea    eax, [k3+k3]
    shr    k2, 31
    or     eax, k2
    stosd

    popfd
    cmc
    jc     sk_l0
    
    popad
    ret

Key whitening

void chas_xor(state *out, const void *in, int len) {
  int i;

  for (i=0; i<len; i++) {
    out->b[i] ^= ((uint8_t*)in)[i];
  }
}
; ecx = length
; esi = input
; edi = v   
chas_xor:
    pushad
    jecxz  cx_l1
cx_l0:    
    mov    al, [esi]
    xor    [edi], al
    cmpsb
    loop   cx_l0
cx_l1:    
    popad
    ret

Permutation function

This is derived from SipHash permutation function which is derived from chacha and salsa stream ciphers.

void chas_permute(uint32_t v[])
{
  int i=12;
  
  do
  {
    v[0] += v[1]; 
    v[1]=ROTL32(v[1], 5); 
    v[1] ^= v[0]; 
    v[0]=ROTL32(v[0],16); 
    v[2] += v[3]; 
    v[3]=ROTL32(v[3], 8); 
    v[3] ^= v[2]; 
    v[0] += v[3]; 
    v[3]=ROTL32(v[3],13); 
    v[3] ^= v[0]; 
    v[2] += v[1]; 
    v[1]=ROTL32(v[1], 7); 
    v[1] ^= v[2]; 
    v[2]=ROTL32(v[2],16); 
  } while (--i);
}

Assembly code is straight forward but perhaps there’s room to optimize further.

%define v0 eax    
%define v1 edx    
%define v2 ebp    
%define v3 ebx
    
; ecx = 16    
; edi = v
chas_permute:
    pushad
    mov    cl, 12
    mov    esi, edi
    lodsd
    xchg   eax, v3
    lodsd
    xchg   eax, v1
    lodsd
    xchg   eax, v2
    lodsd
    xchg   eax, v3
cp_l0:
    add    v0, v1            ; v[0] += v[1];
    rol    v1, 5             ; v[1] = ROTL(v[1], 5);
    xor    v1, v0            ; v[1] ^= v[0];
    rol    v0, 16            ; v[0] = ROTL(v[0], 16);
    add    v2, v3            ; v[2] += v[3]; 
    rol    v3, 8             ; v[3] = ROTL(v[3], 8); 
    xor    v3, v2            ; v[3] ^= v[2]; 
    add    v0, v3            ; v[0] += v[3];
    rol    v3, 13            ; v[3] = ROTL(v[3], 13);
    xor    v3, v0            ; v[3] ^= v[0];
    add    v2, v1            ; v[2] += v[1];
    rol    v1, 7             ; v[1] = ROTL(v[1],  7);
    xor    v1, v2            ; v[1] ^= v[2];
    rol    v2, 16            ; v[2] = ROTL(v[2], 16);
    loop   cp_l0
    stosd
    xchg   eax, v1
    stosd
    xchg   eax, v2
    stosd
    xchg   eax, v3
    stosd
    popad   
    ret

MAC generation

void chas_mac (uint8_t *tag, 
    uint8_t *msg, uint32_t msglen, uint8_t *key) 
{
  state v;
  int   len;
  
  // copy 16 bytes of key
  memcpy(v.b, key, 16);

  // absorb message 
  for (;;)
  {
    len = (msglen < 16) ? msglen : 16;
    
    // xor state with msg data
    chas_xor(&v, msg, len);

    // final?
    if (msglen <= 16) {
      if (msglen < 16) {
        // final? add padding bit
        v.b[msglen] ^= 0x01;
      }
      key += (msglen == 16) ? 16 : 32;
      break;
    }    
    
    // apply permutation function
    chas_permute(v.w);
    
    // update position and length
    msg += 16;
    msglen -= 16;
  }

  // pre-whiten
  chas_xor(&v, key, 16);
  // permute
  chas_permute(v.w);
  // post-whiten
  chas_xor(&v, key, 16);
  // return tag
  memcpy(tag, v.b, 16);
}

Assembly code

; chaskey    
chas_macx:
_chas_macx:
    pushad
    lea    esi, [esp+32+4]
    pushad                   ; allocate 32 bytes
    mov    edi, esp          ; edi = v
    lodsd
    xchg   eax, ebp          ; ebp = tag ptr
    lodsd
    xchg   eax, ebx          ; ebx = msg ptr
    lodsd
    xchg   edx, eax          ; edx = msglen
    lodsd
    xchg   eax, esi          ; esi = key

    ; memcpy(v, &key[0], 16);
    push   16
    pop    ecx
    push   edi               ; save v
    rep    movsb
    pop    edi               ; restore v
    push   esi               ; save &key[16]
    mov    esi, ebx          ; esi = msg    
    ; absorb message
cm_l0:
    mov    cl, 16
    ; len = (msglen < 16) ? msglen : 16;
    cmp    edx, ecx
    cmovb  ecx, edx
    
    ; chas_xor(&v, msg, len);
    call   chas_xor
    mov    cl, 16
    
    ; if (msglen <= 16)
    cmp    edx, ecx
    jbe    cm_l2
    
    call   chas_permute

    ; msglen -= 16
    sub    edx, ecx
    ; msg += 16
    add    esi, ecx
    
    jmp    cm_l0
cm_l2:    
    pop    esi
    ; if (msglen < 16)
    je     cm_l4    
    ; v.b[msglen] ^= 0x01;
    xor    byte[edi+edx], 0x01
    ; load key + 32
    add    esi, ecx
cm_l4:
    ; chas_xor(v, key, 16);
    call   chas_xor
    ; chas_permute(v);
    call   chas_permute
    ; chas_xor(v, key, 16);
    call   chas_xor
    
    ; memcpy(tag, v, 16);
    mov    esi, edi
    mov    edi, ebp
    rep    movsb
        
    popad
    popad
    ret

Summary

The MSVC generated code resulted in 346 bytes using /Os /O2 flags and the x86 assembly is currently 234 bytes. The underlying block cipher doesn’t require key scheduling and this code could be reduced much further if you only needed that functionality. See sources here for future updates.

Posted in assembly, cryptography, encryption, programming, security | Tagged , , , , | 2 Comments

Threefish-256 Block Cipher

Introduction

Threefish is a symmetric block cipher designed and published in 2008 by Niels Ferguson, Stefan Lucks, Bruce Schneier, Doug Whiting, Mihir Bellare, Tadayoshi Kohno, Jon Callas and Jesse Walker.

Only the 256-bit key version is presented here which also processes 256-bit blocks of data. Threefish is an ARX(Add-Rotate-Xor) based cipher which makes it suitable for wide range of architectures.

This code is optimized for size so please consider another implementation if speed is required.

Key schedule

#define K(s) (((uint64_t*)key)[(s)])
#define T(s) (((uint64_t*)tweak)[(s)])

void threefish_setkey(
    threefish_ctx_t *c,
    const void *key, 
    const void *tweak)
{
  uint8_t i;
  
  memcpy((void*)c->k, key,   32);
  memcpy((void*)c->t, tweak, 16);
  
  c->k[4] = 0x1BD11BDAA9FC1A22ULL;
  
  for(i=0; i<4; i++){
    c->k[4] ^= K(i);
  }
  c->t[2] = T(0) ^ T(1);  
}
threefish_setkeyx:
_threefish_setkeyx:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   eax, edi          ; edi = ctx   
    lodsd
    xchg   eax, esi          ; esi = key
    push   edi               ; save ctx 
    
    ; memcpy((void*)c->k, key,   32);
    push   8
    pop    ecx
    rep    movsd
    scasd
    scasd
    
    ; memcpy((void*)c->t, tweak, 16);
    xchg   eax, esi
    lodsd
    xchg   eax, esi
    mov    cl, 4
    rep    movsd

    pop    edi 
    ; c->k[4] = 0x1BD11BDAA9FC1A22ULL;
    mov    eax, 0xA9FC1A22
    mov    ebx, 0x1BD11BDA
    mov    cl, 4    
tf_sk:
    xor    eax, [edi]
    scasd
    xor    ebx, [edi]
    scasd
    loop   tf_sk
    
    stosd
    xchg   eax, ebx
    stosd
    movq   t0, [edi]
    pxor   t0, [edi+8]
    movq   [edi+16], t0
    popad
    ret

Add Key

// perform both addition and subtraction on data
// enc should be 0 for addition or 1 for subtraction
void addkey(
    const threefish_ctx_t *c, 
    void *data, uint8_t s, 
    uint64_t enc)
{
  int i;
  uint64_t x0, x1, x2;
  uint64_t *x=(uint64_t*)data;
  
  for (i=0; i<4; i++) {
    x0 = x[i];
    x1 = c->k[(s + i) % 5];
    x2 = 0;
    
    if (i==1) x2 = c->t[s % 3];
    if (i==2) x2 = c->t[(s+1) % 3];
    if (i==3) x2 = s;
    
    // add or subtract depending on enc
    x[i] = ((x0 ^ -enc) + x1 + x2) ^ -enc;
  }
}
; void addkey(const threefish_ctx_t *c, 
;    void *data, uint8_t s, 
;    uint64_t enc)
addkey:
    pushad

    pxor    x3, x3           ; x3 = 0 
    jecxz   ak_l0
    
    pcmpeqb x3, x3           ; x3 = ~0
    dec     ecx              ; ecx = 0
ak_l0:  
    push    5
    pop     ebp  
    
    ; x0 = x[i];
    movq    x0, [esi+ecx*8]   
    
    ; x1 = c->k[(s + i) % 5];
    lea     eax, [ebx+ecx]    
    cdq    
    idiv    ebp        
    movq    x1, [edi+edx*8]
    
    dec     ebp
    dec     ebp    
    
    pxor    x2, x2    
    jecxz   ak_lx
    
    ; if (i==3) x2 = s;
    movd    x2, ebx
    cmp     cl, 3
    je      ak_lx

    mov     eax, ebx
    cdq
    cmp     cl, 1
    je      ak_lxx
    inc     eax
ak_lxx:    
    idiv    ebp
    
    ; if (i==1) x2 = c->t[s % 3];
    ; if (i==2) x2 = c->t[(s+1) % 3];    
    movq    x2, [edi+8*edx+40]
ak_lx:    
    ; x[i] = ((x0 ^ -enc) + x1 + x2) ^ -enc;
    pxor    x0, x3
    paddq   x0, x1
    paddq   x0, x2
    pxor    x0, x3
    
    movq    [esi+ecx*8], x0
    
    inc     ecx
    cmp     cl, 4
    jnz     ak_l0    
    popad
    ret

Mix Function

Although for the inverse function we’re supposed to rotate right, subtracting the rotation constant from 64 and rotating left does the same thing.

void mix(void *data, 
    uint8_t rc[], 
    int rnd, int enc)
{
  int     i;
  uint64_t *x;
  uint8_t  r;
  
  x = (uint64_t*)data;
  
  for (i=0; i<4; i += 2)
  {
    r = rc[(rnd & 7) + (i << 2)];

    if (enc==THREEFISH_DECRYPT)
    {
      x[i+1] ^= x[i];
      x[i+1]  = ROTL64(x[i+1], -(r-64));
      x[i]   -= x[i+1];      
    } else {
      x[i]   += x[i+1];
      x[i+1]  = ROTL64(x[i+1], r);
      x[i+1] ^= x[i];    
    }
  }
}

Assembly uses MMX to reduce code

; void mix(void *data, 
;    uint8_t rc[], 
;    int rnd, int enc)
mix:
    pushad
    xor    eax, eax
m_l0:    
    movq   x0, [esi+eax*8]
    movq   x1, [esi+eax*8+8]
    
    ; r = rc[(rnd & 7) + (i << 2)];
    mov    edi, edx
    and    edi, 7
    lea    edi, [edi+eax*4] 
    movzx  edi, byte[ebp+edi]
    jecxz  m_l1
    
    sub    edi, 64
    neg    edi
    
    pxor   x1, x0
    call   rotl64
    psubq  x0, x1
    jmp    m_l2
m_l1    
    paddq  x0, x1
    call   rotl64
    pxor   x1, x0
m_l2:
    movq   [esi+eax*8  ], x0    
    movq   [esi+eax*8+8], x1    
    add    al, 2
    cmp    al, 4
    jnz    m_l0
    popad
    ret

Permute

void permute(void *data)
{
  uint64_t t;
  uint64_t *x=(uint64_t*)data;
  
  t    = x[1];
  x[1] = x[3];
  x[3] = t;
}
; void permute(void *data)    
permute:
    movq   x0, [esi+1*8]
    movq   x1, [esi+3*8]

    movq   [esi+1*8], x1 
    movq   [esi+3*8], x0 
    ret

Encryption/Decryption

void threefish_encrypt(
    const threefish_ctx_t *c, 
    void *data, 
    uint32_t enc)
{
  uint8_t i, s=0, ofs=1;
  uint32_t x0, x1;
  
  uint8_t rc[16] = 
  { 14, 52, 23,  5, 25, 46, 58, 32, 
    16, 57, 40, 37, 33, 12, 22, 32};

  if (enc == THREEFISH_DECRYPT) {
    s   = 18;
    ofs = ~0;
    
    // swap rotation constants if
    // decrypting
    for (i=0; i<4; i += 2) {
      x0 = ((uint32_t*)rc)[i+0];
      x1 = ((uint32_t*)rc)[i+1];

      ((uint32_t*)rc)[i+0] = SWAP32(x1);
      ((uint32_t*)rc)[i+1] = SWAP32(x0);
    }
  }
  // apply 72 rounds
  for (i=0; i<72; i++)
  {
    // add key every 4 rounds
    if((i & 3) == 0) {
      addkey(c, data, s, enc);
      s += ofs;
    }

    // permute if decrypting
    if (enc == THREEFISH_DECRYPT) {
      permute(data);
    }
    
    // apply mixing function
    mix (data, rc, i, enc);
    
    // permute if encrypting
    if (enc == THREEFISH_ENCRYPT) {
      permute(data);
    }
  }
  // add key
  addkey(c, data, s, enc);
}
; void threefish_encrypt(
;    const threefish_ctx_t *c, 
;    void *data, 
;    uint32_t enc)
threefish_encryptx:
_threefish_encryptx
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   edi, eax    
    lodsd
    push   eax
    lodsd
    cdq
    xchg   eax, ecx
    xor    ebx, ebx
    pop    esi
    push   1
    pop    eax
    ; load rotation constants
    push   0x20160C21
    push   0x25283910
    push   0x203A2E19
    push   0x0517340E
    mov    ebp, esp
    jecxz  tf_l1
    
    neg    eax               ; ofs = -1
    mov    bl, 18            ; s   = 18
    pushad
    mov    esi, ebp
    mov    edi, ebp
tf_l0:    
    lodsd
    xchg   eax, ebx
    lodsd
    bswap  eax
    stosd
    xchg   eax, ebx
    bswap  eax
    stosd
    dec    edx
    jp     tf_l0
    popad
    ; apply 72 rounds
tf_l1:    
    ; add key every 4 rounds
    test  dl, 3
    jnz   tf_l2
    call  addkey
    add   ebx, eax
tf_l2:    
    ; permute if decrypting
    jecxz tf_l3    
    call  permute
tf_l3:    
    ; apply mixing function
    call  mix
    ; permute if encrypting
    test  ecx, ecx
    jnz   tf_l4
    call  permute
tf_l4:    
    inc   edx                 
    cmp   dl, 72
    jnz   tf_l1 
    ; add key
    call  addkey
    add   esp, 16
    popad
    ret

Summary

Using /Os /O2 flags with MSVC 2010 results in 747 bytes. The assembly version is currently 371 bytes but may shrink in future. See the full sources here.

Posted in assembly, cryptography, encryption, programming | Tagged , , | Leave a comment

Half SipHash

Introduction

SipHash: a fast short-input Pseudo-Random-Function(PRF) was designed and published in 2012 by Jean-Philippe Aumasson and Daniel J. Bernstein.

Last month, MR. Aumasson posted to the kernel-hardening mail list a link to a “Half SipHash” implementation which is intended to boost performance. What I show here is Half SipHash but only generates 32-bit hashes.

I already looked at the original code and felt it didn’t fit well onto a 32-bit architecture. Incidentally ChasKey by Nicky Mouha is derived from SipHash, designed specifically for 32-bit architectures and provides 128-bit security.

So this code is really just half of the Half SipHash function returning 32-bits which might not be adequate for some people. The assembly code is presumably slower than a compiler generated version due to the choice of instructions.

Round function

The round function consists of simple ARX (Add-Rotate-Xor) operations which makes it ideal for almost any CPU.

Some of the more common operations on the 128-bit state before and after ARX rounds are now built into the round function itself and work depending on additional parameter: flag which is either zero or one.

After the message, key and length have been absorbed into the state, it performs the following for 32-bit hashes.

v[2] ^= 0xff;

This is simply flipping the lower 8 bits so the additional parameter to SIPROUND is either zero or one which when negated can either flip bits or have no effect at all. Xor’ing by zero does nothing but if we negate 1 and Xor the result, that is the same as performing a logical not.

Since our cROUNDS are 2 and dROUNDS for the end are 4, we can multiply last flag by 2 and add to rnds counter thus avoiding the need for any conditional testing.

void SIPROUND(
    uint32_t v[4], 
    uint32_t w, 
    uint8_t last)
{
  int i, rnds=cROUNDS;
  
  v[2] ^= (uint8_t)-last;
  rnds += (last * 2);
  
  v[3] ^= w;
  
  for (i=0; i<rnds; i++)
  {
    v[0] += v[1];                                                              
    v[1] = ROTL(v[1], 5);                                                      
    v[1] ^= v[0];                                                              
    v[0] = ROTL(v[0], 16);                                                     
    v[2] += v[3];                                                              
    v[3] = ROTL(v[3], 8);                                                      
    v[3] ^= v[2];                                                              
    v[0] += v[3];                                                              
    v[3] = ROTL(v[3], 7);                                                      
    v[3] ^= v[0];                                                              
    v[2] += v[1];                                                              
    v[1] = ROTL(v[1], 13);                                                     
    v[1] ^= v[2];                                                              
    v[2] = ROTL(v[2], 16);   
  }    
  v[0] ^= w;
}

uint32_t hsh32(
    const uint8_t *data, 
    const size_t inlen, 
    const uint8_t *key) 
{
    uint32_t v[4];
    uint8_t i;
    uint32_t m;
    
    uint8_t *in = (uint8_t*)data;
    uint32_t *k = (uint32_t*)key;
    
    uint32_t k0 = k[0];
    uint32_t k1 = k[1];
    
    int left = inlen & 3;        
    const uint8_t *end = in + inlen - left;
    uint32_t b = ((uint32_t)inlen) << 24;
    
    v[0] = k0;
    v[1] = k1;
    v[2] = k0 ^ 0x6c796765;
    v[3] = k1 ^ 0x74656462;

    for (; in != end; in += 4) {
      m = ((uint32_t*)in)[0];
      SIPROUND(v, m, 0);
    }

    while (--left >= 0) {
      b |= ((uint32_t)in[left]) << (8 * left);   
    }

    for (i=0; i<2; i++) {
      SIPROUND(v, b, i);
      b = 0;
    }
    
    return v[1] ^ v[3];
}

With the above code in C, it’s much easier to implement assembly. Instead of using a flag parameter, we clear or set the Carry Flag (CF) using Clear Carry CLC or Set Carry STC. You can see in the last loop, EFLAGS register is saved with PUSHFD/POPFD and then flipped using Complement Carry CMC.

Inside the round function, we avoid flipping bits using Jump If No Carry JNC

; don't change these    
%define cROUNDS 2
%define dROUNDS 4

%define v0 ebx    
%define v1 ebp    
%define v2 edx    
%define v3 edi

_hsh32x:
hsh32x:
    pushad
    lea    esi, [esp+32+ 4]
    lodsd
    push   eax               ; save data
    lodsd
    push   eax               ; save inlen
    lodsd
    xchg   eax, esi          ; esi = key 
    
    ; initialize state
    lodsd                    ; eax = k0
    mov    v0, eax           ; v0  = k0 
    mov    v2, eax           ; v2  = k0
    lodsd                    ; eax = k1
    mov    v1, eax           ; v1  = k1
    mov    v3, eax           ; v3  = k1
    xor    v2, 0x6c796765
    xor    v3, 0x74656462
    
    ; update state in 4-byte blocks
    pop    ecx               ; ecx = inlen
    pop    esi               ; esi = data 
    push   ecx               ; save inlen
    shr    ecx, 2            ; inlen /= 4 
    jz     shx_l2
shx_l0:
    lodsd
shx_l1:
    clc    
    call   SIPROUND
    loop   shx_l0
shx_l2:    
    pop    ecx               ; restore inlen
    mov    eax, ecx
    push   edx               ; save edx
    cdq                      ; edx = 0    
    shl    eax, 24           ; eax = inlen << 24
    and    ecx, 3            ; inlen &= 3
shx_l3:
    dec    ecx               ; if (--left >= 0) 
    js     shx_l4            ;   goto shx_l4
    shl    edx, 8            ; t <<= 8
    mov    dl, byte[esi+ecx] ; t |= in[left]
    jmp    shx_l3
shx_l4:
    or     eax, edx          ; b |= t
    pop    edx               ; restore edx
    clc                      ; CF=0  
shx_l5:    
    pushfd                   ; save flags
    call   SIPROUND
    popfd                    ; restore flags
    cmc                      ; CF=1 for last round
    jc     shx_l5
    xor    v1, v3            ; v[1] ^= v[3]
    mov    [esp+28], v1      ; pushad_t._eax = v1
    popad
    ret

; CF=0 for normal update    
; CF=1 for final update    
SIPROUND:
    push   ecx               ; save ecx
    push   cROUNDS 
    pop    ecx
    jnc    sr_l0             ; skip if no carry
    
    xor    eax, eax          ; w=0 if last round
    add    ecx, ecx          ; assumes: cROUNDS=2, dROUNDS=4
    not    dl                ; requires edx to be v2
sr_l0:
    xor    v3, eax           ; v3 ^= w    
sr_l1:    
    add    v0, v1            ; v[0] += v[1];
    rol    v1, 5             ; v[1] = ROTL(v[1], 5);
    xor    v1, v0            ; v[1] ^= v[0];
    rol    v0, 16            ; v[0] = ROTL(v[0], 16);
    add    v2, v3            ; v[2] += v[3]; 
    rol    v3, 8             ; v[3] = ROTL(v[3], 8); 
    xor    v3, v2            ; v[3] ^= v[2]; 
    add    v0, v3            ; v[0] += v[3];
    rol    v3, 7             ; v[3] = ROTL(v[3], 7);
    xor    v3, v0            ; v[3] ^= v[0];
    add    v2, v1            ; v[2] += v[1];
    rol    v1, 13            ; v[1] = ROTL(v[1], 13);
    xor    v1, v2            ; v[1] ^= v[2];
    rol    v2, 16            ; v[2] = ROTL(v[2], 16);
    loop   sr_l1
    xor    v0, eax           ; v0 ^= w
    pop    ecx               ; restore ecx
    ret

Summary

The code generated by MSVC compiler using /Os /O2 switches resulted in 261 bytes. The assembly version is 148 bytes. See sources here for any future updates.

Posted in assembly, cryptography, programming, security | Tagged , , , , | Leave a comment

Poly1305 Message Authentication Code

Introduction

Poly1305 is a cryptographic Message Authentication Code (MAC) designed and published in 2004 by Daniel J. Bernstein. It can be used to verify the data integrity and authenticity of a message.

Adam Langley has published details in RFC 7539 which uses ChaCha20 and Poly1305 for Authenticated Encryption Associated Data (AEAD) as an alternative to AES-GCM (Galois Counter Mode)

The reasons to use ChaCha20-Poly1305 instead of AES-GCM are not for lack of confidence in AES but due to its performance on ARM architectures that do not have hardware acceleration.

Dan’s ciphers

Many of Dan’s designs have become popular in the last number of years and there are some good observations on the reasons why in an essay titled On the Impending Crypto Monoculture by Peter Gutmann.

It’s not a case of Dan being a crypto king (although he does awesome work). IMHO the choice of using his algorithms comes down to a number of reasons.

  • Provably secure
  • Patent free
  • Low overhead in hardware and software
  • Simple and secure implementations already provided by author
  • Not tainted by government agencies

Dan’s designs are intended to be resistant to timing attacks, side channel analysis which AES was never intended to be.

CCM, EAX and GCM can be challenging to implement correctly. Poly1305 is simple with just addition, modular multiplication and therefore less prone to error.

Vlad Krasnov from Cloudflare offers a sound review of both poly1305 and chacha20 in It takes two to ChaCha (Poly)

About C code

As most of you might know, the codes presented here in all entries are optimized for size so if you want an implementation optimized for speed, this will certainly disappoint you.

As per usual with these entries, I’ll show code in C first and then x86 assembly. If you can spot ways to optimize the code for size, feel free to fork the project and submit a pull request with your name included in source code as author.

The C code here is derived from a reference implementation provided by Dan on his site except I’m following the design presented by Langley in RFC 7539 for ChaCha20. The RFC is easy to follow and if you want to better understand implementation, I highly recommend it.

Addition

The addition is performed on 8-bit bytes stored as 32-bit words. At the end of every number added to the accumulator is a byte: 0, 1 or 252. So rather than expecting the caller to pad a buffer with zeros, poly1305_add() takes care of this.

/**********************************************
 *
 * poly1305 addition
 *
 **********************************************/
void poly1305_add(
    uint32_t out[], 
    const uint8_t in[], 
    int inlen, 
    uint8_t last)
{
  uint32_t c, i, x = 0;
  uint8_t *p = (uint8_t*)in;

  // add in to out
  for (i=0; i<17; i++) {
    c = *p;
    c = (i == inlen) ? last : c;
    c = (i  > inlen) ? 0    : c;
    p = (i  < inlen) ? p+1  : p;
    
    x += out[i] + c; 
    out[i] = x & 255; 
    x >>= 8; 
  }
}

Even though ADC (Add with Carry) is available and in fact works fine I’ve stuck with the original reference provided by MR. Dan. Function expects accumulator in EDI, the input bytes in ESI, inlen in ECX and last byte in AL. ADC only increments p if i is less than inlen.

; ***************************************
;
; poly1305 addition
;
; al  = last 
; ecx = inlen
; edi = out
; esi = p
;
; ***************************************
_poly1305_addx:
poly1305_addx:
    pushad
    movzx  edx, al            ; edx = last
    xor    ebx, ebx           ; x = 0
    xor    ecx, ecx           ; ecx = 0    
    xor    ebp, ebp           ; i = 0
add_l0:
    movzx  eax, byte[esi]     ; c = *p 
    cmp    ebp, [esp+_ecx]    ; EFLAGS = (i - inlen)
    cmovz  eax, edx           ; c = (i == inlen) ? last : c;
    cmova  eax, ecx           ; c = (i  > inlen) ? 0    : c;
    adc    esi, ecx           ; p = (i  < inlen) ? p+1  : p;
    add    ebx, [edi]         ; x += out[i]
    add    ebx, eax           ; x += c
    mov    al, bl             ; al = x
    stosd                     ; out[i] = x & 255
    shr    ebx, 8             ; x >>= 8
    inc    ebp                ; i++
    cmp    ebp, 17 
    jb     add_l0
    popad
    ret

Modular Multiplication

The modulus is 2^130-5 or 3fffffffffffffffffffffffffffffffb and Dan explains in his paper the reason for choosing this number specifically.

I chose 2^130-5 because its sparse form makes divisions particularly easy in both software and hardware. My encoding of messages as polynomials takes advantage of the gap between 2^128 and 2^130-5.

/**********************************************
 *
 * poly1305 modular multiplication
 *
 * "P" is 2^130-5 or 3fffffffffffffffffffffffffffffffb
 *
 **********************************************/
void poly1305_mulmod(
    uint32_t acc[17],
    const uint32_t r[17])
{
  uint32_t hr[17];
  uint32_t i, j, u;

  memcpy ((uint8_t*)hr, (uint8_t*)acc, 17*4);
  
  // multiply
  for (i=0; i<17; i++) {
    u = 0;
    for (j=0; j<=i; j++) {
      u += hr[j] * r[i - j];
    }
    for (; j<17; j++) {
      u += hr[j] * r[i + (17 - j)] * 320;
    }
    acc[i] = u;
  }
  
  for (u=0, j=0; j<2; j++)
  {
    for (i=0; i<16; i++) 
    { 
      u += acc[i];
      acc[i] = u & 255; 
      u >>= 8; 
    }
    if (!j) 
    {
      u += acc[16];
      acc[16] = u & 3;
      u = (u >> 2) * 5;
    }
  }
  acc[16] += u;
}

The assembly sticks close to the original C code provided. I attempted to use a different implementation of modular multiplication but the reduction in code was negligible.

; ***************************************
;
; poly1305 modular multiplication
;
; "P" is 2^130-5 or 3fffffffffffffffffffffffffffffffb
;
; esi = acc 
; ebx = r
;
; ***************************************
poly1305_mulmodx:
_poly1305_mulmodx:
    pushad
    ; memcpy ((uint8_t*)hr, (uint8_t*)acc, 17*4); 
    push   17*4
    pop    ecx
    sub    esp, ecx
    mov    esi, edi            ; esi = acc
    mov    edi, esp            ; edi = hr
    push   esi                 ; save acc
    rep    movsb
    pop    edi                 ; edi = acc
    
    ; multiply
    mov    esi, ebx            ; esi = r
    xor    ebx, ebx            ; i = 0
mm_l0:
    xor    ecx, ecx            ; j = 0
    xor    edx, edx            ; u = 0
mm_l1:
    mov    eax, [esp+ecx*4]    ; eax = hr[j]
    mov    ebp, ebx            ; ebp = i
    sub    ebp, ecx            ; ebp = i - j
    imul   eax, [esi+ebp*4]    ; eax *= r[i - j]
    add    edx, eax            ; u += eax
    inc    ecx                 ; j++
    cmp    ecx, ebx            ; j <= i
    jbe    mm_l1
mm_l2:
    push   17
    pop    ebp
    
    cmp    ecx, ebp            ; i < 17
    jae    mm_l3

    sub    ebp, ecx            ; ebp = 17 - j          
    add    ebp, ebx            ; ebp += i
    
    push   64
    pop    eax
    inc    ah                  ; eax = 320
    imul   eax, [esi+ebp*4]    ; eax *= r[i+17-j]
    imul   eax, [esp+ecx*4]    ; eax *= hr[j]
    add    edx, eax            ; u += eax 
    inc    ecx                 ; j++
    jmp    mm_l2
mm_l3:
    mov    [edi+ebx*4], edx    ; acc[i] = u
    inc    ebx
    cmp    ebx, 17
    jb     mm_l0
    
    ; reduce
    xor    ebx, ebx
    xor    edx, edx            ; u = 0
f_lx:    
    mov    cl, 16
    mov    esi, edi            ; esi = acc
    push   edi    
f_l0:
    lodsd                      ; eax = acc[i]
    add    edx, eax            ; u += acc[i]
    movzx  eax, dl             ; eax = u & 255
    stosd                      ; acc[i] = eax
    shr    edx, 8              ; u >>= 8
    loop   f_l0
    dec    ebx                 ;  
    jnp    f_l1    
    ; ---------------
    lodsd                      ; eax = acc[16]
    add    edx, eax            ; u += eax
    movzx  eax, dl             ; 
    and    al, 3               ; al &= 3
    stosd                      ; acc[16] = eax
    shr    edx, 2              ; u >>= 2
    lea    edx, [edx+edx*4]    ; u = (u >> 2) * 5
    pop    edi
    jmp    f_lx
f_l1:    
    add    [edi], edx          ; acc[16] += u;
    add    esp, 17*4 + 4
    popad
    ret

MAC function

This is the main function called to generate 128-bit MAC of message. First r is “clamped”

I constrained r to simplify and accelerate implementations of Poly1305-AES in various contexts. A wider range of r|e.g., all 128-bit integers would allow a quantitatively better security bound, but the current bound DL=2106 will be perfectly satisfactory for the foreseeable future, whereas slower authenticator computations would not be perfectly satisfactory.

/**********************************************
 *
 * poly1305 mac function
 *
 **********************************************/
void poly1305_mac (
    uint8_t *out, 
    const uint8_t *in, 
    uint32_t inlen, 
    const uint8_t *k)
{
  uint32_t i, len, neg;
  uint32_t r[17], acc[17];
  uint8_t  minusp[16]={5};
  
  // copy r
  for (i=0; i<16; i++) {
    r[i] = k[i];
  }
  // clamp r
  r[ 3] &= 15;
  r[ 7] &= 15;
  r[11] &= 15;
  r[15] &= 15;
  
  r[ 4] &= 252;
  r[ 8] &= 252;
  r[12] &= 252;
  r[16]  = 0;

  // zero initialize accumulator
  memset ((uint8_t*)acc, 0, 17*4);

  for (;;) 
  {
    // if zero length, break
    if (inlen==0) break;
    
    // process 16 bytes or remaining
    len = inlen < 16 ? inlen : 16;
    
    // add bytes to acc
    poly1305_add (acc, in, len, 1);

    // multiply accumulator by r mod p
    poly1305_mulmod (acc, r);

    // update length and buffer position
    in    += len;
    inlen -= len;
  }

  memcpy (r, acc, sizeof(r));

  poly1305_add (acc, minusp, 16, 252);
  neg = -(acc[16] >> 7);
  
  for (i=0; i<17; i++) {
    acc[i] ^= neg & (r[i] ^ acc[i]);
  }
  
  // add s
  poly1305_add (acc, &k[16], 16, 0);
  
  // return tag
  for (i=0; i<16; i++) {
    out[i] = acc[i];
  }
}

Summary

Assembly generated by MSVC 2010 with /O2 /Os flags results in 507 bytes. The assembly implementation is currently 339 bytes but may shrink in the future.

See here for future updates or if you want to contribute.

Posted in assembly, cryptography, encryption, programming, security | Tagged , , , | 1 Comment

Bel-T Block Cipher

Introduction

Bel-T is a block cipher that was adopted as a standard by the Republic of Belarus in 2011 and defined in STB 34.101.31-2011. It takes 128-bits of data and encrypts using an n-bit key (where n can be 128, 192 or 256).

It uses 8 rounds of a function based on the Lai-Massey scheme also used by IDEA and IDEA-NXT. There’s a key schedule procedure however because it’s only relevant for 128 and 192-bit keys, it’s not included here. If you’re interested in the key scheduling function, you can find it in the code this version was derived from.

The C code here is derived from this code written by Philipp Jovanovic however they look very different to one another after much tweaking by myself. Essentially what I’ve done is reduce the amount of code at the expense of performance. Moreover, the C code here is unlikely to run on big endian architectures without some modification.

Structures used

I’ve assigned a number of registers for each variable while using stack for everything else. The work_t structure defines each position on stack for some variables. For example, ‘flgs’ holds copy of flags register saved with pushfd and ‘key’ points to 32-byte area of memory containing encryption/decryption key.

struc pushad_t
  _edi dd ?
  _esi dd ?
  _ebp dd ?
  _esp dd ?
  _ebx dd ?
  _edx dd ?
  _ecx dd ?
  _eax dd ?
  .size:
endstruc

%define BELT_ENCRYPT 0
%define BELT_DECRYPT 1

%define t   eax
%define a   ebx
%define b   ecx
%define c   edx
%define d   esi
%define e   edi
%define G   ebp

; workspace on stack
struc work_t
  flgs dd ?
  j    dd ?
  key  dd ?
  i    dd ?
  blk  dd ?
endstruc

For the C code, I try instruct the compiler to use stack for all variables although any good compiler will likely decide itself what’s best depending on parameters and target architecture. At least for my own compiler options, using structures for both key and data resulted in less code being generated.

// 32-bit type
typedef union {
    uint8_t  b[4];
    uint32_t w;
} w32_t;

// 128-bit type
typedef struct {
    union {
      uint8_t  v8[16];
      uint32_t w[4];
      struct {
        uint32_t a, b, c, d;
      };
    };
} w128_t;

// 256-bit type
typedef struct {
    union {
      uint8_t  v8[32];
      uint32_t w[8];
      struct {
        uint32_t a, b, c, d;
        uint32_t e, f, g, h;
      };
    };
} w256_t;

Substitution box

Simply maps 32-bits of input to 32-bits of output using fixed 256-byte array. I don’t know how the sbox was generated although if anyone does know, please respond in comments. If it’s possible to generate at runtime, it might also be possible to save additional space for this code in ROM.

// sbox
const uint8_t H[256] =
{ 0xB1, 0x94, 0xBA, 0xC8, 0x0A, 0x08, 0xF5, 0x3B,
  0x36, 0x6D, 0x00, 0x8E, 0x58, 0x4A, 0x5D, 0xE4,
  0x85, 0x04, 0xFA, 0x9D, 0x1B, 0xB6, 0xC7, 0xAC,
  0x25, 0x2E, 0x72, 0xC2, 0x02, 0xFD, 0xCE, 0x0D,
  0x5B, 0xE3, 0xD6, 0x12, 0x17, 0xB9, 0x61, 0x81,
  0xFE, 0x67, 0x86, 0xAD, 0x71, 0x6B, 0x89, 0x0B,
  0x5C, 0xB0, 0xC0, 0xFF, 0x33, 0xC3, 0x56, 0xB8,
  0x35, 0xC4, 0x05, 0xAE, 0xD8, 0xE0, 0x7F, 0x99,
  0xE1, 0x2B, 0xDC, 0x1A, 0xE2, 0x82, 0x57, 0xEC,
  0x70, 0x3F, 0xCC, 0xF0, 0x95, 0xEE, 0x8D, 0xF1,
  0xC1, 0xAB, 0x76, 0x38, 0x9F, 0xE6, 0x78, 0xCA,
  0xF7, 0xC6, 0xF8, 0x60, 0xD5, 0xBB, 0x9C, 0x4F,
  0xF3, 0x3C, 0x65, 0x7B, 0x63, 0x7C, 0x30, 0x6A,
  0xDD, 0x4E, 0xA7, 0x79, 0x9E, 0xB2, 0x3D, 0x31,
  0x3E, 0x98, 0xB5, 0x6E, 0x27, 0xD3, 0xBC, 0xCF,
  0x59, 0x1E, 0x18, 0x1F, 0x4C, 0x5A, 0xB7, 0x93,
  0xE9, 0xDE, 0xE7, 0x2C, 0x8F, 0x0C, 0x0F, 0xA6,
  0x2D, 0xDB, 0x49, 0xF4, 0x6F, 0x73, 0x96, 0x47,
  0x06, 0x07, 0x53, 0x16, 0xED, 0x24, 0x7A, 0x37,
  0x39, 0xCB, 0xA3, 0x83, 0x03, 0xA9, 0x8B, 0xF6,
  0x92, 0xBD, 0x9B, 0x1C, 0xE5, 0xD1, 0x41, 0x01,
  0x54, 0x45, 0xFB, 0xC9, 0x5E, 0x4D, 0x0E, 0xF2,
  0x68, 0x20, 0x80, 0xAA, 0x22, 0x7D, 0x64, 0x2F,
  0x26, 0x87, 0xF9, 0x34, 0x90, 0x40, 0x55, 0x11,
  0xBE, 0x32, 0x97, 0x13, 0x43, 0xFC, 0x9A, 0x48,
  0xA0, 0x2A, 0x88, 0x5F, 0x19, 0x4B, 0x09, 0xA1,
  0x7E, 0xCD, 0xA4, 0xD0, 0x15, 0x44, 0xAF, 0x8C,
  0xA5, 0x84, 0x50, 0xBF, 0x66, 0xD2, 0xE8, 0x8A,
  0xA2, 0xD7, 0x46, 0x52, 0x42, 0xA8, 0xDF, 0xB3,
  0x69, 0x74, 0xC5, 0x51, 0xEB, 0x23, 0x29, 0x21,
  0xD4, 0xEF, 0xD9, 0xB4, 0x3A, 0x62, 0x28, 0x75,
  0x91, 0x14, 0x10, 0xEA, 0x77, 0x6C, 0xDA, 0x1D };

// substitute 32-bits
uint32_t G(uint32_t x, w256_t *key, int idx, int r) {
    int   i;
    w32_t u;

    u.w = key->w[idx & 7] + x;

    for (i=0; i<4; i++) {
      u.b[i] = H[u.b[i]];
    }
    return ROTL32(u.w, r);
}

Since this function is called quite a lot, the address is loaded into register.

; ------------------------
    ; G function
    ; ------------------------
    pushad
    mov    esi, [esp+_esp]   ; esi=esp
    lodsd                    ; eax=return address
    lodsd                    ; eax=x
    xchg   eax, edx          ; edx=x
    lodsd                    ; eax=r
    push   eax               ; save r
    lodsd                    ; eax=eflags
    lodsd                    ; eax=j
    inc    dword[esi-4]      ; j++
    xchg   eax, ebx          ; ebx=j
    lodsd                    ; eax=key
    xchg   eax, edi          ; edi=key
    and    ebx, 7            ; j &= 7
    add    edx, [edi+ebx*4]  ; x += key[j & 7]
    xchg   eax, edx
    push   4
    pop    ecx               ; ecx=4
    call   g_l0
    ; sbox
    db 0xB1, 0x94, 0xBA, 0xC8, 0x0A, 0x08, 0xF5, 0x3B,
    db 0x36, 0x6D, 0x00, 0x8E, 0x58, 0x4A, 0x5D, 0xE4,
    db 0x85, 0x04, 0xFA, 0x9D, 0x1B, 0xB6, 0xC7, 0xAC,
    db 0x25, 0x2E, 0x72, 0xC2, 0x02, 0xFD, 0xCE, 0x0D,
    db 0x5B, 0xE3, 0xD6, 0x12, 0x17, 0xB9, 0x61, 0x81,
    db 0xFE, 0x67, 0x86, 0xAD, 0x71, 0x6B, 0x89, 0x0B,
    db 0x5C, 0xB0, 0xC0, 0xFF, 0x33, 0xC3, 0x56, 0xB8,
    db 0x35, 0xC4, 0x05, 0xAE, 0xD8, 0xE0, 0x7F, 0x99,
    db 0xE1, 0x2B, 0xDC, 0x1A, 0xE2, 0x82, 0x57, 0xEC,
    db 0x70, 0x3F, 0xCC, 0xF0, 0x95, 0xEE, 0x8D, 0xF1,
    db 0xC1, 0xAB, 0x76, 0x38, 0x9F, 0xE6, 0x78, 0xCA,
    db 0xF7, 0xC6, 0xF8, 0x60, 0xD5, 0xBB, 0x9C, 0x4F,
    db 0xF3, 0x3C, 0x65, 0x7B, 0x63, 0x7C, 0x30, 0x6A,
    db 0xDD, 0x4E, 0xA7, 0x79, 0x9E, 0xB2, 0x3D, 0x31,
    db 0x3E, 0x98, 0xB5, 0x6E, 0x27, 0xD3, 0xBC, 0xCF,
    db 0x59, 0x1E, 0x18, 0x1F, 0x4C, 0x5A, 0xB7, 0x93,
    db 0xE9, 0xDE, 0xE7, 0x2C, 0x8F, 0x0C, 0x0F, 0xA6,
    db 0x2D, 0xDB, 0x49, 0xF4, 0x6F, 0x73, 0x96, 0x47,
    db 0x06, 0x07, 0x53, 0x16, 0xED, 0x24, 0x7A, 0x37,
    db 0x39, 0xCB, 0xA3, 0x83, 0x03, 0xA9, 0x8B, 0xF6,
    db 0x92, 0xBD, 0x9B, 0x1C, 0xE5, 0xD1, 0x41, 0x01,
    db 0x54, 0x45, 0xFB, 0xC9, 0x5E, 0x4D, 0x0E, 0xF2,
    db 0x68, 0x20, 0x80, 0xAA, 0x22, 0x7D, 0x64, 0x2F,
    db 0x26, 0x87, 0xF9, 0x34, 0x90, 0x40, 0x55, 0x11,
    db 0xBE, 0x32, 0x97, 0x13, 0x43, 0xFC, 0x9A, 0x48,
    db 0xA0, 0x2A, 0x88, 0x5F, 0x19, 0x4B, 0x09, 0xA1,
    db 0x7E, 0xCD, 0xA4, 0xD0, 0x15, 0x44, 0xAF, 0x8C,
    db 0xA5, 0x84, 0x50, 0xBF, 0x66, 0xD2, 0xE8, 0x8A,
    db 0xA2, 0xD7, 0x46, 0x52, 0x42, 0xA8, 0xDF, 0xB3,
    db 0x69, 0x74, 0xC5, 0x51, 0xEB, 0x23, 0x29, 0x21,
    db 0xD4, 0xEF, 0xD9, 0xB4, 0x3A, 0x62, 0x28, 0x75,
    db 0x91, 0x14, 0x10, 0xEA, 0x77, 0x6C, 0xDA, 0x1D
g_l0:
    pop    ebx        ; ebx=H 
g_l1:
    xlatb             ; u.b[i] = H[u.b[i]];
    ror    eax, 8     ; u.w = ROTR(u.w, 8)
    loop   g_l1
    pop    ecx        ; ecx=r
    rol    eax, cl    ; return ROTL32(u.w, r);
    mov    [esp+_eax], eax
    popad
    retn   2*4
    ; -----------------------

Encrypion/Decryption

I’ve written this so it handles both operations based on parameter passed to the function. This makes it much more compact than having 2 separate functions.

// perform encryption and decryption based on enc parameter
void belt_encrypt(void *blk, const void *ks, int enc)
{
    w256_t   v; 
    w256_t   key;
    uint32_t i, j, t, e;
    uint32_t *x=(uint32_t*)blk;

    // load 256-bit key into local space
    memcpy (key.v8, (uint8_t*)ks, 32);

    // if decryption, rotate key 128-bits
    if (enc==BELT_DECRYPT) {
      for (i=0; i<4; i++) {
        XCHG (key.w[i], key.w[7-i], t);
      }
    }
    
    // load 128-bit data into local space
    memcpy (v.v8, (uint8_t*)blk, 16);

    // apply 8 rounds
    for (i=0, j=0; i<8; i++, j += 7)
    {
      v.b ^= G(v.a,       &key, j+0, 5);
      v.c ^= G(v.d,       &key, j+1,21);
      v.a -= G(v.b,       &key, j+2,13);
      e    = G(v.b + v.c, &key, j+3,21);
      t    = i + 1;

      if (enc==BELT_ENCRYPT) 
          goto b_l3;
      
      t = (7 - i) + 1;
b_l3:
      e   ^= t;
      v.b += e;
      v.c -= e;
      v.d += G(v.c,     &key, j+4,13);
      v.b ^= G(v.a,     &key, j+5,21);
      v.c ^= G(v.d,     &key, j+6, 5);

      XCHG(v.a, v.b, t);
      XCHG(v.c, v.d, t);
      XCHG(v.b, v.c, t);

      if (enc==BELT_ENCRYPT)
          continue;
      
      // swap for decryption
      XCHG(v.b, v.c, t);
      XCHG(v.a, v.d, t);
    }
    // save data for encryption
    x[0] = v.b; x[1] = v.d;
    x[2] = v.a; x[3] = v.c;

    if (enc == BELT_ENCRYPT)
        return;
      
    // save data for decryption
    x[0] = v.c; x[1] = v.a;
    x[2] = v.d; x[3] = v.b;
}

There might be a better way to implement this in x86 assembly although i don’t see how at the moment.

belt_encryptx:
_belt_encryptx:
    pushad
    lea    esi, [esp+32+4]
    pushad            ; allocate 32-bytes for key
    mov    edi, esp
    lodsd
    xchg   ebx, eax   ; ebx=blk
    lodsd
    push   eax        ; save ks
    lodsd
    cdq               ; edx=0
    ; memcpy (key.v8, (uint8_t*)ks, 32);
    push   32
    pop    ecx
    pop    esi        ; esi=ks
    push   edi
    rep    movsb      ; copy key to local buffer
    pop    edi
    
    push   ebx        ; save blk ptr
    push   edx        ; save i
    push   edi        ; save key
    push   edx        ; save j
    dec    eax        ; BELT_ENCRYPT?
    pushfd            ; save flags
    js     b_l1
    
    ; rotate key 128-bits for decryption
    lea    esi, [edi+7*4]
    mov    cl, 4
rotate_key:
    mov    eax, [edi]  ; eax=key[i]
    xchg   eax, [esi]
    stosd
    sub    esi, 4
    loop   rotate_key
b_l1:
    mov    esi, ebx   ; esi=blk
    lodsd
    xchg   a, eax     ; a = blk[0]
    lodsd
    xchg   b, eax     ; b = blk[1]
    lodsd
    xchg   c, eax     ; c = blk[2]
    lodsd
    xchg   d, eax     ; d = blk[3]
    call   b_l2
    ; ------------------------
    ; G function goes here
    ; -----------------------
b_l2:
    pop    G
b_l3:
    push   5
    push   a
    call   G   
    xor    b, t       ; b ^= G(a, key, j+0, 5);
    
    push   21
    push   d
    call   G     
    xor    c, t       ; c ^= G(d, key, j+1,21);
    
    push   13
    push   b
    call   G       
    sub    a, t       ; a -= G(b, key, j+2,13);
    
    push   21
    lea    t, [b + c]
    push   t
    call   G          ; e  = G(b + c, key, j+3,21);
    xchg   t, e
    
    inc    dword[esp+i] ; i++
    
    mov    t, [esp+i] ; t = i;

    popfd             ; set flags
    pushfd            ; save flags
    js     b_l4
    
    sub    t, 9
    neg    t         ; t = (7 - i) + 1;
b_l4:
    xor    e, t      ; e   ^= t;
    add    b, e      ; v.b += e;
    sub    c, e      ; v.c -= e;
    
    push   13
    push   c
    call   G
    add    d, t      ; v.d += G(v.c, &key, j+4,13);
    
    push   21
    push   a
    call   G
    xor    b, t      ; v.b ^= G(v.a, &key, j+5,21);
    
    push   5
    push   d
    call   G
    xor    c, t      ; v.c ^= G(v.d, &key, j+6, 5);
    
    xchg   a, b
    xchg   c, d
    xchg   b, c
    
    popfd            ; set flags
    pushfd           ; save flags
    js     b_l5      ; if (enc==BELT_ENCRYPT) 
                     ;     continue;
    xchg   b, c
    xchg   a, d
b_l5:
    cmp    dword[esp+i], 8
    jnz    b_l3
    
    popfd            ; restore flags
    pop    eax       ; remove j
    pop    eax       ; remove key
    pop    eax       ; remove i
    pop    edi       ; restore blk
    push   edi       ; save blk

    xchg   eax, b
    stosd            ; blk[0] = b
    xchg   eax, d
    stosd            ; blk[1] = d
    xchg   eax, a
    stosd            ; blk[2] = a
    xchg   eax, c
    stosd            ; blk[3] = c
    
    pop    edi       ; restore blk
    js     b_l6
    
    stosd            ; blk[0] = c 
    xchg   eax, c    
    stosd            ; blk[1] = a
    xchg   eax, a
    stosd            ; blk[2] = d
    xchg   eax, d
    stosd            ; blk[3] = b
b_l6:
    popad            ; release stack for key
    popad            ; restore registers
    ret

Summary

Some people have expressed concerns about the low number of rounds but the round function itself is strong so 8 should be enough for now. It’s still bigger than AES but that’s simply because of the fixed sbox. I’ve not measured performance between AES and Bel-T.

architecture size
x86 490
x64 n/a

Please refer to here for sources, documentation and any updates in future.

Posted in assembly, cryptography, encryption, security | Tagged , , , , ,

Kuznyechik “Grasshopper” Block cipher

Introduction

Kuznyechik (Grasshopper in Russian) is a 128-bit block cipher supporting 256-bit keys defined in the National Standard of the Russian Federation GOST R 34.12-2015 and RFC 7801 published in 2015 to supercede GOST 28147-89 cipher also referred to as Magma that was declassified in 1994.

Kuznyechik uses an SPN (Substitution-permutation network) design the same as AES (Rijndael) and like the 128-bit key version of Rijndael uses 10 rounds. There’s Key Whitening, Linear transformation and byte substitution, design properties all used by AES finalists.

About the assembly code

Initially, I didn’t intend to implement this cipher in x86 assembly due to the overall size of code generated by C compiler. However, after removing the inverse sbox and computing these values at runtime, it resulted in significantly smaller code than Serpent and Twofish implementations which I didn’t expect. The assembly version is a joint effort between Peter Ferrie and myself resulting in a binary similar to the size of Twofish-256.

Dr. Markku-Juhani O. Saarinen deserves credit for the C code here which I’ve shamelessly copied and modified to be more compact. Markku has a knack for simplifying cryptographic algorithms and making them easier to read.

While I have derived my own implementation from code originally written by Markku, he has not endorsed the changes I’ve made.

Sbox initialization

Although the designers of Kuznyechik asserted the sbox values were chosen “randomly”, Alex Biryukov, Leo Perrin, and Aleksei Udovenko show in their paper Reverse-Engineering the S-Box of Streebog, Kuznyechik and STRIBOBr1 that the S-Boxes of both Kuznyechik and Streebog were not created pseudo-randomly but using a hidden algorithm which they were able to reverse engineer.

The following code will generate the pi array based on their research.

#include <stdio.h>
#include <stdint.h>

// gcc -std=c11 genpi.c -o genpi && ./genpi

// multiplcation by matrix over GF(2)
uint8_t matmul(uint8_t a, uint8_t mat[8][8]) {
  uint8_t res = 0;
  int x, y;
  
  for (x = 0; x < 8; x++)
    if (a & (1 << (7 - x)))
      for (y = 0; y < 8; y++)
        res ^= mat[y][x] << (7 - y);
  return res;
}

// https://en.wikipedia.org/wiki/Finite_field_arithmetic
uint8_t fieldmul(uint8_t a, uint8_t b) {
  uint8_t p = 0;
  
  while (b) {
    if (b & 1) 
      p ^= a;
    if (a & 0x8)
      a = (a << 1) ^ 0x19;  // x^4 + x^3 + 1
    else
      a <<= 1;
    b >>= 1;
  }
  return p;
}

int main(int argc, char *argv[]) {
    int i, x, y, l, r, n, z;
    
uint8_t inv[16]   = 
{0x0,0x1,0xc,0x8,0x6,0xf,0x4,0xe,
 0x3,0xd,0xb,0xa,0x2,0x9,0x7,0x5};

uint8_t nu_0[16]  = 
{0x2,0x5,0x3,0xb,0x6,0x9,0xe,0xa,
 0x0,0x4,0xf,0x1,0x8,0xd,0xc,0x7};

uint8_t nu_1[16]  = 
{0x7,0x6,0xc,0x9,0x0,0xf,0x8,0x1,
 0x4,0x5,0xb,0xe,0xd,0x2,0x3,0xa};

uint8_t sigma[16] = 
{0xc,0xd,0x0,0x4,0x8,0xb,0xa,0xe,
 0x3,0x9,0x5,0x2,0xf,0x1,0x6,0x7};

uint8_t phi[16]   = 
{0xb,0x2,0xb,0x8,0xc,0x4,0x1,0xc,
 0x6,0x3,0x5,0x8,0xe,0x3,0x6,0xb};

uint8_t alpha[8][8] = {
    {0,0,0,0,1,0,0,0},
    {0,1,0,0,0,0,0,1},
    {0,1,0,0,0,0,1,1},
    {1,1,1,0,1,1,1,1},
    {1,0,0,0,1,0,1,0},
    {0,1,0,0,0,1,0,0},
    {0,0,0,1,1,0,1,0},
    {0,0,1,0,0,0,0,0},
};

uint8_t omega[8][8] = {
    {0,0,0,0,1,0,1,0},
    {0,0,0,0,0,1,0,0},
    {0,0,1,0,0,0,0,0},
    {1,0,0,1,1,0,1,0},
    {0,0,0,0,1,0,0,0},
    {0,1,0,0,0,1,0,0},
    {1,0,0,0,0,0,1,0},
    {0,0,0,0,0,0,0,1},
};
  
    for(x = 0; x < 256; x++) {
        y = matmul(x, alpha);
        l = (y >> 4) & 0xf;
        r = y & 0xf;
        l = (r == 0) ? nu_0[l] : nu_1[fieldmul(l, inv[r])];
        r = sigma[fieldmul(r, phi[l])];
        y = matmul((l << 4) | r, omega);
        if ((x & 7)==0) putchar('\n');
        printf("0x%02x, ", y);
    }
    return 0;
}

I have only assumed based on code generated by C compiler that Implementing this in assembly would not reduce size of code and so only the pi array is included in source and only the inverse sbox is created at runtime. However, I make no claim it can’t be implemented in less than 256 bytes which pi takes up, I just didn’t feel it was worth the effort.

kuz_init is called the same time as kuz_setkey so if you don’t call kuz_setkey first, encryption/decryption won’t work.

// The S-Box from section 5.1.1
const uint8_t kuz_pi[0x100] = {
  0xFC, 0xEE, 0xDD, 0x11, 0xCF, 0x6E, 0x31, 0x16,   // 00..07
  0xFB, 0xC4, 0xFA, 0xDA, 0x23, 0xC5, 0x04, 0x4D,   // 08..0F
  0xE9, 0x77, 0xF0, 0xDB, 0x93, 0x2E, 0x99, 0xBA,   // 10..17
  0x17, 0x36, 0xF1, 0xBB, 0x14, 0xCD, 0x5F, 0xC1,   // 18..1F
  0xF9, 0x18, 0x65, 0x5A, 0xE2, 0x5C, 0xEF, 0x21,   // 20..27
  0x81, 0x1C, 0x3C, 0x42, 0x8B, 0x01, 0x8E, 0x4F,   // 28..2F
  0x05, 0x84, 0x02, 0xAE, 0xE3, 0x6A, 0x8F, 0xA0,   // 30..37
  0x06, 0x0B, 0xED, 0x98, 0x7F, 0xD4, 0xD3, 0x1F,   // 38..3F
  0xEB, 0x34, 0x2C, 0x51, 0xEA, 0xC8, 0x48, 0xAB,   // 40..47
  0xF2, 0x2A, 0x68, 0xA2, 0xFD, 0x3A, 0xCE, 0xCC,   // 48..4F
  0xB5, 0x70, 0x0E, 0x56, 0x08, 0x0C, 0x76, 0x12,   // 50..57
  0xBF, 0x72, 0x13, 0x47, 0x9C, 0xB7, 0x5D, 0x87,   // 58..5F
  0x15, 0xA1, 0x96, 0x29, 0x10, 0x7B, 0x9A, 0xC7,   // 60..67
  0xF3, 0x91, 0x78, 0x6F, 0x9D, 0x9E, 0xB2, 0xB1,   // 68..6F
  0x32, 0x75, 0x19, 0x3D, 0xFF, 0x35, 0x8A, 0x7E,   // 70..77
  0x6D, 0x54, 0xC6, 0x80, 0xC3, 0xBD, 0x0D, 0x57,   // 78..7F
  0xDF, 0xF5, 0x24, 0xA9, 0x3E, 0xA8, 0x43, 0xC9,   // 80..87
  0xD7, 0x79, 0xD6, 0xF6, 0x7C, 0x22, 0xB9, 0x03,   // 88..8F
  0xE0, 0x0F, 0xEC, 0xDE, 0x7A, 0x94, 0xB0, 0xBC,   // 90..97
  0xDC, 0xE8, 0x28, 0x50, 0x4E, 0x33, 0x0A, 0x4A,   // 98..9F
  0xA7, 0x97, 0x60, 0x73, 0x1E, 0x00, 0x62, 0x44,   // A0..A7
  0x1A, 0xB8, 0x38, 0x82, 0x64, 0x9F, 0x26, 0x41,   // A8..AF
  0xAD, 0x45, 0x46, 0x92, 0x27, 0x5E, 0x55, 0x2F,   // B0..B7
  0x8C, 0xA3, 0xA5, 0x7D, 0x69, 0xD5, 0x95, 0x3B,   // B8..BF
  0x07, 0x58, 0xB3, 0x40, 0x86, 0xAC, 0x1D, 0xF7,   // C0..C7
  0x30, 0x37, 0x6B, 0xE4, 0x88, 0xD9, 0xE7, 0x89,   // C8..CF
  0xE1, 0x1B, 0x83, 0x49, 0x4C, 0x3F, 0xF8, 0xFE,   // D0..D7
  0x8D, 0x53, 0xAA, 0x90, 0xCA, 0xD8, 0x85, 0x61,   // D8..DF
  0x20, 0x71, 0x67, 0xA4, 0x2D, 0x2B, 0x09, 0x5B,   // E0..E7
  0xCB, 0x9B, 0x25, 0xD0, 0xBE, 0xE5, 0x6C, 0x52,   // E8..EF
  0x59, 0xA6, 0x74, 0xD2, 0xE6, 0xF4, 0xB4, 0xC0,   // F0..F7
  0xD1, 0x66, 0xAF, 0xC2, 0x39, 0x4B, 0x63, 0xB6,   // F8..FF
};

// initialize sbox tables
void kuz_init(kuz_key_t *key)
{
  int i;

  for (i=0; i<256; i++) {
    key->pi[i] = kuz_pi[i];
    key->pi_inv[kuz_pi[i]] = i;
  }
}

Creating the inverse sbox tables with code is smaller than including it as array.

; initialize sbox tables 
; edi = key context
load_sbox:
    pop    ebx               ; ebx = kuz_pi
sbox_loop:
    mov    al, dl            ; al = i
    xlatb                    ; al = kuz_pi[i]
    stosb                    ; key->pi[i] = al
    mov    [esi+eax], dl     ; key->pi_inv[kuz_pi[i]] = i
    inc    dl                ; i++
    jnz    sbox_loop         ; i<256
    popad
    ret

kuz_init:
    pushad
    inc    dh                ; edx=256
    add    edi, edx          ; edi = key->pi
    lea    esi, [edi+edx]    ; esi = key->pi_inv
    call   load_sbox
; The S-Box from section 5.1.1
kuz_pi:
  db   0xFC, 0xEE, 0xDD, 0x11, 0xCF, 0x6E, 0x31, 0x16,   ; 00..07
  db   0xFB, 0xC4, 0xFA, 0xDA, 0x23, 0xC5, 0x04, 0x4D,   ; 08..0F
  db   0xE9, 0x77, 0xF0, 0xDB, 0x93, 0x2E, 0x99, 0xBA,   ; 10..17
  db   0x17, 0x36, 0xF1, 0xBB, 0x14, 0xCD, 0x5F, 0xC1,   ; 18..1F
  db   0xF9, 0x18, 0x65, 0x5A, 0xE2, 0x5C, 0xEF, 0x21,   ; 20..27
  db   0x81, 0x1C, 0x3C, 0x42, 0x8B, 0x01, 0x8E, 0x4F,   ; 28..2F
  db   0x05, 0x84, 0x02, 0xAE, 0xE3, 0x6A, 0x8F, 0xA0,   ; 30..37
  db   0x06, 0x0B, 0xED, 0x98, 0x7F, 0xD4, 0xD3, 0x1F,   ; 38..3F
  db   0xEB, 0x34, 0x2C, 0x51, 0xEA, 0xC8, 0x48, 0xAB,   ; 40..47
  db   0xF2, 0x2A, 0x68, 0xA2, 0xFD, 0x3A, 0xCE, 0xCC,   ; 48..4F
  db   0xB5, 0x70, 0x0E, 0x56, 0x08, 0x0C, 0x76, 0x12,   ; 50..57
  db   0xBF, 0x72, 0x13, 0x47, 0x9C, 0xB7, 0x5D, 0x87,   ; 58..5F
  db   0x15, 0xA1, 0x96, 0x29, 0x10, 0x7B, 0x9A, 0xC7,   ; 60..67
  db   0xF3, 0x91, 0x78, 0x6F, 0x9D, 0x9E, 0xB2, 0xB1,   ; 68..6F
  db   0x32, 0x75, 0x19, 0x3D, 0xFF, 0x35, 0x8A, 0x7E,   ; 70..77
  db   0x6D, 0x54, 0xC6, 0x80, 0xC3, 0xBD, 0x0D, 0x57,   ; 78..7F
  db   0xDF, 0xF5, 0x24, 0xA9, 0x3E, 0xA8, 0x43, 0xC9,   ; 80..87
  db   0xD7, 0x79, 0xD6, 0xF6, 0x7C, 0x22, 0xB9, 0x03,   ; 88..8F
  db   0xE0, 0x0F, 0xEC, 0xDE, 0x7A, 0x94, 0xB0, 0xBC,   ; 90..97
  db   0xDC, 0xE8, 0x28, 0x50, 0x4E, 0x33, 0x0A, 0x4A,   ; 98..9F
  db   0xA7, 0x97, 0x60, 0x73, 0x1E, 0x00, 0x62, 0x44,   ; A0..A7
  db   0x1A, 0xB8, 0x38, 0x82, 0x64, 0x9F, 0x26, 0x41,   ; A8..AF
  db   0xAD, 0x45, 0x46, 0x92, 0x27, 0x5E, 0x55, 0x2F,   ; B0..B7
  db   0x8C, 0xA3, 0xA5, 0x7D, 0x69, 0xD5, 0x95, 0x3B,   ; B8..BF
  db   0x07, 0x58, 0xB3, 0x40, 0x86, 0xAC, 0x1D, 0xF7,   ; C0..C7
  db   0x30, 0x37, 0x6B, 0xE4, 0x88, 0xD9, 0xE7, 0x89,   ; C8..CF
  db   0xE1, 0x1B, 0x83, 0x49, 0x4C, 0x3F, 0xF8, 0xFE,   ; D0..D7
  db   0x8D, 0x53, 0xAA, 0x90, 0xCA, 0xD8, 0x85, 0x61,   ; D8..DF
  db   0x20, 0x71, 0x67, 0xA4, 0x2D, 0x2B, 0x09, 0x5B,   ; E0..E7
  db   0xCB, 0x9B, 0x25, 0xD0, 0xBE, 0xE5, 0x6C, 0x52,   ; E8..EF
  db   0x59, 0xA6, 0x74, 0xD2, 0xE6, 0xF4, 0xB4, 0xC0,   ; F0..F7
  db   0xD1, 0x66, 0xAF, 0xC2, 0x39, 0x4B, 0x63, 0xB6,   ; F8..FF

Key Whitening

Those of you familiar with previous asmcode entries should know by now the purpose of key whitening which essentially improves resistance to brute force attacks. It involves exclusive OR operation on the data being encrypted/decrypted using subkeys. It’s the exact same as AddRoundKey in AES and was first used by Ron Rivest in 1984 for DES-X.

static void kuz_whiten(w128_t *w, kuz_key_t *key, int idx) {
  int i;
  uint32_t *p=(uint32_t*)&key->k[idx].b[0];
  
  for (i=0; i<4; i++) {
    w->w[i] ^= p[i];
  }
}

Assembly is same as that used in AES, Serpent and Twofish.

; key whitening
; esi = w
; edi = key
; edx = round
kuz_whiten:
    pushad
    mov    cl, 4
    shl    edx, cl
w_l0:
    mov    eax, [edi+edx]     ; get 4 bytes of key
    xor    [esi], eax         ; xor state
    cmpsd
    loop   w_l0
    popad
    ret

Linear Transformation

Think of the linear transformation in Serpent, the mix columns operation in Rijndael or the Maximum Distance Seperable code in Twofish. They all serve the same purpose which is to create diffusion.

In Diffusion the statistical structure of the plaintext is dissipated into long range statistics of the cipher text. This is achieved by having each plaintext digit affect the value of many cipher text digits. Which is equivalent to saying that each cipher text digit is affected by many plaintext digits.

// poly multiplication mod p(x) = x^8 + x^7 + x^6 + x + 1
// totally not constant time
static uint8_t kuz_mul_gf256(uint8_t x, uint8_t y)
{
  uint8_t z;

  z = 0;

  while (y) {
    if (y & 1) {
      z ^= x;
    }
    x = (x << 1) ^ (x & 0x80 ? 0xC3 : 0x00);
    y >>= 1;
  }
  return z;
}

static void kuz_lt (w128_t *w, int enc) {
  int     i, j;
  uint8_t x;

  // Linear vector from sect 5.1.2
  const uint8_t kuz_lvec[16] = {
    0x94, 0x20, 0x85, 0x10, 0xC2, 0xC0, 0x01, 0xFB,
    0x01, 0xC0, 0xC2, 0x10, 0x85, 0x20, 0x94, 0x01 };

  // 16 rounds
  for (j=0; j<16; j++)
  {
    if (enc == KUZ_ENCRYPT)
    {
      // An LFSR with 16 elements from GF(2^8)
      x = w->b[15]; // since lvec[15] = 1

      for (i = 14; i >= 0; i--) {
        w->b[i + 1] = w->b[i];
        x ^= kuz_mul_gf256(w->b[i], kuz_lvec[i]);
      }
      w->b[0] = x;
    } else {
      x = w->b[0];

      for (i = 0; i < 15; i++) {
        w->b[i] = w->b[i + 1];
        x ^= kuz_mul_gf256(w->b[i], kuz_lvec[i]);
      }
      w->b[15] = x;
    }
  }
}

Peter was able to combine both operations in assembly using ECX (crypto flag).

; linear transformation
; esi = w
; ecx = enc
lt_l0:
    pop    edi
    push   16
    pop    eax               ; 16 rounds
lt_l1:
    pushad
    mov    dl, 15
    jecxz  lt_l2
    cdq

lt_l2:
    mov    ah, [esi+edx]     ; ah = w->b[00 or 15]

lt_l3:
    mov    bh, [edi+edx-1]     ; bh = kuz_lvec[i]
    mov    al, [esi+edx-1]     ; al = w->b[i]
    jecxz  lt_l4
    mov    bh, [edi+edx]     ; bh = kuz_lvec[i]
    mov    al, [esi+edx+1]   ; al = w->b[i+1]

lt_l4:
    mov    [esi+edx], al     ; w->b[i] = al
    call   kuz_mul_gf256
    dec    edx
    jecxz  lt_l5
    inc    edx
    inc    edx
    cmp    edx, 15

lt_l5:
    jnz    lt_l3
    mov    [esi+edx], ah     ; w->b[00 or 15] = x
    popad
    dec    eax
    jnz    lt_l1
    popad
    ret
kuz_lt:
    pushad
    call   lt_l0
    db 0x94, 0x20, 0x85, 0x10 
    db 0xC2, 0xC0, 0x01, 0xFB
    db 0x01, 0xC0, 0xC2, 0x10
    db 0x85, 0x20, 0x94, 0x01

; poly multiplication 
; mod p(x) = x^8 + x^7 + x^6 + x + 1 
mgf_l0:
    shr    bh, 1             ; if (y & 1), y >>= 1
    jnc    mgf_l1
    xor    ah, al            ; z ^= x
mgf_l1:
    add    al, al            ; if (x & 0x80), x <<= 1
    jnc    kuz_mul_gf256
    xor    al, 0xC3          ; x ^= 0xC3
kuz_mul_gf256:
    test   bh, bh            ; while (y)
    jnz    mgf_l0
    ret

Byte Substitution

The purpose of an sbox or substituion box is to obscure the relationship between key and ciphertext — Shannon’s property of confusion. Claude E. Shannon proposed ideas related to information security in his 1948 paper Communication Theory of Secrecy Systems

Confusion seeks to make a relationship between the statistics of the cipher text and the value of the encryption key as complex as possible. Thus even if the attacker can get some handle on the statistics of the cipher text, the way in which the key was used to produce that cipher text is so complex as to make it difficult to deduce the key.

// substitute bytes
static void kuz_subbytes(w128_t *w, kuz_key_t *key, int enc) {
  int           i;
  const uint8_t *sbox = (enc==KUZ_ENCRYPT) ? key->pi : key->pi_inv;

  for (i=0; i<16; i++) {
    w->b[i] = sbox[w->b[i]];
  }
}

Key scheduling

Based on Feistel Network design. It would certainly be possible to create subkeys during encryption or decryption but I wanted to keep it simple.

/ key setup
void kuz_setkey(kuz_key_t *kuz, const uint8_t key[32])
{
  uint32_t i, j;
  w128_t   c, z;
  w256_t   x;

  kuz_init(kuz);
  
  // copy key to context
  memcpy (&kuz->k[0].b[0], key, 32);

  // copy key to local buffer
  memcpy (&x.b[0], key, 32);
  
  for (i=1; i<=32; i++) {
    // C Value
    memset (&c.b[0], 0, 16);
    c.b[15] = i;    // load round in lsb

    kuz_lt(&c, KUZ_ENCRYPT);

    for (j=0; j<16; j++) {
      z.b[j] = x.b[j] ^ c.b[j];
    }

    kuz_subbytes(&z, kuz, KUZ_ENCRYPT);
    kuz_lt(&z, KUZ_ENCRYPT);

    for (j=0; j<16; j++) {
      z.b[j] ^= x.b[16+j];
    }
    
    memcpy (&x.b[16], &x.b[0], 16);
    memcpy (&x.b[0], &z.b[0], 16);
    
    if ((i & 7) == 0) {
      memcpy (&kuz->k[(i >> 2)].b[0], &x.b[0], 32);
    }
  }
}

Markku’s code uses 64-bit values here which I replaced with REP MOVSB operations. Using MMX code might reduce code further and for 64-bit implementations, it would make more sense to use 64-bit registers instead of stack memory.

; key setup
kuz_setkeyx:
_kuz_setkeyx:
    pushad
    mov    edi, [esp+32+4]   ; ebx=kuz context
    mov    esi, [esp+32+8]   ; esi=key
    mov    ebp, edi
    
    push   32                ; eax=32
    pop    eax
    cdq                      ; edx=0
    call   kuz_init
    
    xchg   eax, ecx
    ; copy key to context
    pushad
    rep    movsb
    popad
    ; copy 256-bit key to local buffer
    pushad                   ; allocate 32 bytes for x
    mov    edi, esp          ; edi=x
    push   edi
    rep    movsb
    pop    edi
    
    pushad                   ; allocate 32 bytes for c,z
    mov    ebx, esp          ; ebx=z
    lea    esi, [ebx+16]     ; esi=c
    ; do 32 rounds
ksk_l0:
    inc    edx
    cmp    dl, 32
    ja     ksk_l3
    
    mov    cl, 16
    pushad                   ; save x
    mov    edi, esi
    xor    eax, eax
    rep    stosb             ; memset (&c.b[0], 0, 16);
    mov    [esi+15], dl      ; c.b[15] = i;
    
    mov    edi, esi
    call   kuz_lt            ; kuz_lt(&c, KUZ_ENCRYPT);
    popad

ksk_l1:
    mov    al, [edi+ecx-1]   ; al=x.b[j]
    xor    al, [esi+ecx-1]   ; ^= c.b[j]
    mov    [ebx+ecx-1], al
    loop   ksk_l1
    
    xchg   esi, ebx     ; XCHG(c, z)
    xchg   edi, ebp     ; XCHG(x, kuz)
    call   kuz_subbytes ; kuz_subbytes(&z, kuz, KUZ_ENCRYPT);
    call   kuz_lt       ; kuz_lt(&z, KUZ_ENCRYPT);
    xchg   esi, ebx     ; XCHG(c, z)
    xchg   edi, ebp     ; XCHG(x, kuz)

ksk_l2:
    mov    al, [edi+ecx+16] ; z.b[j] ^= x.b[16+j];
    xor    [ebx+ecx], al
    inc    ecx
    cmp    cl, 16
    jnz    ksk_l2
    
    ; memcpy (&x.b[16], &x.b[0], 16);
    pushad
    mov    esi, edi
    add    edi, ecx
    rep    movsb
    popad
    
    ; memcpy (&x.b[0], &z.b[0], 16);
    pushad
    mov    esi, ebx
    rep    movsb
    popad
    
    test   dl, 7
    jnz    ksk_l0
    
    ; memcpy (&kuz->k[(i >> 2)].b[0], &x.b[0], 32);
    pushad
    add    ecx, ecx            ; ecx *= 2
    mov    esi, edi
    lea    edi, [ebp+edx*4]
    rep    movsb
    popad
    jmp    ksk_l0
ksk_l3:
    popad
    popad
    popad
    ret

Encryption and Decryption

// encrypt/decrypt a block - 8 bit way
void kuz_encrypt(kuz_key_t *key, void *blk, int enc)
{
  int    i, j;
  w128_t *x=(w128_t*)blk;

  if (enc==KUZ_ENCRYPT)
  {
    i=0;
    for (;;)
    {
      kuz_whiten(x, key, i);
      if (++i == KUZ_ROUNDS) break;
      kuz_subbytes(x, key, enc);
      kuz_lt(x, enc);
    }
  } else {
    i=KUZ_ROUNDS;
    for (;;)
    {
      i--;
      kuz_whiten(x, key, i);
      if (i == 0) break;
      kuz_lt(x, enc);
      kuz_subbytes(x, key, enc);
    }
  }
}

The assembly takes advantage of similarity between both encryption/decryption operations by storing pointer to functions in registers and swapping them depending on enc parameter stored in ecx.

; encrypt/decrypt a block
kuz_encryptx:
_kuz_encryptx:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   eax, edi          ; edi=key
    lodsd
    push   eax               ; save blk
    lodsd
    cdq                      ; edx=0
    xchg   eax, ecx          ; ecx=enc
    pop    esi               ; esi=blk
    call   load_func
    ; kuz_whiten, kuz_subbytes and kuz_lt functions
    ; are placed here.
load_func:
    pop    ebx
    lea    eax, [ebx + (kuz_lt - kuz_whiten)]
    lea    ebp, [eax + (kuz_subbytes - kuz_lt)]
    
    jecxz  kde_l2
    xchg   eax, ebp
    mov    dl, KUZ_ROUNDS
    jmp    kde_l1
kde_l0:
    call   ebp ; kuz_lt or kuz_subbytes
    call   eax ; kuz_subbytes or kuz_lt
kde_l1:
    dec    edx
kde_l2:
    call   ebx ; kuz_whiten
    inc    ecx
    test   edx, edx
    loop   kde_l3
    inc    edx
    inc    edx
    cmp    dl, KUZ_ROUNDS+1
kde_l3:
    jnz    kde_l0
    popad
    ret

Summary

Please refer to sources here and here for any future updates.

architecture size
x86 615
x64 n/a

I cannot say there’s any advantage to using this cipher compared to Rijndael (AES), Serpent or Twofish. The design properties of all these ciphers are very similar but Kuznyechik is slower than Rinjdael. They all use similar functions but the advantage of AES (Rijndael) is that it’s likely to be more scrutinized by the cryptographic community than Kuznyechik, Serpent or Twofish. Having said all that, it’s still nice to have another block cipher people can study and learn from which has been certified by organizations other than NIST.

Posted in assembly, cryptography, encryption, programming, security | Tagged , , , , , , | Leave a comment

HC-256 Stream Cipher

Introduction

HC-256 is a software-efficient stream cipher designed and written by Wu Hongjun in 2004. In 2008, it was selected for the final portfolio of eSTREAM, the stream cipher project of the European Network of Excellence for Cryptology (ECRYPT, 2004-2008).

HC-256 was designed to demonstrate that strong stream cipher can be built from nonlinear feedback function and nonlinear output function. The large secret states of the two ciphers are updated in a nonlinear way, and table lookup (with changing tables) is used in the generation of keystream.

Stack exceptions problems

One of the negative features of this cipher is the large RAM requirement when generating key dependent sboxes. It requires approx. between 16-20KB but you cannot just allocate 20KB of stack in your procedure using SUB, ADD or ENTER.

The key generating function uses the following code

;
    xor    ecx, ecx          ; ecx=0
    mul    ecx               ; eax=0, edx=0
    
    mov    cl, 5             ; ecx=5
    mov    dh, 16            ; edx=4096
    ; allocate stack memory in 4096 byte blocks
    ; 4 x 4096 = 16384 bytes, 
    ; additional 4096 bytes just in case (not needed?)
xalloca:
    sub    esp, edx          ; subtract page size
    test   [esp], esp        ; page probe
                             ; causes pages of memory to be 
                             ; allocated via the guard page 
                             ; scheme (if possible)
    loop   xalloca           ; raises exception if 
                             ; unable to allocate

If you don’t perform the page probe after allocating the space required, an exception will occur once you try to use that address of memory. I never imagined this could be an issue… but again, it’s one of those things you need to consider when using lots of stack space.

Key Initialization

The initialization process of HC-256 consists of expanding the key and initialization vector into P and Q (similar to the message setup in SHA-256) and running the cipher 4096 steps without generating output.

Steps 4 and 5 are obviously from the SHA-256 hash algorithm.

#define SIG0(x)(ROTR32((x),  7) ^ ROTR32((x), 18) ^ ((x) >>  3))
#define SIG1(x)(ROTR32((x), 17) ^ ROTR32((x), 19) ^ ((x) >> 10))

// both key and iv must be 32 bytes / 256-bits!
void hc256_setkeyx(hc_ctx *c, void *kiv)
{
    uint32_t W[2560], i;
    uint32_t *pq, *wp;
    
    // 1. set counter
    c->ctr = 0;
    
    // 2. zero initialize key and iv buffers
    for (i=0; i<16; i++) {
      c->kiv[i] = 0;
    }
    
    // 3. setup the key and iv
    for (i=0; i<64; i++) {
      c->key[i>>2] = c->key[i>>2] | ((uint8_t*)kiv)[i];
      c->key[i>>2] = ROTL32(c->key[i>>2], 8);
    }
    
    // 4. copy 16 words to work buffer
    for (i=0; i<16; i++) { 
      W[i] = c->kiv[i];
    }

    // 5. expand buffer using SHA-256 macros
    for (i=16; i<2560; i++) {
      W[i] = SIG1(W[i-2])  + W[i-7]  + 
             SIG0(W[i-15]) + W[i-16] + i; 
    }
    
    pq = c->T;
    wp = &W[512];
    
    // 6. set the P and Q tables
    for (i=0; i<2048; i++) { 
      *pq++ = *wp++;
    }
    
    // 7. run cipher 4096 iterations before generating output
    for (i=0; i<4096; i++) {
      generate(c);
    }
}
hc256_setkeyx:
_hc256_setkeyx:
    pushad
    mov    edi, [esp+32+4]   ; edi=c
    mov    esi, [esp+32+8]   ; esi=kiv
    
    xor    ecx, ecx          ; ecx=0
    mul    ecx               ; eax=0, edx=0
    
    mov    cl, 5             ; ecx=5
    mov    dh, 16            ; edx=4096
    ; allocate stack memory in 4096 byte blocks
    ; 4 x 4096 = 16384 bytes, 
    ; additional 4096 bytes just in case (not needed?)
xalloca:
    sub    esp, edx          ; subtract page size
    test   [esp], esp        ; page probe
                             ; causes pages of memory to be 
                             ; allocated via the guard page scheme (if possible)
    loop   xalloca           ; raises exception if unable to allocate
    mov    ebx, esp          ; ebx=W
    
    push   edx               ; save 4096    
    push   edi               ; save ptr to c
    stosd                    ; c->ctr=0
    push   edx               ; save 4096
    push   edi               ; save ptr to c->T
    push   edx               ; save 4096
    
    ; 2. copy 512-bits of key/iv to workspace
    mov    cl, 64
    mov    edi, ebx          ; edi=W
    rep    movsb
    
    mov    esi, ebx          ; esi=W
    mov    cl, 16
expand_key:
    ; eax = SIG0(W[i-15])
    mov    eax, [edi - 15*4]
    mov    edx, eax
    mov    ebp, eax
    ror    eax, 7
    ror    edx, 18
    shr    ebp, 3
    xor    eax, edx
    xor    eax, ebp
    ; ebx = SIG1(W[i-2])
    mov    ebx, [edi - 2*4]
    mov    edx, ebx
    mov    ebp, ebx
    ror    ebx, 17
    ror    edx, 19
    shr    ebp, 10
    xor    ebx, edx
    xor    ebx, ebp
    ; W[i] = ebx + W[i-16] + eax + w[i-7] + i
    add    eax, [edi - 16*4]
    add    ebx, [edi -  7*4]
    add    eax, ebx
    add    eax, ecx
    stosd
    inc    ecx
    cmp    ecx, [esp]        ; 4096 words
    jnz    expand_key
    
    pop    ecx               ; ecx=4096
    pop    edi               ; edi=c->T
    shr    ecx, 1            ; /=2 for 2048 words
    add    esi, ecx          ; add 512*4
    rep    movsd
    
    pop    ecx               ; ecx=4096
    pop    edi               ; edi=ctx
sk_l3:
    call   hc256_generatex
    loop   sk_l3
    
    pop    eax              ; eax=4096
    lea    esp, [esp+eax*4] ; free stack
    add    esp, eax
    popad
    ret

Keystream Generation

At each step, one element of a table is updated and one 32-bit output is generated. An S-box is used to generate only 1024 outputs, then it is updated in the next 1024 steps. The keystream generation process of HC-256 is given below.

uint32_t h(uint32_t T[], uint32_t x) {
  return T[x & 255]                 + 
         T[256 + ((x >>  8) & 255)] + 
         T[512 + ((x >> 16) & 255)] + 
         T[768 + ((x >> 24) & 255)];  
}

// step function
uint32_t generate(hc_ctx* c)
{
    uint32_t r, i, i3, i10, i12, i1023;
    uint32_t *x0, *x1, *t;
    
    // 1. obtain indices based on counter
    i     = c->ctr     & 0x3ff;
    i3    = (i - 3)    & 0x3ff;
    i10   = (i - 10)   & 0x3ff;
    i12   = (i - 12)   & 0x3ff;
    i1023 = (i - 1023) & 0x3ff;

    if (c->ctr < 1024) {
      x0=c->P;
      x1=c->Q;
    } else {
      x0=c->Q;
      x1=c->P;
    }
    
    // 2.
    x0[i] = x0[i]   + 
            x0[i10] + 
            (ROTR32(x0[i3], 10) ^ ROTR32(x0[i1023], 23)) + 
            x1[(x0[i3] ^ x0[i1023]) & 0x3ff];
         
    // 3. 
    r = h(x1, x0[i12]) ^ x0[i];
    
    // 4. update counter modulo 2048
    c->ctr = (c->ctr+1) & 0x7ff;
    
    return r;
}
; expects ctx in edi
hc256_generatex:
_hc256_generatex:
    pushad
    xor    edx, edx
    mov    dh, 8               ; edx = 2048
    mov    esi, edi            ; esi = c
    lodsd                      ; eax = c->ctr
    push   eax                 ; save  c->ctr
    inc    eax                 ; c->ctr++
    dec    edx                 ; edx = 2047
    and    eax, edx            ; c->ctr &= 2047
    stosd                      ; save new c->ctr
    pop    eax                 ; restore old c->ctr
    lea    edi, [esi+edx*2+2]  ; x0  = c->Q
    shr    edx, 1              ; edx = 1023
    push   edx                 ; save 1023
    cmp    eax, edx            ; c->ctr > 1023
    jbe    gen_l0
    xchg   esi, edi            ; swap Q and P ptrs
gen_l0:
    and    eax, edx            ; i &= 1023
    
    lea    ebx, [eax-3]        ; i3 = (i - 3) & 1023;
    and    ebx, edx
    
    lea    ecx, [eax-10]       ; i10 = (i - 10) & 1023;
    and    ecx, edx
    
    lea    ebp, [eax-12]       ; i12 = (i - 12) & 1023;
    and    ebp, edx
    mov    ebp, [esi+ebp*4]
    push   ebp                 ; save i12
    
    mov    ebp, eax            ; i1023 = (i - 1023) & 1023;
    sub    ebp, edx
    and    ebp, edx
    
    push   eax                 ; save i
    mov    eax, [esi+eax*4]    ; eax  = x0[i]
    add    eax, [esi+ecx*4]    ; eax += x0[i10]
    
    mov    ebp, [esi+ebp*4]    ; ebp  = x0[i1023]
    mov    ebx, [esi+ebx*4]    ; ebx  = x0[i3]
    mov    ecx, ebx            ; ecx  = x0[i3]
    xor    ecx, ebp            ; ecx ^= x0[i1023]
    and    ecx, edx            ; ecx &= 0x3ff
    add    eax, [edi+ecx*4]    ; ecx  = x1[(x0[i3] ^ x0[i1023]) & 1023]
    ror    ebx, 10             ; ebx  = ROTR32(x0[i3], 10)
    rol    ebp, 9              ; ebp  = ROTL32(x0[i1023], 9)
    xor    ebx, ebp            ;
    add    eax, ebx            ; 
    pop    ebx                 ; ebx = i
    mov    [esi+ebx*4], eax    ; eax = (x0[i] += eax)
    pop    edx                 ; edx = i12
    
    pop    ebp                 ; ebp=1023
    inc    ebp                 ; ebp=1024
    xchg   eax, ecx
    xor    eax, eax            ; r=0
gen_l1:
    movzx  ebx, dl
    add    eax, [edi+ebx*4]
    add    edi, ebp            ; x1 += 1024/4
    shr    edx, 8              ; w1 >>= 8
    jnz    gen_l1
    
    xor    eax, ecx            ; r ^= w0;
    mov    [esp+_eax], eax     ; return r;
    popad
    ret

Encryption and Decryption

The keystream is XORed with the message for encryption. The decryption is to XOR the keystream with the ciphertext.

// encrypt/decrypt data in place
void hc256_crypt(hc_ctx *c, void *in, uint32_t inlen)
{
  uint32_t i, w;
  uint8_t *p=(uint8_t*)in;
  
  // encrypt all bytes
  for (i=0; i<inlen;) {
    w = generate(c);
    // encrypt 4 bytes or until i equals inlen
    do {
      p[i++] ^= (w & 255);
      w >>= 8;
    } while (i < inlen && w != 0);
  }
}
hc256_cryptx:
_hc256_cryptx:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   edi, eax          ; edi=ctx
    lodsd
    xchg   edx, eax          ; edx=in
    lodsd
    xchg   ecx, eax          ; ecx=len
hc_l0:                       ; .repeat
    jecxz  hc_l2             ; .break .if ecx == 0
    call   hc256_generatex
hc_l1:
    xor    [edx], al         ; *in ^= (w0 & 0xFF)
    inc    edx               ; in++
    shr    eax, 8            ; w0 >>= 8
    loopnz hc_l1             ; .while ecx != 0 && eax != 0
    jmp    hc_l0
hc_l2:
    popad
    ret

Test Vector results

HC256 test #1 - OK
5b, 07, 89, 85, d8, f6, f3, 0d,
42, c5, c0, 2f, a6, b6, 79, 51,
53, f0, 65, 34, 80, 1f, 89, f2,
4e, 74, 24, 8b, 72, 0b, 48, 18,

HC256 test #2 - OK
af, e2, a2, bf, 4f, 17, ce, e9,
fe, c2, 05, 8b, d1, b1, 8b, b1,
5f, c0, 42, ee, 71, 2b, 31, 01,
dd, 50, 1f, c6, 0b, 08, 2a, 50,

HC256 test #3 - OK
1c, 40, 4a, fe, 4f, e2, 5f, ed,
95, 8f, 9a, d1, ae, 36, c0, 6f,
88, a6, 5a, 3c, c0, ab, e2, 23,
ae, b3, 90, 2f, 42, 0e, d3, a8,

Summary

architecture size
x86 272
x64 n/a

Since it was chosen for the final portfolio of eSTREAM, it’s likely to have undergone extensive cryptanalysis. It’s unpatented, suitable for 32-bit CPUs and the assembly output of hc256x.c is approx. 454 bytes which is just a little bigger than ChaCha20 (424 bytes) a close relative of Salsa20 which was also selected for the final portfolio.

Please refer to here for sources, documentation and any future updates that may not be shown here.

Known attacks

Improved Distinguishing Attacks on HC-256 by Gautham Sekar and Bart Preneel
A Cache Timing Analysis of HC-256 by Erik Zenner

Posted in assembly, cryptography, encryption, programming, security | Tagged , , , , | 1 Comment

Rabbit Stream Cipher

Introduction

Rabbit is a stream cipher designed in 2003 by Martin Boesgaard, Mette Vesterager, Thomas Pedersen, Jesper Christiansen and Ove Scavenius. It was published in 2005 and selected as a software cipher for the eSTREAM portfolio in 2008.

It’s designed for high performance in software implementations but is also known to work well in hardware environments. Both key setup and encryption are very fast, making the algorithm particularly suited for all applications where large amounts of data or large numbers of data packages have to be encrypted.

Examples include, but are not limited to, server-side encryption, multimedia encryption, hard-disk encryption, and encryption on limited-resource devices. It is currently not known to exhibit any weaknesses and so I decided to implement it in x86 assembly after much tweaking of original C code.

A description by the authors can be found in RFC4503

Technically, Rabbit consists of a pseudorandom bitstream generator that takes a 128-bit key and a 64-bit initialization vector (IV) as input and generates a stream of 128-bit blocks. Encryption is performed by combining this output with the message, using the exclusive-OR operation. Decryption is performed in exactly the same way as encryption.

About the assembly code

This was a joint effort between Peter Ferrie and myself.

It didn’t initially appear like it would be as compact compared with Salsa20 or HC-256 which are also from the eSTREAM portfolio.

However, the only other cipher from eSTREAM software portfolio was SOSEMANUK which is based on Serpent and it seemed pointless implementing a stream cipher based on a block cipher which can already be used with CTR mode to work as a stream cipher.

Optimization trick with PUSHAD/POPAD

Although this has nothing to do with the cipher itself, I think it’s worth noting. While writing the assembly code for RABBIT, I accidently “discovered” something that I didn’t realize was possible with PUSHAD instruction. It may only work on Windows so if you use it in your own code, careful consideration should be given to whether or not it will work across all operating systems running on x86 architecture and not just Windows. UPDATE: Peter Ferrie has confirmed this is a feature of the CPU itself and will work on Linux also.

When PUSHAD is executed, it saves all the general purpose registers including ESP on the stack. The total stack space occupied for this operation is 32-bytes. But what I didn’t know is that it’s possible to overwrite ESP and when POPAD is executed, the original value of ESP remains unaffected.

So what?” you may say.. but think about it. This is a neat way to allocate and release 32-bytes of memory without worrying about overwriting ESP.

If the original value of ESP were not restored, only 12 bytes would be available, but that’s still useful for some code. So instead of using SUB, ADD or ENTER..etc to allocate 32-bytes or more in future, consider using PUSHAD since it doesn’t appear to matter if ESP is overwritten.

Try it for yourself with the following code:

;
    pushad                   ; save all registers
    xor    eax, eax          ; eax=0
    push   8                 ; overwrite 8 GP registers
    pop    ecx
    mov    edi, esp
    rep    stosd             ; memset(edi, 0, 32);
    popad                    ; like "magic", ESP is restored

I’ve updated other sources to use this “trick” and they all work fine.

It is used here in RABBIT_setiv and RABBIT_crypt

Key setup

void RABBIT_setkey(RABBIT_ctx *c, const void *key)
{
   uint32_t k0, k1, k2, k3, i;
   rabbit_blk *k=(rabbit_blk*)key;

   // Generate four subkeys
   k0 = k->d[0];
   k1 = k->d[1];
   k2 = k->d[2];
   k3 = k->d[3];

   // Generate initial state variables
   c->m.x[0] = k0;
   c->m.x[2] = k1;
   c->m.x[4] = k2;
   c->m.x[6] = k3;

   c->m.x[1] = U32V(k3<<16) | (k2>>16);
   c->m.x[3] = U32V(k0<<16) | (k3>>16);
   c->m.x[5] = U32V(k1<<16) | (k0>>16);
   c->m.x[7] = U32V(k2<<16) | (k1>>16);

   // Generate initial counter values
   c->m.c[0] = ROTL32(k2, 16);
   c->m.c[2] = ROTL32(k3, 16);
   c->m.c[4] = ROTL32(k0, 16);
   c->m.c[6] = ROTL32(k1, 16);

   c->m.c[1] = (k0&0xFFFF0000) | (k1&0xFFFF);
   c->m.c[3] = (k1&0xFFFF0000) | (k2&0xFFFF);
   c->m.c[5] = (k2&0xFFFF0000) | (k3&0xFFFF);
   c->m.c[7] = (k3&0xFFFF0000) | (k0&0xFFFF);

   // Clear carry bit
   c->m.carry = 0;

   // Iterate the system four times
   for (i=0; i<4; i++) {
      RABBIT_next_state(&c->m);
   }

   // Modify the counters
   for (i=0; i<8; i++) {
      c->m.c[i] ^= c->m.x[(i+4) & 7];
   }

   // Copy master instance to work instance
   memcpy (&c->w.x[0], &c->m.x[0], sizeof(RABBIT_state));
}

This assembly…

; Key setup
RABBIT_setkeyx:
_RABBIT_setkeyx:
    pushad
    mov    edi, [esp+32+4]
    mov    esi, [esp+32+8]
    ; ---------------------
    ; Generate four subkeys
    ; ---------------------
    lodsd
    xchg   eax, edx
    lodsd
    xchg   eax, ebx
    lodsd
    xchg   eax, ecx
    lodsd
    xchg   eax, edx
    ; --------------------------------
    ; Generate initial state variables
    ; --------------------------------
    pushad
    stosd

    ; c->m.x[1] = U32V(k3<<16) | (k2>>16);
    mov    ebp, edx
    shld   ebp, ecx, 16
    xchg   ebp, eax
    stosd

    xchg   ebx, eax
    stosd

    ; c->m.x[3] = U32V(k0<<16) | (k3>>16);
    mov    ebx, ebp
    shld   ebx, edx, 16
    xchg   ebx, eax
    stosd

    xchg   ecx, eax
    stosd

    ; c->m.x[5] = U32V(k1<<16) | (k0>>16);
    mov    ecx, ebx
    shld   ecx, ebp, 16
    xchg   ecx, eax
    stosd

    xchg   edx, eax
    stosd

    ; c->m.x[7] = U32V(k2<<16) | (k1>>16);
    mov    edx, ecx
    shld   edx, ebx, 16
    xchg   edx, eax
    stosd
    
    ; -------------------------------
    ; Generate initial counter values
    ; -------------------------------
    rol    ecx, 16
    xchg   ecx, eax
    stosd

    ; c->m.c[1] = (k0&0xFFFF0000) | (k1&0xFFFF);
    xchg   ebp, eax         ; k1.lo
    xchg   bx, ax
    stosd

    rol    edx, 16
    xchg   edx, eax
    stosd

    ; c->m.c[3] = (k1&0xFFFF0000) | (k2&0xFFFF);
    rol    ecx, 16          ; k2.lo
    xchg   ecx, eax
    stosd

    xchg   edx, eax
    xchg   bx, ax
    rol    eax, 16
    stosd

    ; c->m.c[5] = (k2&0xFFFF0000) | (k3&0xFFFF);
    xchg   ecx, eax         ; k3.lo
    xchg   bp, ax
    rol    eax, 16
    stosd

    xchg   edx, eax
    xchg   bx, ax
    rol    eax, 16
    stosd

    ; c->m.c[7] = (k3&0xFFFF0000) | (k0&0xFFFF);
    xchg   ecx, eax         ; k0.lo
    xchg   bp, ax
    rol    eax, 16
    stosd
    
    xor    eax, eax
    stosd
    popad

    ; -----------------------------
    ; Iterate the system four times
    ; -----------------------------
    push   4
    pop    ecx
rsk_l0:
    push   edi
    call   RABBIT_next_state
    pop    edi
    loop   rsk_l0
    
    ; -------------------
    ; Modify the counters
    ; -------------------
    xchg   eax, ecx
    cdq
rsk_l1:
    ; c->m.c[i] ^= c->m.x[(i+4) & 7];
    lea    ecx, [eax+edx+4]
    and    ecx, 7
    mov    ebx, [edi+ecx*4]
    xor    [edi+eax*4+32], ebx
    inc    eax
    cmp    al, 8
    jne    rsk_l1
    
    ; -------------------------------------
    ; Copy master instance to work instance
    ; -------------------------------------
    mov    cl, 68
    mov    esi, edi
    add    edi, ecx
    rep    movsb
    popad
    ret

IV setup

// IV setup
void RABBIT_setiv(RABBIT_ctx *c, const void* iv)
{
   rabbit_blk *v=(rabbit_blk*)iv;
   uint32_t   sv[4];
   int        i;

   // Generate four subvectors
   sv[0] = v->d[0];
   sv[2] = v->d[1];

   sv[1] = (sv[0]>>16) | (sv[2]&0xFFFF0000);
   sv[3] = (sv[2]<<16) | (sv[0]&0x0000FFFF);

   // Modify counter values
   for (i=0; i<8; i++) {
      c->w.c[i] = c->m.c[i] ^ sv[i & 3];
   }

   // Copy state variables but not carry flag
   memcpy (&c->w.x[0], &c->m.x[0], 32);

   // Iterate the system four times
   for (i=0; i<4; i++) {
      RABBIT_next_state(&c->w);
   }
}

Assembly…

; IV setup
RABBIT_setivx:
_RABBIT_setivx:
    ; save registers
    pushad
    mov    edx, [esp+32+4]   ; ctx
    mov    esi, [esp+32+8]   ; iv
    ; Generate four subvectors
    lodsd
    xchg   edi, eax          ; sv[0] = v->d[0];
    lodsd
    xchg   ebp, eax          ; sv[2] = v->d[1];

    pushad                   ; create sv variable (32 bytes), init first and third

    ror    edi, 16           ; sv[0] >> 16
    
    xchg   ebp, eax          ; sv[1] = (sv[0]>>16) | (sv[2]&0xFFFF0000);
    xchg   di, ax
    mov    [esp+4], eax
    
    ror    edi, 16           ; sv[3] = (sv[2]<<16) | (sv[0]&0x0000FFFF);
    mov    [esp+12], edi
    ; Modify counter values
    xor    ecx, ecx
rsv_l0:
    mov    eax, [edx+ecx*4+32]   ; eax = c->m.c[i]
    mov    ebx, ecx
    and    ebx, 3
    xor    eax, [esp+ebx*4]      ; eax ^= sv[i & 3]
    mov    [edx+ecx*4+100], eax  ; c->w.c[i] = eax
    inc    ecx
    cmp    cl, 8
    jnz    rsv_l0
    ; Copy master state variables to work
    pushad
    mov    esi, edx
    lea    edi, [esi+68]
    rep    movsd
    popad
    ; Iterate the system four times
    mov    cl, 4
rsv_l1:
    lea    eax, [edx+68]
    push   eax
    call   RABBIT_next_state ; RABBIT_next_state(&c->w);
    pop    eax
    loop   rsv_l1
    
    ; release sv of 32 bytes
    popad
    ; restore registers
    popad
    ret

Encryption/Decryption

void RABBIT_crypt(RABBIT_ctx *c, void* input, uint32_t inlen)
{
   uint32_t   i, len;
   rabbit_blk x;
   uint8_t    *in=(uint8_t*)input;

   for (;;)
   {
     // break on zero length
     if (inlen==0) break;

     // update state
     RABBIT_next_state(&c->w);

     for (i=0; i<4; i++) {
       x.d[i] = c->w.x[i<<1];
     }

     x.d[0] ^= (c->w.x[5]>>16) ^ (c->w.x[3]<<16);
     x.d[1] ^= (c->w.x[7]>>16) ^ (c->w.x[5]<<16);
     x.d[2] ^= (c->w.x[1]>>16) ^ (c->w.x[7]<<16);
     x.d[3] ^= (c->w.x[3]>>16) ^ (c->w.x[1]<<16);

     for (i=0; i<16 && inlen!=0; i++) {
       *in++ ^= x.b[i];
       inlen--;
     }
   }
}

Assembly

; encrypt/decrypt a message of any size
RABBIT_cryptx:
_RABBIT_cryptx:
    pushad                   ; save registers
    lea    esi, [esp+32+4]   ; esi = parameters
    pushad                   ; alloc 32-bytes on stack
    lodsd
    xchg   ebx, eax          ; ebx=c
    lodsd
    xchg   ecx, eax          ; ecx=input
    lodsd
    xchg   ecx, eax          ; ecx=inlen
    xchg   eax, esi          ; esi=input
    add    ebx, 68           ; ebx=&c->w.x[0]
rc_l0:
    jecxz  rc_l3             ; break if ecx==0
    push   ebx
    call   RABBIT_next_state
    pop    eax
    
    xor    eax, eax
    cdq
    mov    edi, esp
    pushad
    ; for (i=0; i<4; i++) {
    ;   x.d[i] = c->w.x[i<<1];
    ; }
rc_l1:
    mov    ecx, [ebx+eax*8]
    mov    [edi+eax*4], ecx
    inc    eax
    cmp    al, 4
    jnz    rc_l1
    
    mov    esi, [ebx+1*4]    ; ebx = c->w.x[1]
    mov    ecx, [ebx+3*4]    ; ecx = c->w.x[3]
    mov    edx, [ebx+5*4]    ; edx = c->w.x[5]
    mov    ebp, [ebx+7*4]    ; ebp = c->w.x[7]
    mov    eax, ecx
    ; x.d[0] ^= (c->w.x[5]>>16) ^ (c->w.x[3]<<16);
    shld   ecx, edx, 16  ; x5 >> 16, x3 << 16
    xor    [edi+4*0], ecx
    ; x.d[1] ^= (c->w.x[7]>>16) ^ (c->w.x[5]<<16);
    shld   edx, ebp, 16
    xor    [edi+4*1], edx
    ; x.d[2] ^= (c->w.x[1]>>16) ^ (c->w.x[7]<<16);
    shld   ebp, esi, 16
    xor    [edi+4*2], ebp
    ; x.d[3] ^= (c->w.x[3]>>16) ^ (c->w.x[1]<<16);
    shld   esi, eax, 16
    xor    [edi+4*3], esi
    popad
    
    ; for (i=0; i<16 && inlen!=0; i++) {
    ;   *in++ ^= x.b[i];
    ;   inlen--;
    ; }
    mov    dl, 16            ; do 16 bytes or remaining
rc_l2:
    mov    al, [edi]         ; al = x.b[i]
    inc    edi
    xor    [esi], al         ; *in ^= al
    inc    esi               ; in++
    dec    edx               ; i--
    loopnz rc_l2             ; break if --len==0 or i==0
    inc    ecx
    loop   rc_l0
rc_l3:
    popad                    ; free stack
    popad                    ; restore registers
    ret

Summary

architecture size
x86 458
x64 n/a

Please refer to here and here for sources, documentation and any future updates.

Salsa20 which was also chosen for eSTREAM portfolio, ChaCha20 and HC-256 are all significantly smaller than Rabbit and probably just as secure since they support 256-bit keys. I have not tested each algorithm for speed but with multiplication/division used here, it may be slower than optimized versions of Salsa20 and ChaCha20.

The final size for hand written assembly is approx. 450 bytes and approx. 700 for code generated by MSVC

Posted in assembly, cryptography, encryption, security | Tagged , , , , | 1 Comment

Salsa20 Stream Cipher

Introduction

Salsa20 or Snuffle 2005 is a stream cipher designed, written and published in 2005 by mathematician D.J Bernstein. It supports 128/256-bit keys and 64-bit nonce values.

Another version XSalsa20 supports nonces up to 192-bits if 64-bits is insufficient to what you need.

It was selected for the final portfolio of the eStream (ECRYPT Stream Cipher Project) so we can presume it’s undergone extensive cryptanalysis during the selection process.

The implementation here only supports 256-bit keys.

About the assembly code

ChaCha20 was already implemented and is based on Salsa20 although ChaCha20 is apparently a better design for performance without losing security. The x86 code was a joint effort between Peter Ferrie and myself while writing ChaCha20 and so few changes were made to implement Salsa20.

Context/State

The keystream buffer is 64-bytes or 512-bits which gets updated for each time the Salsa20 core is invoked.

#define S20_BLK_LEN 64

typedef union _s20_blk_t {
  uint8_t  v8[S20_BLK_LEN];
  uint16_t v16[S20_BLK_LEN/2];
  uint32_t v32[S20_BLK_LEN/4];
  uint64_t v64[S20_BLK_LEN/8];
} s20_blk;

typedef struct _s20_ctx_t {
  s20_blk s;
} s20_ctx;

Key generation

This function expects 256-bit keys. The ChaCha20 stream cipher (Snuffle 2008) has a better design IMHO but hasn’t received as much scrutiny as Salsa20 according to the author.

// setup the key, must be 256-bits with 64-bit nonce
void s20_setkey(s20_ctx *c, void *key, void *nonce)
{
  s20_blk *iv=(s20_blk*)nonce;
  s20_blk *k=(s20_blk*)key;
  
  c->s.v32[ 0] = 0x61707865;
  c->s.v32[ 1] = k->v32[0];
  c->s.v32[ 2] = k->v32[1];
  c->s.v32[ 3] = k->v32[2];
  c->s.v32[ 4] = k->v32[3];
  c->s.v32[ 5] = 0x3320646E;
  c->s.v32[ 6] = iv->v32[0];
  c->s.v32[ 7] = iv->v32[1];
  c->s.v32[ 8] = 0;
  c->s.v32[ 9] = 0;
  c->s.v32[10] = 0x79622D32;
  c->s.v32[11] = k->v32[4];
  c->s.v32[12] = k->v32[5];
  c->s.v32[13] = k->v32[6];
  c->s.v32[14] = k->v32[7];
  c->s.v32[15] = 0x6B206574;
}
_s20_setkeyx:
s20_setkeyx:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   edi, eax
    lodsd
    push   eax
    lodsd
    xchg   ebx, eax
    pop    esi
    mov    eax, 061707865h
    cdq
    stosd
    ; copy 16 bytes of 256-bit key
    movsd    
    movsd    
    movsd    
    movsd    
    mov    eax, 03320646Eh
    stosd
    ; copy 64-bit iv to 6,7
    xchg   esi, ebx
    movsd
    movsd
    ; zero 64-bit counter at 8,9
    xchg   eax, edx
    stosd
    stosd
    ; store 32-bits at 10
    mov    eax, 079622D32h
    stosd
    ; store remainder of key at 11-14
    xchg   esi, ebx
    movsd    
    movsd    
    movsd    
    movsd
    ; store last 32-bits
    mov    eax, 06B206574h
    stosd
    popad
    ret

Salsa20 Core

The 64-byte input x to Salsa20 is viewed in little-endian form as 16 words x0, x1, x2, …, x15 in {0,1,…,2^32-1}. These 16 words are fed through 320 invertible modifications, where each modification changes one word. The resulting 16 words are added to the original x0, x1, x2, …, x15 respectively modulo 2^32, producing, in little-endian form, the 64-byte output Salsa20(x).

Each modification involves xor’ing into one word a rotated version of the sum of two other words modulo 2^32. Thus the 320 modifications involve, overall, 320 additions, 320 xor’s, and 320 rotations. The rotations are all by constant distances.

The entire series of modifications is a series of 10 identical double-rounds. Each double-round is a series of 2 rounds. Each round is a set of 4 parallel quarter-rounds. Each quarter-round modifies 4 words.

// transforms block using ARX instructions
void QUARTERROUND(s20_blk *blk, uint16_t idx) 
{
    uint32_t a, b, c, d;
    uint32_t *x=(uint32_t*)&blk->v8;
    
    a = (idx         & 0xF);
    b = ((idx >>  4) & 0xF);
    c = ((idx >>  8) & 0xF);
    d = ((idx >> 12) & 0xF);

    x[b] ^= ROTL32((x[a] + x[d]), 7);
    x[c] ^= ROTL32((x[b] + x[a]), 9);
    
    x[d] ^= ROTL32((x[c] + x[b]),13);
    x[a] ^= ROTL32((x[d] + x[c]),18);
}

// encrypt/decrypt 64 byte block
// do not call directly, use s20_encrypt instead
void s20_core (s20_ctx *c, uint8_t *in, uint32_t len)
{
    s20_blk x;
    int     i, j;

    // 16-bit integers of each index
    uint16_t idx16[8]=
    { 0xC840, 0x1D95, 0x62EA, 0xB73F, 
      0x3210, 0x4765, 0x98BA, 0xEDCF };
    
    // copy state to local space
    for (i=0; i<16; i++) { 
      x.v32[i] = c->s.v32[i];
    }
    // apply 20 rounds
    for (i=0; i<20; i+=2) {
      for (j=0; j<8; j++) {
        QUARTERROUND(&x, idx16[j]);
      }
    }
    // add state to x
    for (i=0; i<16; i++) {
      x.v32[i] += c->s.v32[i];
    }
    // xor input with x
    for (i=0; i<len; i++) {
      ((uint8_t*)in)[i] ^= x.v8[i];
    }
    // update block counter
    c->s.v64[4]++;
    // stopping at 2^70 bytes per nonce is user's responsibility
}
%define a eax
%define b ebx
%define c edx
%define d esi
%define x edi

%define t0 ebp

; void QUARTERROUND(s20_blk *blk, uint16_t index)
; expects edi points to x array
QUARTERROUND:
    pushad
    push   a                ; save eax
    xchg   ah, al
    aam    16
    
    movzx  d, ah
    movzx  c, al

    pop    a
    aam    16
    
    movzx  b, ah
    movzx  a, al

    lea    a, [x+a*4]
    lea    b, [x+b*4]
    lea    c, [x+c*4]
    lea    d, [x+d*4]

    ; load ecx with rotate values
    ; 16, 12, 8, 7
    mov    ecx, 0120D0907h
s20_q0:
    ; x[b] ^= ROTL32((x[a] + x[d]), 7);
    mov    t0, [a]           ; ebp=x[a]
    add    t0, [d]
    rol    t0, cl
    xor    [b], t0
    
    xchg   cl, ch
    
    mov    t0, [b]           ; ebp=x[b]
    add    t0, [a]
    rol    t0, cl
    xor    [c], t0
    
    xchg   a, c
    xchg   b, d
    
    shr    ecx, 16
    jnz    s20_q0
    
    popad
    ret
  
; void s20_corex (s20_ctx *ctx, void *in, uint32_t len)
; do not call directly
; expects state in ebx, length in eax, input in edx
_s20_corex:
s20_corex:
    pushad

    ; allocate 64-bytes local space, x
    push   64
    pop    ecx
    sub    esp, ecx
    
    ; copy state to x
    mov    edi, esp
    mov    esi, ebx
    rep    movsb

    ; move x into edi
    mov    edi, esp
    push   eax
    push   20/2  ; 20 rounds
    pop    ebp
s20_c0:
    ; load indexes
    call   s20_c1
    dw 0C840h, 01D95h, 062EAh, 0B73Fh
    dw 03210h, 04765h, 098BAh, 0EDCFh
s20_c1:
    pop    esi  ; pointer to indexes
    mov    cl, 8
s20_c2:
    lodsw
    call   QUARTERROUND
    loop   s20_c2
    dec    ebp
    jnz    s20_c0
    
    ; add state to x
    mov    cl, 16
s20_c3:
    mov    eax, [ebx+ecx*4-4]
    add    [edi+ecx*4-4], eax
    loop   s20_c3

    ; xor input with x
    pop    ecx               ; ecx=len
s20_c4:
    mov    al, byte[edi+ecx-1]
    xor    byte[edx+ecx-1], al
    loop   s20_c4
    
    ; update block counter
    stc
    adc    dword[ebx+8*4], ecx
    adc    dword[ebx+9*4], ecx
    
    ; free stack
    popad
    popad
    ; restore registers
    popad
    ret

Encryption and Decryption

The same function is used to encrypt/decrypt providing the same key and nonce are used during key setup.

// encrypt stream of bytes
void s20_crypt (s20_ctx *c, void *in, uint32_t len) 
{
    uint32_t r;
    uint8_t  *p=(uint8_t*)in;
    
    while (len) {
      r=(len>64) ? 64 : len;
      s20_core(c, p, r);
      len -= r;
      p += r;
    }
}
_s20_cryptx:
s20_cryptx:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   ebx, eax
    lodsd
    xchg   edx, eax
    lodsd
    xchg   ecx, eax
    jecxz  s20_l1            ; exit if len==0
    xor    eax, eax          ; r=0
s20_l0:
    mov    al, 64
    cmp    ecx, eax          ; eax=len < 64 ? len : 64
    cmovb  eax, ecx
    call   s20_corex
    add    edx, eax          ; p += r;
    sub    ecx, eax          ; len -= r;
    jnz    s20_l0
s20_l1:
    popad
    ret

Summary

architecture size
x86 245
x64 n/a

For source code and any future updates, please refere to code here.

Posted in cryptography, encryption, programming, security | Tagged , , , , | 1 Comment