Chaskey Message Authentication Code

Introduction

Chaskey is a lightweight MAC algorithm optimised for 32-bit microcontrollers. It was designed by Nicky Mouha, Bart Mennink, Anthony Van Herrewege, Dai Watanabe, Bart Preneel and Ingrid Verbauwhede.

It is based on a 128-bit block cipher, the Chaskey cipher, which uses ARX operations and an Even-Mansour structure.

What follows is an implementation in C and x86 assembly optimized for size.

State structure

/* 128-bit Chaskey state, addressable either as 16 bytes or as four
   32-bit words. Both members overlay the same storage. */
typedef union state_t {
  uint8_t  b[16];   /* byte view  */
  uint32_t w[4];    /* word view  */
} state;

Key scheduling

/*
 * Expand a 16-byte key into three consecutive 16-byte values in `out`.
 *
 *   out : destination, at least 48 bytes, accessed as uint32_t words
 *   in  : 16-byte key
 *
 * Words 0..3 of `out` receive a copy of the key. Words 4..7 hold that
 * 128-bit value shifted left by one bit, and words 8..11 hold words 4..7
 * shifted left by one bit again. Whenever the bit shifted out of the top
 * word is set, 0x87 is XORed into the lowest word of the result.
 */
void chas_setkey(void *out, void *in)
{
  uint32_t *src  = (uint32_t*)out;
  uint32_t *stop = src + 8;

  memcpy (out, in, 16);

  /* two passes: src = &out[0] then src = &out[4] */
  for (; src != stop; src += 4)
  {
    uint32_t *dst   = src + 4;
    uint32_t  carry = src[3] >> 31;   /* bit leaving the 128-bit value */

    /* lowest word: shift, then fold in the reduction constant */
    dst[0] = (src[0] << 1) ^ (carry ? 0x87u : 0u);
    /* remaining words: shift and pull in the top bit of the word below */
    dst[1] = (src[1] << 1) | (src[0] >> 31);
    dst[2] = (src[2] << 1) | (src[1] >> 31);
    dst[3] = (src[3] << 1) | (src[2] >> 31);
  }
}

The assembly version could probably be optimized further.

; ---------------------------------------------------------------------
; chas_setkeyx(out, in) - x86 (32-bit) counterpart of chas_setkey.
;
; In:    [esp+4] = out, [esp+8] = in   (stack arguments, on entry)
; Out:   out[0..15]  = copy of the 16 bytes at `in`
;        out[16..31] = that 128-bit value shifted left one bit
;        out[32..47] = the previous value shifted left one bit again
;        In each shift, 0x87 is XORed into the low byte when bit 31 of
;        the top word was set.
; Regs:  all general-purpose registers are preserved (pushad/popad).
; NOTE(review): movsd/lodsd/stosd are used without CLD, so this assumes
;        the direction flag is clear on entry - confirm with callers.
; ---------------------------------------------------------------------
; Working registers for the four words of the value being doubled.
%define k0 ebp
%define k1 ebx
%define k2 ecx
%define k3 edx
    
chas_setkeyx:
_chas_setkeyx:
    pushad
    mov    edi, [esp+32+4]   ; edi = out (+32 skips the pushad frame)
    mov    esi, [esp+32+8]   ; esi = in
    push   edi               ; remember out; reloaded below as read ptr
    movsd                    ; copy 16 bytes in -> out ...
    movsd
    movsd
    movsd                    ; ... leaving edi = out+16 (write ptr)
    pop    esi               ; esi = out (read ptr)
    clc                      ; CF = 0; CF doubles as a 2-pass loop counter
sk_l0:
    pushfd                   ; save CF: the body below clobbers flags

    ; load the four source words, advancing esi by 16
    lodsd
    xchg   eax, k0
    lodsd
    xchg   eax, k1
    lodsd
    xchg   eax, k2
    lodsd
    xchg   eax, k3
 
    ; word 0: (k0 << 1) ^ (0x87 if bit 31 of k3 is set)
    push   k3                ; the carry trick below destroys edx (k3)
    lea    eax, [k0+k0]      ; eax = k0 << 1, flags untouched
    add    k3, k3            ; CF = bit 31 of k3
    sbb    dl, dl            ; dl = CF ? 0xFF : 0x00
    and    dl, 0x87          ; dl = CF ? 0x87 : 0x00
    xor    al, dl            ; fold the constant into the low byte
    pop    k3                ; restore k3 for the last word
    stosd
    
    ; word 1: (k1 << 1) | (k0 >> 31)
    lea    eax, [k1+k1]
    shr    k0, 31
    or     eax, k0
    stosd
    
    ; word 2: (k2 << 1) | (k1 >> 31)
    lea    eax, [k2+k2]
    shr    k1, 31
    or     eax, k1
    stosd
    
    ; word 3: (k3 << 1) | (k2 >> 31)
    lea    eax, [k3+k3]
    shr    k2, 31
    or     eax, k2
    stosd

    popfd                    ; restore the loop-counter CF
    cmc                      ; pass 1: 0 -> 1, pass 2: 1 -> 0
    jc     sk_l0             ; repeat once; esi/edi already point at the
                             ; next source and destination blocks
    
    popad
    ret

Key whitening

/*
 * XOR `len` bytes from `in` into the start of the state's byte view.
 *
 *   out : state to modify in place
 *   in  : source bytes
 *   len : number of bytes to combine; nothing happens when len <= 0
 */
void chas_xor(state *out, const void *in, int len) {
  const uint8_t *src = (const uint8_t*)in;
  uint8_t       *dst = out->b;

  for (; len > 0; len--) {
    *dst++ ^= *src++;
  }
}
; ---------------------------------------------------------------------
; chas_xor - XOR ecx bytes from [esi] into [edi], byte by byte.
;
; In:    ecx = length in bytes (0 is allowed and does nothing)
;        esi = input
;        edi = v (destination, modified in place)
; Regs:  all general-purpose registers are preserved (pushad/popad);
;        flags are modified.
; NOTE(review): cmpsb advances the pointers only if the direction flag
;        is clear - confirm with callers.
; ---------------------------------------------------------------------
; ecx = length
; esi = input
; edi = v   
chas_xor:
    pushad
    jecxz  cx_l1             ; skip when length is 0 (loop would wrap ecx)
cx_l0:    
    mov    al, [esi]
    xor    [edi], al         ; v[i] ^= input[i]
    cmpsb                    ; used only to step esi and edi by one byte;
                             ; the comparison result is ignored
    loop   cx_l0
cx_l1:    
    popad
    ret

Permutation function

This is derived from the SipHash permutation function, which is in turn derived from the ChaCha and Salsa20 stream ciphers.

/*
 * Apply 12 rounds of the add/rotate/xor permutation to the four-word
 * state `v`, in place.
 */
void chas_permute(uint32_t v[])
{
  uint32_t a = v[0];
  uint32_t b = v[1];
  uint32_t c = v[2];
  uint32_t d = v[3];
  int      round;

  for (round = 0; round < 12; round++)
  {
    /* first half-round: mix (a,b) and (c,d) */
    a += b;
    b = ROTL32(b, 5);
    b ^= a;
    a = ROTL32(a, 16);

    c += d;
    d = ROTL32(d, 8);
    d ^= c;

    /* second half-round: mix (a,d) and (c,b) */
    a += d;
    d = ROTL32(d, 13);
    d ^= a;

    c += b;
    b = ROTL32(b, 7);
    b ^= c;
    c = ROTL32(c, 16);
  }

  v[0] = a;
  v[1] = b;
  v[2] = c;
  v[3] = d;
}

The assembly code is straightforward, but perhaps there’s room to optimize further.

; ---------------------------------------------------------------------
; chas_permute - apply 12 rounds of the permutation to the 16-byte
; state at [edi], in place.
;
; In:    edi = v (four 32-bit words)
;        ecx = 16; only cl is rewritten below, so the upper 24 bits of
;              ecx must already be zero for the loop to run 12 times
; Regs:  all general-purpose registers are preserved (pushad/popad);
;        flags are modified.
; NOTE(review): lodsd/stosd assume the direction flag is clear -
;        confirm with callers.
; ---------------------------------------------------------------------
; Registers holding the four state words during the rounds.
%define v0 eax    
%define v1 edx    
%define v2 ebp    
%define v3 ebx
    
; ecx = 16    
; edi = v
chas_permute:
    pushad
    mov    cl, 12            ; round counter for `loop`
    mov    esi, edi          ; read from v via esi, write back via edi
    lodsd
    xchg   eax, v3           ; park v[0] in ebx; eax is needed by lodsd
    lodsd
    xchg   eax, v1           ; v1 = v[1]
    lodsd
    xchg   eax, v2           ; v2 = v[2]
    lodsd
    xchg   eax, v3           ; v0 (eax) = v[0], v3 (ebx) = v[3]
cp_l0:
    add    v0, v1            ; v[0] += v[1];
    rol    v1, 5             ; v[1] = ROTL(v[1], 5);
    xor    v1, v0            ; v[1] ^= v[0];
    rol    v0, 16            ; v[0] = ROTL(v[0], 16);
    add    v2, v3            ; v[2] += v[3]; 
    rol    v3, 8             ; v[3] = ROTL(v[3], 8); 
    xor    v3, v2            ; v[3] ^= v[2]; 
    add    v0, v3            ; v[0] += v[3];
    rol    v3, 13            ; v[3] = ROTL(v[3], 13);
    xor    v3, v0            ; v[3] ^= v[0];
    add    v2, v1            ; v[2] += v[1];
    rol    v1, 7             ; v[1] = ROTL(v[1],  7);
    xor    v1, v2            ; v[1] ^= v[2];
    rol    v2, 16            ; v[2] = ROTL(v[2], 16);
    loop   cp_l0
    stosd                    ; store v[0]
    xchg   eax, v1
    stosd                    ; store v[1]
    xchg   eax, v2
    stosd                    ; store v[2]
    xchg   eax, v3
    stosd                    ; store v[3]
    popad   
    ret

MAC generation

/*
 * Compute a 16-byte tag over `msg`.
 *
 *   tag    : receives the 16-byte result
 *   msg    : message bytes
 *   msglen : message length in bytes (0 is allowed)
 *   key    : 48 bytes read as three 16-byte values: key[0..15] seeds the
 *            state, key[16..31] whitens a final block of exactly 16
 *            bytes, key[32..47] whitens a shorter (padded) final block
 */
void chas_mac (uint8_t *tag, 
    uint8_t *msg, uint32_t msglen, uint8_t *key) 
{
  state          v;
  const uint8_t *last_key;

  // start from the first 16 bytes of key
  memcpy(v.b, key, 16);

  // absorb every full block that is not the final one
  while (msglen > 16)
  {
    chas_xor(&v, msg, 16);
    chas_permute(v.w);

    msg    += 16;
    msglen -= 16;
  }

  // absorb the final block (0..16 bytes remain)
  chas_xor(&v, msg, (int)msglen);

  if (msglen < 16) {
    // short block: add padding bit, use the third 16-byte key value
    v.b[msglen] ^= 0x01;
    last_key = key + 32;
  } else {
    // exact block: no padding, use the second 16-byte key value
    last_key = key + 16;
  }

  // pre-whiten, permute, post-whiten
  chas_xor(&v, last_key, 16);
  chas_permute(v.w);
  chas_xor(&v, last_key, 16);

  // return tag
  memcpy(tag, v.b, 16);
}

Assembly code

; ---------------------------------------------------------------------
; chas_macx(tag, msg, msglen, key) - x86 (32-bit) counterpart of chas_mac.
;
; In:    four stack arguments, read in order starting at [esp+4]:
;        tag ptr, msg ptr, msglen, key ptr
; Out:   16 bytes written to tag
; Regs:  all general-purpose registers are preserved (outer
;        pushad/popad).
; Stack: a second pushad reserves 32 bytes; the first 16 are used as
;        the state v.
; Relies on chas_xor and chas_permute preserving every general-purpose
; register, so ecx, edx, esi, edi and ebp stay valid across the calls.
; NOTE(review): lodsd/movsb assume the direction flag is clear -
;        confirm with callers.
; ---------------------------------------------------------------------
; chaskey    
chas_macx:
_chas_macx:
    pushad
    lea    esi, [esp+32+4]   ; esi -> first argument (past pushad frame)
    pushad                   ; allocate 32 bytes
    mov    edi, esp          ; edi = v
    lodsd
    xchg   eax, ebp          ; ebp = tag ptr
    lodsd
    xchg   eax, ebx          ; ebx = msg ptr
    lodsd
    xchg   edx, eax          ; edx = msglen
    lodsd
    xchg   eax, esi          ; esi = key

    ; memcpy(v, &key[0], 16);
    push   16
    pop    ecx               ; ecx = 16
    push   edi               ; save v
    rep    movsb             ; leaves ecx = 0 and esi = &key[16]
    pop    edi               ; restore v
    push   esi               ; save &key[16]
    mov    esi, ebx          ; esi = msg    
    ; absorb message
cm_l0:
    mov    cl, 16            ; ecx = 16 (upper bits are already zero)
    ; len = (msglen < 16) ? msglen : 16;
    cmp    edx, ecx
    cmovb  ecx, edx
    
    ; chas_xor(&v, msg, len);
    call   chas_xor
    mov    cl, 16            ; back to ecx = 16 for the compare below
    
    ; if (msglen <= 16)
    cmp    edx, ecx
    jbe    cm_l2             ; final block: leave the loop
    
    call   chas_permute      ; entered with ecx = 16, as it expects

    ; msglen -= 16
    sub    edx, ecx
    ; msg += 16
    add    esi, ecx
    
    jmp    cm_l0
cm_l2:    
    pop    esi               ; esi = &key[16]; pop leaves flags intact,
                             ; so je still tests the cmp edx, ecx above
    ; if (msglen < 16)
    je     cm_l4             ; msglen == 16: no padding, use &key[16]
    ; v.b[msglen] ^= 0x01;
    xor    byte[edi+edx], 0x01
    ; load key + 32
    add    esi, ecx          ; esi = &key[32]
cm_l4:
    ; chas_xor(v, key, 16);
    call   chas_xor          ; pre-whiten (ecx is still 16)
    ; chas_permute(v);
    call   chas_permute
    ; chas_xor(v, key, 16);
    call   chas_xor          ; post-whiten with the same key block
    
    ; memcpy(tag, v, 16);
    mov    esi, edi
    mov    edi, ebp
    rep    movsb             ; ecx is still 16 here
        
    popad                    ; release the 32-byte scratch area
    popad                    ; restore the caller's registers
    ret

Summary

Compiled with MSVC using the /Os /O2 flags, the C code comes to 346 bytes, while the hand-written x86 assembly is currently 234 bytes. The underlying block cipher doesn’t require key scheduling, so the code could be reduced much further if you only needed the block cipher itself. See sources here for future updates.

Advertisements
This entry was posted in assembly, cryptography, encryption, programming, security and tagged , , , , . Bookmark the permalink.

2 Responses to Chaskey Message Authentication Code

  1. Pingback: Asmcodes: Chaskey-LTS Block Cipher | x86 crypto

  2. Pingback: Asmcodes: Light Message Authentication Code (LightMAC) | x86 crypto

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s