CubeMAC128 Message Authentication Code

Introduction

CubeMAC128 is a cryptographic Message Authentication Code (MAC) designed for packet authentication that was proposed in 2010 by mathematician and cryptographer Daniel J. Bernstein.

The CubeMAC proposal was in response to NIST concerns about using CubeHash as a MAC function for small messages. The parameters for the MAC are 16 initial rounds, a 32-byte block and 32 final rounds. The recommended key length is 512 bits (64 bytes).

You can find a detailed discussion about it in CubeHash parameter tweak: 10 times smaller MAC overhead

Permutation Function

Like other designs by Dan, the pseudo-random-function used to provide properties of confusion and diffusion only uses Add-Rotate-Xor operations which makes it suitable for many different architectures.

The following is taken from reference code.

/*
 * Apply CUBEHASH_ROUNDS rounds to the 32-word state in place.
 *
 * state->x[0..15] is treated as the lower half and state->x[16..31] as the
 * upper half; y[] is scratch space used to permute words within a half.
 * ROTATE and CUBEHASH_ROUNDS are defined outside this excerpt
 * (ROTATE is presumably a 32-bit left rotate -- confirm against the
 * reference source).
 */
static void transform(hashState *state)
{
  int i;
  int r;
  crypto_uint32 y[16];

  for (r = 0;r < CUBEHASH_ROUNDS;++r) {
    /* first half-round: upper[i] += lower[i] */
    for (i = 0;i < 16;++i) state->x[i + 16] += state->x[i];
    /* lower[i] = ROTATE(old lower[i ^ 8], 7), done via the scratch array */
    for (i = 0;i < 16;++i) y[i ^ 8] = state->x[i];
    for (i = 0;i < 16;++i) state->x[i] = ROTATE(y[i],7);
    /* lower[i] ^= upper[i] */
    for (i = 0;i < 16;++i) state->x[i] ^= state->x[i + 16];
    /* upper[i] = old upper[i ^ 2] */
    for (i = 0;i < 16;++i) y[i ^ 2] = state->x[i + 16];
    for (i = 0;i < 16;++i) state->x[i + 16] = y[i];
    /* second half-round: same steps with 4 / 11 / 1 in place of 8 / 7 / 2 */
    for (i = 0;i < 16;++i) state->x[i + 16] += state->x[i];
    for (i = 0;i < 16;++i) y[i ^ 4] = state->x[i];
    for (i = 0;i < 16;++i) state->x[i] = ROTATE(y[i],11);
    for (i = 0;i < 16;++i) state->x[i] ^= state->x[i + 16];
    for (i = 0;i < 16;++i) y[i ^ 1] = state->x[i + 16];
    for (i = 0;i < 16;++i) state->x[i + 16] = y[i];
  }
}

The changes below are meant to reduce code size; any cost in performance is a side effect rather than a goal. Rather than counting variable i upwards in steps of one, it is set to 15 and counted down until it is less than zero.

This should eliminate a CMP instruction, because the loop can rely on the sign flag (SF) to indicate when to end.

// permutation function
//
// Runs 16 rounds over the 32-word state s->w, in place. Each round is made
// of two half-rounds that differ only in their constants:
//   half = 2 : rotate by 7,  lower swap distance 8, upper swap distance 2
//   half = 1 : rotate by 11, lower swap distance 4, upper swap distance 1
// Every index loop starts at 15 and counts down until it drops below zero.
void permute(cube_state *s)
{
    uint32_t tmp[16];
    int      idx, half, rot, rounds;

    for (rounds = 16; rounds > 0; rounds--)
    {
      rot = 7;
      for (half = 2; half > 0; half--)
      {
        // upper[idx] += lower[idx], and stash lower[idx] at its swapped slot
        idx = 15;
        do {
          s->w[idx + 16] += s->w[idx];
          tmp[idx ^ (half * 4)] = s->w[idx];
        } while (--idx >= 0);

        // lower[idx] = rotated copy of the swapped lower word
        idx = 15;
        do {
          s->w[idx] = ROTL32(tmp[idx], rot);
        } while (--idx >= 0);

        // lower[idx] ^= upper[idx], and stash upper[idx] at its swapped slot
        idx = 15;
        do {
          s->w[idx] ^= s->w[idx + 16];
          tmp[idx ^ half] = s->w[idx + 16];
        } while (--idx >= 0);

        // write the swapped upper half back
        idx = 15;
        do {
          s->w[idx + 16] = tmp[idx];
        } while (--idx >= 0);

        rot += 4;
      }
    }
}

The assembly works similarly, except that it uses the LOOP instruction quite a lot, together with PUSHAD/POPAD to save and restore registers around each inner loop.

; ==================================
      ; permutation function
      ;
      ; edi = s
      ;
      ; Applies 16 rounds to the 32-dword state at edi, in place.
      ; All general-purpose registers are restored before returning.
      ; 64 bytes of stack serve as the scratch array y[16].
      ; The string instructions below assume DF=0 (forward direction).
      ; On return CF=0 (see the note at the end of the routine).
      pushad                   ; save registers
      pushad                   ; allocate 32-bytes
      pushad                   ; allocate 32-bytes
      mov    esi, esp          ; esi = y[16]
      ; for (n=16; n>0; n--)
      push   16
      pop    ecx
      ; for (k=7, j=2; j>0; k+=4, j--)
pm_l0:
      push   ecx               ; save n
      mov    cl, 16            ; ecx = 16, element count for the inner loops
                               ; (upper bits of ecx are zero because n <= 16)
      push   2
      pop    ebp               ; j=2
      push   7
      pop    edx               ; k=7
pm_l1:
      ; each inner loop is wrapped in pushad/popad so that
      ; ecx, edx, ebp, esi and edi are unchanged for the next one
      ; **************************
      ; for (i=15; i>=0; --i)
      ;   s->w[i + 16] += s->w[i];
      ; **************************
      pushad
pm_l2:
      mov    eax, [edi]
      add    [edi+64], eax
      scasd                    ; edi += 4 (comparison result is ignored)
      loop   pm_l2
      popad
      ; **************************
      ; for (i=15; i>=0; --i)
      ;   y[i ^ (j*4)] = s->w[i];
      ; **************************
      pushad
      shl    ebp, 2            ; ebp = j*4 (undone by popad)
pm_l3:
      lea    ebx, [ecx-1]      ; ebx = i, counting 15 down to 0
      mov    eax, [edi+ebx*4]
      xor    ebx, ebp
      mov    [esi+ebx*4], eax
      loop   pm_l3
      popad
      ; **************************
      ; for (i=15; i>=0; --i)
      ;   s->w[i] = ROTL32(y[i], k);
      ; **************************
      ; (walks upwards from index 0; the iterations are independent)
      pushad
      xchg   ecx, edx          ; cl = k (rotate count), edx = 16 (counter)
pm_l4:
      lodsd                    ; eax = y[i]
      rol    eax, cl
      stosd                    ; s->w[i] = eax
      dec    edx
      jnz    pm_l4
      popad
      ; **************************
      ; for (i=15; i>=0; --i)
      ;   s->w[i] ^= s->w[i + 16];
      ; **************************
      pushad
pm_l5:
      mov    eax, [edi]
      xor    eax, [edi+64]
      stosd                    ; store result and advance edi
      loop   pm_l5
      popad
      ; **************************
      ; for (i=15; i>=0; --i)
      ;   y[i ^ j] = s->w[i + 16];
      ; **************************
      pushad
pm_l6:
      lea    ebx, [ecx-1]      ; ebx = i, counting 15 down to 0
      mov    eax, [edi+ebx*4+64]
      xor    ebx, ebp          ; ebp = j here
      mov    [esi+ebx*4], eax
      loop   pm_l6
      popad
      ; **************************
      ; for (i=15; i>=0; --i)
      ;   s->w[i + 16] = y[i];
      ; **************************
      pushad
      add    edi, 64           ; edi = &s->w[16]
      rep    movsd             ; copy 16 dwords from y
      popad

      add    edx, 4          ; k += 4 (never carries, so CF = 0)
      dec    ebp             ; j-- (DEC leaves CF unchanged)
      jnz    pm_l1

      pop    ecx             ; restore n
      loop   pm_l0           ; next round; LOOP does not modify flags,
                             ; so CF is still 0 from "add edx, 4" above.
                             ; The caller's pushfd/cmc two-pass trick
                             ; relies on CF=0 here.

      popad                  ; release 32-bytes
      popad                  ; release 32-bytes
      popad                  ; restore registers
      ret

Sponge Function

We absorb 32-byte blocks of data into the state before applying the permutation function.

// absorb data into the state
//
// XORs len bytes from msg into the first 32 bytes of the state, running
// permute() each time a full 32-byte block has been absorbed.
// Returns the number of bytes (0..31) sitting in the current partial block.
uint32_t absorb(cube_state *s, 
    const void *msg, uint32_t len)
{
    const uint8_t *in  = (const uint8_t*)msg;
    uint32_t       pos = 0;

    for (; len != 0; len--) {
      s->b[pos] ^= *in++;
      pos++;
      if (pos != 32) continue;
      // block is full: mix it in and start a new one
      permute(s);
      pos = 0;
    }
    return pos;
}

We inline this to minimize the overall size of code.

;
      ; ==================================
      ; absorb data into state
      ;
      ; ebx = data
      ; ecx = len
      ; edi = s
      ;
      ; This is a fragment of the full listing: ebp holds the address of
      ; the permutation function, cm_l0 is the label of the enclosing
      ; two-pass loop, and the popfd below pairs with the pushfd at cm_l0.
      ; On exit eax = idx, the number of bytes in the partial block.
absorb:
      xor    eax, eax      ; idx = 0
      jecxz  abs_l1        ; exit if len == 0
abs_l0:
      mov    dl, [ebx]
      xor    [edi+eax], dl ; s->b[idx] ^= *data
      inc    eax           ; idx++
      inc    ebx           ; data++
      cmp    al, 32        ; absorbed block?
      loopne abs_l0        ; while (al != 32 && ecx != 0)
      jne    abs_l1        ; if (al != 32 && ecx == 0) goto abs_l1
      call   ebp           ; permute(s)
      jmp    absorb        ; keep going
abs_l1:
      popfd                ; restore the flags saved on entry to this pass
      cmc                  ; CF = !CF 
      jc     cm_l0         ; loop twice
                           ; (CF goes 0 -> 1 after the first pass: jump;
                           ;  1 -> 0 after the second pass: fall through)

MAC Function

This is purely intended for small messages; authenticating network packets, for example. It takes a 512-bit key and a message, and returns a 128-bit MAC.

The length of the message shouldn’t exceed 4GB, but that should be obvious. I was naturally thinking about packets of 512 bytes or less 🙂

// cube message authentication code
//
// mkey, keylen : key bytes and their length (64 bytes is the recommended size)
// msg, msglen  : message bytes and their length
// tag          : receives the 16-byte (128-bit) MAC
void cube_macx(const void *mkey, uint32_t keylen,
    const void *msg, uint32_t msglen, void *tag)
{
    uint32_t   idx;
    cube_state s;

    // 1. initialize state
    memset(&s, 0, sizeof(cube_state));

    s.w[0] = 16; // 16-byte output
    s.w[1] = 32; // 32-byte block size
    s.w[2] = 16; // 16 rounds per block

    permute(&s);

    // 2. absorb key
    // Use the caller-supplied length instead of a hard-coded 64: this
    // avoids reading past a shorter key buffer and matches the assembly
    // implementation, which takes the key length from the stack.
    // The returned index is not used; the message is absorbed starting at
    // offset 0 of the block, as before.
    absorb(&s, mkey, keylen);

    // 3. absorb message
    idx = absorb(&s, msg, msglen);

    // 4. absorb end bit
    s.b[idx] ^= 0x80;
    permute(&s);

    // 5. absorb final bit
    s.w[31] ^= 1;

    // 32 final rounds = two 16-round permutations
    permute(&s);
    permute(&s);

    // 6. return 128-bit tag
    memcpy(tag, s.b, 16);
}

So here’s the rest of assembly code with permutation function stripped out.

; cube_macx(mkey, keylen, msg, msglen, tag)
; Arguments are read from the stack in that order, as five consecutive
; dwords (assumes a cdecl-style caller matching the C prototype).
; All general-purpose registers are preserved (pushad/popad).
cube_macx:
_cube_macx:
      pushad
      call   ld_pm         ; pushes the address of the code that follows
      ; permutation function goes here
ld_pm:
      pop    ebp           ; ebp = permute
      lea    esi, [esp+32+4] ; esi = &first argument
                             ; (skip 32 bytes of pushad + return address)
      xor    eax, eax
      xor    ecx, ecx
      mov    cl, 128       ; ecx = 128, size of the local state
      sub    esp, ecx
      ; 1. initialize local state
      ; memset(&s, 0, sizeof(cube_state));
      mov    edi, esp
      rep    stosb         ; al = 0; leaves ecx = 0

      mov    edi, esp
      push   edi           ; keep &s across the stosd sequence
      ; s.w[0] = 16;
      mov    al, 16        ; upper 24 bits of eax are zero
      stosd
      ; s.w[1] = 32;
      mov    al, 32
      stosd
      ; s.w[2] = 16;
      mov    al, 16
      stosd
      pop    edi           ; edi = s
      ; permute(&s);
      call   ebp           ; returns with CF = 0
      ; 2. absorb key
      ; 3. absorb message
      ; The block from cm_l0 to "jc cm_l0" runs twice: the first pass
      ; consumes (mkey, keylen), the second (msg, msglen). CF, saved with
      ; pushfd and toggled with cmc, serves as the one-bit pass counter.
cm_l0:
      pushfd      
      lodsd
      xchg   ebx, eax        ; ebx = data
      lodsd
      xchg   ecx, eax        ; ecx = len
      ; ==================================
      ; absorb data into state
      ;
      ; ebx = data
      ; ecx = len
      ; edi = s
absorb:
      xor    eax, eax      ; idx = 0
      jecxz  abs_l1        ; exit if len == 0
abs_l0:
      mov    dl, [ebx]
      xor    [edi+eax], dl ; s->b[idx] ^= *data
      inc    eax           ; idx++
      inc    ebx           ; data++
      cmp    al, 32        ; absorbed block?
      loopne abs_l0        ; while (al != 32 && ecx != 0)
      jne    abs_l1        ; if (al != 32 && ecx == 0) goto abs_l1
      call   ebp           ; permute(s)
      jmp    absorb        ; keep going
abs_l1:
      popfd                ; restore the flags saved at cm_l0
      cmc                  ; CF = !CF 
      jc     cm_l0         ; loop twice
      
      ; 4. absorb end bit
      xor    byte[edi+eax], 0x80 ; eax = idx left by the message pass
      call   ebp

      ; 5. absorb final bit
      xor    byte[edi+31*4], 1   ; low byte of s.w[31]
      call   ebp
      call   ebp

      ; 6. return 128-bit tag
      lodsd                  ; eax = fifth argument (tag)
      xchg   eax, edi        ; edi = tag
      xchg   eax, esi        ; esi = s
      mov    cl, 16          ; ecx = 16 (ecx is 0 after absorbing)
      rep    movsb

      ; release stack
      pop    eax             ; 4 bytes ...
      add    esp, 124        ; ... plus 124 = the 128 bytes of local state
      popad
      ret

Summary

The size of C code using /Os /O2 switches was approx. 380 bytes. The assembly code is currently 197 bytes which I doubt can be reduced much more.

I’ve documented Poly1305 here which is another MAC function designed by the same author.

If I had to choose a compact function for authentication of small packets, I’d probably pick CubeMAC instead although it hasn’t received as much scrutiny by the cryptographic community as Poly1305.

Advertisements
This entry was posted in assembly, cryptography, encryption, programming, security and tagged , , , , , . Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s