## CubeMAC128 Message Authentication Code

### Introduction

CubeMAC128 is a cryptographic Message Authentication Code (MAC) designed for packet authentication that was proposed in 2010 by mathematician and cryptographer Daniel J. Bernstein.

The CubeMAC proposal was a response to NIST's concerns about using CubeHash as a MAC function for small messages. The MAC parameters are 16 initial rounds, 16 rounds per 32-byte block, and 32 final rounds. The recommended key length is 512 bits (64 bytes).

You can find a detailed discussion in the note "CubeHash parameter tweak: 10 times smaller MAC overhead".

### Permutation Function

Like other designs by Dan, the permutation used to provide confusion and diffusion uses only Add-Rotate-Xor (ARX) operations, which makes it suitable for many different architectures.

The following is taken from reference code.

```static void transform(hashState *state)
{
/*
 * Reference round function: applies CUBEHASH_ROUNDS rounds in place to
 * state->x[0..31], using only 32-bit addition, rotation, XOR and word
 * swaps. x[0..15] is treated as the low half and x[16..31] as the high
 * half. Each round consists of two half-rounds that differ only in the
 * rotation amount (7, then 11) and the swap masks (8 and 2, then 4 and 1).
 *
 * NOTE(review): CUBEHASH_ROUNDS, ROTATE, hashState and crypto_uint32 are
 * defined outside this excerpt; ROTATE is presumably a 32-bit left
 * rotation -- confirm against the reference headers.
 */
int i;
int r;
crypto_uint32 y[16]; /* scratch buffer used to reorder 16 words */

for (r = 0;r < CUBEHASH_ROUNDS;++r) {
/* --- first half-round --- */
for (i = 0;i < 16;++i) state->x[i + 16] += state->x[i]; /* high += low */
for (i = 0;i < 16;++i) y[i ^ 8] = state->x[i]; /* low words, index bit 3 flipped */
for (i = 0;i < 16;++i) state->x[i] = ROTATE(y[i],7); /* write back rotated by 7 */
for (i = 0;i < 16;++i) state->x[i] ^= state->x[i + 16]; /* low ^= high */
for (i = 0;i < 16;++i) y[i ^ 2] = state->x[i + 16]; /* high words, index bit 1 flipped */
for (i = 0;i < 16;++i) state->x[i + 16] = y[i]; /* write reordered high words back */
/* --- second half-round: same steps with masks 4 and 1, rotation 11 --- */
for (i = 0;i < 16;++i) state->x[i + 16] += state->x[i];
for (i = 0;i < 16;++i) y[i ^ 4] = state->x[i];
for (i = 0;i < 16;++i) state->x[i] = ROTATE(y[i],11);
for (i = 0;i < 16;++i) state->x[i] ^= state->x[i + 16];
for (i = 0;i < 16;++i) y[i ^ 1] = state->x[i + 16];
for (i = 0;i < 16;++i) state->x[i + 16] = y[i];
}
}
```

The changes below are meant to reduce code size; any loss of performance is an accepted side effect rather than a goal. Instead of counting the variable i upwards in steps of one, it is set to 15 and counted down until it is less than zero.

This should eliminate a CMP opcode, because the loop can rely on the sign flag (SF) set by the decrement to indicate when to end.

```// permutation function
void permute(cube_state *s)
{
    uint32_t tmp[16];   // scratch buffer used to reorder 16 words
    int      idx, half, rot, round;

    // 16 rounds. Each round is two half-rounds that differ only in the
    // rotation amount (7, then 11) and in the swap masks (8 and 2 for the
    // first half-round, 4 and 1 for the second).
    for (round = 16; round != 0; --round)
    {
        rot = 7;
        for (half = 2; half != 0; --half)
        {
            const int low_mask  = half << 2;  // 8, then 4
            const int high_mask = half;       // 2, then 1

            // high half += low half
            for (idx = 15; idx >= 0; idx--) s->w[16 + idx] += s->w[idx];
            // reorder the low half through tmp, then rotate it back in place
            for (idx = 15; idx >= 0; idx--) tmp[idx ^ low_mask] = s->w[idx];
            for (idx = 15; idx >= 0; idx--) s->w[idx] = ROTL32(tmp[idx], rot);

            // low half ^= high half
            for (idx = 15; idx >= 0; idx--) s->w[idx] ^= s->w[16 + idx];
            // reorder the high half through tmp
            for (idx = 15; idx >= 0; idx--) tmp[idx ^ high_mask] = s->w[16 + idx];
            for (idx = 15; idx >= 0; idx--) s->w[16 + idx] = tmp[idx];

            rot += 4;   // 7 -> 11 for the second half-round
        }
    }
}
```

The assembly works similarly, except that it relies heavily on the LOOP instruction, together with PUSHAD/POPAD.

```; ==================================
; permutation function
;
; in:  edi = s (32 x 32-bit state words; w[0..15] low half, w[16..31] high half)
; out: state permuted in place (16 rounds)
;      all general-purpose registers preserved
;      CF = 0 (the caller uses CF as a two-pass loop flag)
;
; Assumes DF = 0 so the string instructions step forwards.
; Every inner loop is wrapped in PUSHAD/POPAD so that ecx (= 16), edi (= s),
; esi (= y), ebp (= j) and edx (= k) are intact for the next step.
;
    pushad                   ; preserve caller registers (incl. ebp, edi)
    sub    esp, 64           ; reserve y[16] on the stack
    mov    esi, esp          ; esi = y[16]
    ; for (n=16; n>0; n--)
    push   16
    pop    ecx
    ; for (k=7, j=2; j>0; k+=4, j--)
pm_l0:
    push   ecx               ; save n
    mov    cl, 16            ; ecx = 16 words per inner loop
    push   2
    pop    ebp               ; j=2
    push   7
    pop    edx               ; k=7
pm_l1:
    ; **************************
    ; for (i=15; i>=0; --i)
    ;   s->w[i + 16] += s->w[i];
    ; **************************
    pushad
pm_l2:
    mov    eax, [edi]
    add    [edi+64], eax     ; high word += low word
    scasd                    ; edi += 4 (comparison result is ignored)
    loop   pm_l2
    popad
    ; **************************
    ; for (i=15; i>=0; --i)
    ;   y[i ^ (j*4)] = s->w[i];
    ; **************************
    pushad
    shl    ebp, 2            ; ebp = j*4 (j itself is restored by popad)
pm_l3:
    lea    ebx, [ecx-1]      ; ebx = i
    mov    eax, [edi+ebx*4]
    xor    ebx, ebp
    mov    [esi+ebx*4], eax
    loop   pm_l3
    popad
    ; **************************
    ; for (i=15; i>=0; --i)
    ;   s->w[i] = ROTL32(y[i], k);
    ; **************************
    pushad
    xchg   ecx, edx          ; cl = k, edx = 16 (loop counter)
pm_l4:
    lodsd                    ; eax = y[i]
    rol    eax, cl
    stosd                    ; s->w[i] = eax
    dec    edx
    jnz    pm_l4
    popad
    ; **************************
    ; for (i=15; i>=0; --i)
    ;   s->w[i] ^= s->w[i + 16];
    ; **************************
    pushad
pm_l5:
    mov    eax, [edi]
    xor    eax, [edi+64]
    stosd
    loop   pm_l5
    popad
    ; **************************
    ; for (i=15; i>=0; --i)
    ;   y[i ^ j] = s->w[i + 16];
    ; **************************
    pushad
pm_l6:
    lea    ebx, [ecx-1]      ; ebx = i
    mov    eax, [edi+ebx*4+64]
    xor    ebx, ebp
    mov    [esi+ebx*4], eax
    loop   pm_l6
    popad
    ; **************************
    ; for (i=15; i>=0; --i)
    ;   s->w[i + 16] = y[i];
    ; **************************
    pushad
    add    edi, 64           ; destination is the high half, &s->w[16]
    rep    movsd             ; copy 16 dwords from y
    popad

    add    edx, 4            ; k += 4 (7 -> 11); no carry, so CF = 0
    dec    ebp               ; j-- (DEC leaves CF untouched)
    jnz    pm_l1

    pop    ecx               ; restore n
    loop   pm_l0             ; LOOP does not modify flags

    add    esp, 64           ; release y[16]; no carry, CF stays 0
    popad                    ; POPAD does not modify flags
    ret
```

### Sponge Function

We absorb 32-byte blocks of data into the state before applying the permutation function.

```// absorb data into the state
// XOR `len` bytes from `msg` into the first 32 bytes of the state, calling
// permute() each time a full 32-byte block has been absorbed.
//
// s    state to update
// msg  input bytes (read only); not dereferenced when len is 0
// len  number of input bytes
//
// Returns the number of bytes (0..31) absorbed into the current, not yet
// permuted block. Absorption always starts at offset 0 of the block.
uint32_t absorb(cube_state *s,
                const void *msg, uint32_t len)
{
    // No cast needed from void*, and the const qualifier is preserved.
    const uint8_t *p = msg;
    uint32_t       i, idx = 0;

    for (i = 0; i < len; i++) {
        s->b[idx++] ^= p[i];
        if (idx == 32) {
            // block is full: mix it into the state and start a new one
            permute(s);
            idx = 0;
        }
    }
    return idx;
}
```

We inline this to minimize the overall size of code.

```;
; ==================================
; absorb data into state
;
; Inline fragment (not a callable routine): it falls through from the code
; that loads ebx/ecx and ends by jumping back to cm_l0 or falling through.
;
; ebx = data
; ecx = len
; edi = s
; ebp = address of the permutation function (see "call ebp" below)
;
; on exit: eax = idx, the number of bytes absorbed into the current block;
;          dl, ebx and ecx are modified
absorb:
xor    eax, eax      ; idx = 0
jecxz  abs_l1        ; exit if len == 0
abs_l0:
mov    dl, [ebx]
xor    [edi+eax], dl ; s->b[idx] ^= *data
inc    eax           ; idx++
inc    ebx           ; data++
cmp    al, 32        ; absorbed block?
; LOOPNE decrements ecx without changing flags, so ZF below is still the
; result of the cmp above.
loopne abs_l0        ; while (al != 32 && ecx != 0)
jne    abs_l1        ; if (al != 32 && ecx == 0) goto abs_l1
call   ebp           ; permute(s)
jmp    absorb        ; keep going
abs_l1:
; NOTE(review): popfd pairs with a pushfd at cm_l0, which is outside this
; excerpt; the saved CF acts as a one-bit pass counter -- confirm there.
popfd
cmc                  ; CF = !CF
jc     cm_l0         ; loop twice
```

### MAC Function

This is intended purely for small messages, for example authenticating network packets. It takes a 512-bit key and a message, and returns a 128-bit MAC.

The length of the message shouldn't exceed 4GB, but that should be obvious. I was naturally thinking about packets of 512 bytes or less 🙂

```// cube message authentication code
// Compute a 128-bit tag over `msg` under the key `mkey`.
//
// mkey    key bytes
// keylen  key length in bytes. The recommended key is 64 bytes. The value
//         should be a multiple of 32: absorb() restarts at offset 0 of the
//         block on every call, so a partial final key block would be
//         overlaid by the first message bytes.
// msg     message bytes
// msglen  message length in bytes
// tag     output buffer; exactly 16 bytes are written
void cube_macx(const void *mkey, uint32_t keylen,
               const void *msg, uint32_t msglen, void *tag)
{
    uint32_t   idx;
    cube_state s;

    // 1. initialize state
    memset(&s, 0, sizeof(cube_state));

    s.w[0] = 16; // 16-byte output
    s.w[1] = 32; // 32-byte block size
    s.w[2] = 16; // 16 rounds per block

    permute(&s);

    // 2. absorb key
    // Use the caller-supplied length instead of a hard-coded 64, so a
    // shorter key buffer is never over-read. This matches the assembly
    // version, which loads the key length from the argument list.
    absorb(&s, mkey, keylen);

    // 3. absorb message
    idx = absorb(&s, msg, msglen);

    // 4. absorb end bit
    s.b[idx] ^= 0x80;
    permute(&s);

    // 5. absorb final bit
    s.w[31] ^= 1;

    // two calls to permute() for the finalization
    permute(&s);
    permute(&s);

    // 6. return 128-bit tag
    memcpy(tag, s.b, 16);
}
```

So here’s the rest of the assembly code, with the permutation function stripped out.

```cube_macx:
_cube_macx:
call   ld_pm
; permutation function goes here
ld_pm:
pop    ebp           ; ebp = permute
lea    esi, [esp+32+4]
xor    eax, eax
xor    ecx, ecx
mov    cl, 128
sub    esp, ecx
; 1. initialize local state
; memset(&s, 0, sizeof(cube_state));
mov    edi, esp
rep    stosb

mov    edi, esp
push   edi
; s.w[0] = 16;
mov    al, 16
stosd
; s.w[1] = 32;
mov    al, 32
stosd
; s.w[2] = 16;
mov    al, 16
stosd
pop    edi
; permute(&s);
call   ebp
; 2. absorb key
; 3. absorb message
cm_l0:
pushfd
lodsd
xchg   ebx, eax        ; ebx = data
lodsd
xchg   ecx, eax        ; ecx = len
; ==================================
; absorb data into state
;
; ebx = data
; ecx = len
; edi = s
absorb:
xor    eax, eax      ; idx = 0
jecxz  abs_l1        ; exit if len == 0
abs_l0:
mov    dl, [ebx]
xor    [edi+eax], dl ; s->b[idx] ^= *data
inc    eax           ; idx++
inc    ebx           ; data++
cmp    al, 32        ; absorbed block?
loopne abs_l0        ; while (al != 32 && ecx != 0)
jne    abs_l1        ; if (al != 32 && ecx == 0) goto abs_l1
call   ebp           ; permute(s)
jmp    absorb        ; keep going
abs_l1:
popfd
cmc                  ; CF = !CF
jc     cm_l0         ; loop twice

; 4. absorb end bit
xor    byte[edi+eax], 0x80
call   ebp

; 5. absorb final bit
xor    byte[edi+31*4], 1
call   ebp
call   ebp

; 6. return 128-bit tag
lodsd
xchg   eax, edi        ; edi = tag
xchg   eax, esi        ; esi = s
mov    cl, 16
rep    movsb

; release stack
pop    eax