Rabbit Stream Cipher

Introduction

Rabbit is a stream cipher designed in 2003 by Martin Boesgaard, Mette Vesterager, Thomas Pedersen, Jesper Christiansen and Ove Scavenius. It was published in 2005 and selected as a software cipher for the eSTREAM portfolio in 2008. It’s designed for high performance in software implementations but is also known to work well in hardware environments. A more extensive description by the authors can be found in RFC 4503

Technically, Rabbit consists of a pseudorandom bitstream generator that takes a 128-bit key and a 64-bit initialization vector (IV) as input and generates a stream of 128-bit blocks. Encryption is performed by combining this output with the message, using the exclusive-OR operation. Decryption is performed in exactly the same way as encryption.

About the assembly code

This was a joint effort between Peter Ferrie and myself. It’s not as compact as Salsa20 or HC-256 that are also part the eSTREAM portfolio.

Optimization trick with PUSHAD/POPAD

Although this has nothing to do with the cipher itself, I think it’s worth noting. While writing the assembly code for RABBIT, I accidently “discovered” something that I didn’t realize was possible with the PUSHAD instruction. When PUSHAD is executed, it saves all the general purpose registers including ESP on the stack. The total stack space occupied for this operation is 32-bytes. What I didn’t know is that it’s possible to overwrite ESP and when POPAD is executed, the original value of ESP is still restored to its original value.

So what?” you may say.. but think about it. This is a neat way to allocate and release 32-bytes of memory without worrying about overwriting ESP. If the original value of ESP were not restored, only 12 bytes would be available, but that’s still useful for some code. Instead of using SUB, ADD or ENTER to allocate 32-bytes or more in future, consider using PUSHAD since it doesn’t appear to matter if ESP is overwritten. Try it for yourself with the following code:

;
    pushad                   ; save all registers
    xor    eax, eax          ; eax=0
    push   8                 ; overwrite 8 GP registers
    pop    ecx
    mov    edi, esp
    rep    stosd             ; memset(edi, 0, 32);
    popad                    ; like "magic", ESP is restored

I’ve updated other sources to use this “trick” and they all work fine. It is used here in RABBIT_setiv and RABBIT_crypt.

Key setup

void RABBIT_setkey(RABBIT_ctx *c, const void *key)
{
   uint32_t k0, k1, k2, k3, i;
   rabbit_blk *k=(rabbit_blk*)key;

   // Generate four subkeys
   k0 = k->d[0];
   k1 = k->d[1];
   k2 = k->d[2];
   k3 = k->d[3];

   // Generate initial state variables
   c->m.x[0] = k0;
   c->m.x[2] = k1;
   c->m.x[4] = k2;
   c->m.x[6] = k3;

   c->m.x[1] = U32V(k3<<16) | (k2>>16);
   c->m.x[3] = U32V(k0<<16) | (k3>>16);
   c->m.x[5] = U32V(k1<<16) | (k0>>16);
   c->m.x[7] = U32V(k2<<16) | (k1>>16);

   // Generate initial counter values
   c->m.c[0] = ROTL32(k2, 16);
   c->m.c[2] = ROTL32(k3, 16);
   c->m.c[4] = ROTL32(k0, 16);
   c->m.c[6] = ROTL32(k1, 16);

   c->m.c[1] = (k0&0xFFFF0000) | (k1&0xFFFF);
   c->m.c[3] = (k1&0xFFFF0000) | (k2&0xFFFF);
   c->m.c[5] = (k2&0xFFFF0000) | (k3&0xFFFF);
   c->m.c[7] = (k3&0xFFFF0000) | (k0&0xFFFF);

   // Clear carry bit
   c->m.carry = 0;

   // Iterate the system four times
   for (i=0; i<4; i++) {
      RABBIT_next_state(&c->m);
   }

   // Modify the counters
   for (i=0; i<8; i++) {
      c->m.c[i] ^= c->m.x[(i+4) & 7];
   }

   // Copy master instance to work instance
   memcpy (&c->w.x[0], &c->m.x[0], sizeof(RABBIT_state));
}

x86 assembly

; Key setup
RABBIT_setkeyx:
_RABBIT_setkeyx:
    pushad
    mov    edi, [esp+32+4]
    mov    esi, [esp+32+8]
    ; ---------------------
    ; Generate four subkeys
    ; ---------------------
    lodsd
    xchg   eax, edx
    lodsd
    xchg   eax, ebx
    lodsd
    xchg   eax, ecx
    lodsd
    xchg   eax, edx
    ; --------------------------------
    ; Generate initial state variables
    ; --------------------------------
    pushad
    stosd

    ; c->m.x[1] = U32V(k3<<16) | (k2>>16);
    mov    ebp, edx
    shld   ebp, ecx, 16
    xchg   ebp, eax
    stosd

    xchg   ebx, eax
    stosd

    ; c->m.x[3] = U32V(k0<<16) | (k3>>16);
    mov    ebx, ebp
    shld   ebx, edx, 16
    xchg   ebx, eax
    stosd

    xchg   ecx, eax
    stosd

    ; c->m.x[5] = U32V(k1<<16) | (k0>>16);
    mov    ecx, ebx
    shld   ecx, ebp, 16
    xchg   ecx, eax
    stosd

    xchg   edx, eax
    stosd

    ; c->m.x[7] = U32V(k2<<16) | (k1>>16);
    mov    edx, ecx
    shld   edx, ebx, 16
    xchg   edx, eax
    stosd
    
    ; -------------------------------
    ; Generate initial counter values
    ; -------------------------------
    rol    ecx, 16
    xchg   ecx, eax
    stosd

    ; c->m.c[1] = (k0&0xFFFF0000) | (k1&0xFFFF);
    xchg   ebp, eax         ; k1.lo
    xchg   bx, ax
    stosd

    rol    edx, 16
    xchg   edx, eax
    stosd

    ; c->m.c[3] = (k1&0xFFFF0000) | (k2&0xFFFF);
    rol    ecx, 16          ; k2.lo
    xchg   ecx, eax
    stosd

    xchg   edx, eax
    xchg   bx, ax
    rol    eax, 16
    stosd

    ; c->m.c[5] = (k2&0xFFFF0000) | (k3&0xFFFF);
    xchg   ecx, eax         ; k3.lo
    xchg   bp, ax
    rol    eax, 16
    stosd

    xchg   edx, eax
    xchg   bx, ax
    rol    eax, 16
    stosd

    ; c->m.c[7] = (k3&0xFFFF0000) | (k0&0xFFFF);
    xchg   ecx, eax         ; k0.lo
    xchg   bp, ax
    rol    eax, 16
    stosd
    
    xor    eax, eax
    stosd
    popad

    ; -----------------------------
    ; Iterate the system four times
    ; -----------------------------
    push   4
    pop    ecx
rsk_l0:
    push   edi
    call   RABBIT_next_state
    pop    edi
    loop   rsk_l0
    
    ; -------------------
    ; Modify the counters
    ; -------------------
    xchg   eax, ecx
    cdq
rsk_l1:
    ; c->m.c[i] ^= c->m.x[(i+4) & 7];
    lea    ecx, [eax+edx+4]
    and    ecx, 7
    mov    ebx, [edi+ecx*4]
    xor    [edi+eax*4+32], ebx
    inc    eax
    cmp    al, 8
    jne    rsk_l1
    
    ; -------------------------------------
    ; Copy master instance to work instance
    ; -------------------------------------
    mov    cl, 68
    mov    esi, edi
    add    edi, ecx
    rep    movsb
    popad
    ret

IV setup

// IV setup
void RABBIT_setiv(RABBIT_ctx *c, const void* iv)
{
   rabbit_blk *v=(rabbit_blk*)iv;
   uint32_t   sv[4];
   int        i;

   // Generate four subvectors
   sv[0] = v->d[0];
   sv[2] = v->d[1];

   sv[1] = (sv[0]>>16) | (sv[2]&0xFFFF0000);
   sv[3] = (sv[2]<<16) | (sv[0]&0x0000FFFF);

   // Modify counter values
   for (i=0; i<8; i++) {
      c->w.c[i] = c->m.c[i] ^ sv[i & 3];
   }

   // Copy state variables but not carry flag
   memcpy (&c->w.x[0], &c->m.x[0], 32);

   // Iterate the system four times
   for (i=0; i<4; i++) {
      RABBIT_next_state(&c->w);
   }
}

x86 assembly

; IV setup
RABBIT_setivx:
_RABBIT_setivx:
    ; save registers
    pushad
    mov    edx, [esp+32+4]   ; ctx
    mov    esi, [esp+32+8]   ; iv
    ; Generate four subvectors
    lodsd
    xchg   edi, eax          ; sv[0] = v->d[0];
    lodsd
    xchg   ebp, eax          ; sv[2] = v->d[1];

    pushad                   ; create sv variable (32 bytes), init first and third

    ror    edi, 16           ; sv[0] >> 16
    
    xchg   ebp, eax          ; sv[1] = (sv[0]>>16) | (sv[2]&0xFFFF0000);
    xchg   di, ax
    mov    [esp+4], eax
    
    ror    edi, 16           ; sv[3] = (sv[2]<<16) | (sv[0]&0x0000FFFF);
    mov    [esp+12], edi
    ; Modify counter values
    xor    ecx, ecx
rsv_l0:
    mov    eax, [edx+ecx*4+32]   ; eax = c->m.c[i]
    mov    ebx, ecx
    and    ebx, 3
    xor    eax, [esp+ebx*4]      ; eax ^= sv[i & 3]
    mov    [edx+ecx*4+100], eax  ; c->w.c[i] = eax
    inc    ecx
    cmp    cl, 8
    jnz    rsv_l0
    ; Copy master state variables to work
    pushad
    mov    esi, edx
    lea    edi, [esi+68]
    rep    movsd
    popad
    ; Iterate the system four times
    mov    cl, 4
rsv_l1:
    lea    eax, [edx+68]
    push   eax
    call   RABBIT_next_state ; RABBIT_next_state(&c->w);
    pop    eax
    loop   rsv_l1
    
    ; release sv of 32 bytes
    popad
    ; restore registers
    popad
    ret

Encryption/Decryption

void RABBIT_crypt(RABBIT_ctx *c, void* input, uint32_t inlen)
{
   uint32_t   i, len;
   rabbit_blk x;
   uint8_t    *in=(uint8_t*)input;

   for (;;)
   {
     // break on zero length
     if (inlen==0) break;

     // update state
     RABBIT_next_state(&c->w);

     for (i=0; i<4; i++) {
       x.d[i] = c->w.x[i<<1];
     }

     x.d[0] ^= (c->w.x[5]>>16) ^ (c->w.x[3]<<16);
     x.d[1] ^= (c->w.x[7]>>16) ^ (c->w.x[5]<<16);
     x.d[2] ^= (c->w.x[1]>>16) ^ (c->w.x[7]<<16);
     x.d[3] ^= (c->w.x[3]>>16) ^ (c->w.x[1]<<16);

     for (i=0; i<16 && inlen!=0; i++) {
       *in++ ^= x.b[i];
       inlen--;
     }
   }
}

x86 assembly

; encrypt/decrypt a message of any size
RABBIT_cryptx:
_RABBIT_cryptx:
    pushad                   ; save registers
    lea    esi, [esp+32+4]   ; esi = parameters
    pushad                   ; alloc 32-bytes on stack
    lodsd
    xchg   ebx, eax          ; ebx=c
    lodsd
    xchg   ecx, eax          ; ecx=input
    lodsd
    xchg   ecx, eax          ; ecx=inlen
    xchg   eax, esi          ; esi=input
    add    ebx, 68           ; ebx=&c->w.x[0]
rc_l0:
    jecxz  rc_l3             ; break if ecx==0
    push   ebx
    call   RABBIT_next_state
    pop    eax
    
    xor    eax, eax
    cdq
    mov    edi, esp
    pushad
    ; for (i=0; i<4; i++) {
    ;   x.d[i] = c->w.x[i<<1];
    ; }
rc_l1:
    mov    ecx, [ebx+eax*8]
    mov    [edi+eax*4], ecx
    inc    eax
    cmp    al, 4
    jnz    rc_l1
    
    mov    esi, [ebx+1*4]    ; ebx = c->w.x[1]
    mov    ecx, [ebx+3*4]    ; ecx = c->w.x[3]
    mov    edx, [ebx+5*4]    ; edx = c->w.x[5]
    mov    ebp, [ebx+7*4]    ; ebp = c->w.x[7]
    mov    eax, ecx
    ; x.d[0] ^= (c->w.x[5]>>16) ^ (c->w.x[3]<<16);
    shld   ecx, edx, 16  ; x5 >> 16, x3 << 16
    xor    [edi+4*0], ecx
    ; x.d[1] ^= (c->w.x[7]>>16) ^ (c->w.x[5]<<16);
    shld   edx, ebp, 16
    xor    [edi+4*1], edx
    ; x.d[2] ^= (c->w.x[1]>>16) ^ (c->w.x[7]<<16);
    shld   ebp, esi, 16
    xor    [edi+4*2], ebp
    ; x.d[3] ^= (c->w.x[3]>>16) ^ (c->w.x[1]<<16);
    shld   esi, eax, 16
    xor    [edi+4*3], esi
    popad
    
    ; for (i=0; i<16 && inlen!=0; i++) {
    ;   *in++ ^= x.b[i];
    ;   inlen--;
    ; }
    mov    dl, 16            ; do 16 bytes or remaining
rc_l2:
    mov    al, [edi]         ; al = x.b[i]
    inc    edi
    xor    [esi], al         ; *in ^= al
    inc    esi               ; in++
    dec    edx               ; i--
    loopnz rc_l2             ; break if --len==0 or i==0
    inc    ecx
    loop   rc_l0
rc_l3:
    popad                    ; free stack
    popad                    ; restore registers
    ret

Sources here.

Advertisements
This entry was posted in assembly, cryptography, encryption, security and tagged , , , , . Bookmark the permalink.

1 Response to Rabbit Stream Cipher

  1. Peter Ferrie says:

    The popad behavior comes from the CPU, not the operating system. On Linux, for example, the stack pointer slot is also available to hold data. It’s a great feature.

    Liked by 1 person

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s