RC4 Stream cipher

Introduction

RC4 is an infamous stream cipher designed in 1987 by Ron Rivest. It was an RSA trade secret up until September 1994 when an anonymous user posted source code to the cypherpunks mailing list

Already used extensively in commercial applications and protocols, it gained notoriety among the security community as a result of a research paper published in 2001 that helped develop the first WEP key recovery tools.

Weaknesses in the Key Scheduling Algorithm of RC4 by Fluhrer, Mantin and Shamir quickly saw the demise of RC4 as a secure cipher. Its reputation was damaged again with the publication of Efficient Reconstruction of RC4 Keys from Internal States by Eli Biham and and Yaniv Carmeli in 2008 but by this time, most people knew it no longer provided adequate protection of data.

Matthew Green from John Hopkins University has a great post called What’s the deal with RC4? if you want more background into the cipher itself.

What follows is a tiny implementation in x86 assembler, optimized for size, so if you require a version optimized for speed, consider using a library such as PolarSSL or OpenSSL

Key-scheduling algorithm (KSA)

The KSA is very simple and consists of mixing key bytes with 256 8-bit element array we call sbox (substitution box). In order to optimize the KSA, some implementations use 32-bit elements (OpenSSL for example) but of course, this requires more code and my goal is to optimize for size.

for i from 0 to 255
    S[i] := i
endfor
j := 0
for i from 0 to 255
    j := (j + S[i] + key[i mod keylength]) mod 256
    swap values of S[i] and S[j]
endfor

The structure I’m using for key context is similar to those you see in many open source libraries. x and y are 32-bit integers because when loading/storing we only use 1 byte each which also happens to zero out the 32-bit register. You could load a byte but then have to zero the upper 24-bits for indexing the sbox array.

typedef struct _RC4_KEY {
  uint32_t x, y;
  uint8_t s[256];
} RC4_KEY;

Written in C, the above pseudocode looks like the following which is essentially the code posted to cipherpunks mailing list with some minor modifications.

void RC4_set_key (RC4_KEY *c, uint32_t len, void *key)
{
  int i, j, t;
  uint8_t *k=(uint8_t*)key;
  
  // Initialize ctx with identity sbox
  for (i=0; i<256; i++) {
    c->s[i] = (uint8_t)i;
  }

  c->x = 0;
  c->y = 0;
  
  // Randomize the sbox using key data
  for (i=0, j=0; i<256; i++) {
    j = (j + (c->s[i] + k[i % len])) % 256;
    
    t = c->s[i];
    c->s[i] = c->s[j];
    c->s[j] = t;
  }
}

And the x86 assembler version where we attempt to reduce space.

_rc4_setkey:
rc4_setkey:
    pushad
    lea    esi, [esp+4]
    lodsd
    xchg   eax, ebp
    lodsd
    push   eax
    lodsd
    xchg   eax, esi
    pop    esi
    xor    eax, eax               ; i=0
    xor    ecx, ecx
    cdq                           ; j=0
    stosd                         ; x=0
    stosd                         ; y=0
r4_l0:
    mov    [edi+eax], al          ; s[i] = i
    inc    al                     ; i++
    jnz    r4_l0
r4_l1:
    cmp    ecx, ebp               ; if (key_idx == key_len)
    jne    r4_l2
    xor    ecx, ecx               ; key_idx = 0
r4_l2:
    mov    bl, [edi+eax]          ; s[i]
    add    dl, bl                 ; j += s[i]
    add    dl, [esi+ecx]          ; j += key[key_idx]
    xchg   bl, [edi+edx]          ; s[j] = s[i]
    mov    [edi+eax], bl          ; s[i] = s[j]
    inc    ecx                    ; key_idx++
    inc    al                     ; i++
    jnz    r4_l1                  ;
    
    popad
    ret

Pseudo-random generation algorithm (PRGA)

A PRNG originally part of OpenBSD until version 5.5 (released in may 2014) used RC4. It now uses ChaCha20 instead.

i := 0
j := 0
while GeneratingOutput:
    i := (i + 1) mod 256
    j := (j + S[i]) mod 256
    swap values of S[i] and S[j]
    K := S[(S[i] + S[j]) mod 256]
    output K
endwhile

The implementation in C is similar to the KSA.

void RC4 (RC4_KEY *c, uint32_t len, void *in, void *out)
{
  uint32_t i;
  uint8_t  x=(uint8_t)c->x, y=(uint8_t)c->y, j=0, t;
  
  for (i=0; i<len; i++) {
    // Update x and y
    x = (x + 1) % 256;
    y = (y + c->s[x]) % 256;

    // Modify sbox
    t = c->s[x];
    c->s[x] = c->s[y];
    c->s[y] = t;

    // Encrypt/decrypt next byte
    j = (c->s[x] + c->s[y]) % 256;
    ((uint8_t*)in)[i] ^= c->s[j];
  }
  c->x = x;
  c->y = y;
}

The assembler function.

_rc4:
rc4:
    pushad
    lea    esi, [esp+4]
    lodsd
    xchg   eax, ecx
    jecxz  r4_l1
    lodsd
    xchg   eax, edi
    lodsd
    xchg   eax, esi
    push   esi               ; save pointer to ctx
    lodsd                    ; eax = x
    xchg   eax, ebx      
    lodsd                    ; ebx = y
    xchg   eax, ebx
    cdq
r4_l0:
    inc    al                ; x++
    
    mov    dl, [esi+eax]     ; dl = s[x]
    add    bl, dl            ; y += dl
    xchg   dl, [esi+ebx]     ; swap s[y], s[x]
    mov    [esi+eax], dl     ; s[x] = s[y]
    
    add    dl, [esi+ebx]     ; dl = s[x] + s[y]
    mov    dl, [esi+edx]     ; dl = s[ dl ]
    xor    [edi], dl         ; *p ^= (s[ s[x] + s[y] ])
    inc    edi               ; p++    
    loop   r4_l0
    pop    edi               ; edi = rc4key
    stosd                    ; save x
    xchg   eax, ebx 
    stosd                    ; save y
r4_l1:
    popad
    ret

Update

I had an idea to shrink rc4 and the reason it wasn’t implemented was because you couldn’t save x+y.

However, since this version written by Peter Ferrie doesn’t save x+y either, I decided to revisit the initial idea and here it is in 64 bytes.

The size of buffer is BUFSIZ or whatever you define it to be.

;
; Single call RC4 in x86 assembly (for 128-bit keys)
; Odzhan
;
; 64 bytes

;rc4_ctx struct
;  sbox db 256 dup (?)   ; uint8_t s[256];
;  key  db 16 dup (?)    ; uint8_t k[16];
;  len  dd ?             ; uint32_t len;
;  buf  db BUFSIZ dup(?) ; uint8_t buf[BUFSIZ];
;rc4_ctx ends
    
    bits 32
    
%ifndef BIN
    global RC4Z
    global _RC4Z
%endif

_RC4Z:
RC4Z:
    pushad
    mov    edi, [esp+32+4]   ; edi=rc4_key
    mov    esi, edi          ; esi=rc4_key
    ; x=0, y=0, i=0            
    xor    ecx, ecx          ;
    mul    ecx
; --------------------------------------------
; KSA. edi=s + key, esi=s, eax=i, edx=j
; --------------------------------------------
init_sbox:
    stosb                    ; s[i]=i
    inc    al                ; i++
    jnz    init_sbox
    ; edi now points to 128-bit key
init_key:
    movzx  ebx, al           ; ebx = k_idx
    and    bl, 15            ; %= 16
    
    add    dl, [edi+ebx]     ; j += k[k_idx % 16]
    jmp    swap
upd_idx:
    inc    al                ; i++
    jnz    init_key          ; i<256
    cdq                      ; y=0 
    mov    ecx, [edi+16]     ; ecx = len
; -----------------------------------------------
; PRNG. edi=buf, esi=s, eax=x, edx=y, ecx=len 
; -----------------------------------------------
crypt_data:
    inc    al                ; x++
swap:    
    mov    bl, [esi+eax]     ; bl = s[i] or bl=s[x]
    add    dl, bl            ; j += bl or y += bl 
    xchg   bl, [esi+edx]     ; XCHG(bl, s[j]) or XCHG(bl, s[y])
    mov    [esi+eax], bl
    jecxz  upd_idx           ; if ecx == 0 goto upd_idx
    
    add    bl, [esi+edx]     ; bl = s[x] + s[y]
    mov    bl, [esi+ebx]     ; bl = s[ bl ]
    xor    [edi+16+4], bl    ; *p ^= (s[ s[x] + s[y] ])
    inc    edi               ; p++     
    loop   crypt_data
    popad
    ret

Below is just a C string with the RC4 assembly code which uses cdecl calling convention. It’s only for x86 architecture. If you wanted to change buf to void* and use malloc() instead, it would require changes to the code.

#define RC4Z_SIZE 64

char RC4Z[] = {
  /* 0000 */ "\x60"             /* pushad              */
  /* 0001 */ "\x8b\x7c\x24\x24" /* mov edi, [esp+0x24] */
  /* 0005 */ "\x89\xfe"         /* mov esi, edi        */
  /* 0007 */ "\x31\xc9"         /* xor ecx, ecx        */
  /* 0009 */ "\xf7\xe1"         /* mul ecx             */
  /* 000B */ "\xaa"             /* stosb               */
  /* 000C */ "\xfe\xc0"         /* inc al              */
  /* 000E */ "\x75\xfb"         /* jnz 0xb             */
  /* 0010 */ "\x0f\xb6\xd8"     /* movzx ebx, al       */
  /* 0013 */ "\x80\xe3\x0f"     /* and bl, 0xf         */
  /* 0016 */ "\x02\x14\x1f"     /* add dl, [edi+ebx]   */
  /* 0019 */ "\xeb\x0a"         /* jmp 0x25            */
  /* 001B */ "\xfe\xc0"         /* inc al              */
  /* 001D */ "\x75\xf1"         /* jnz 0x10            */
  /* 001F */ "\x99"             /* cdq                 */
  /* 0020 */ "\x8b\x4f\x10"     /* mov ecx, [edi+0x10] */
  /* 0023 */ "\xfe\xc0"         /* inc al              */
  /* 0025 */ "\x8a\x1c\x06"     /* mov bl, [esi+eax]   */
  /* 0028 */ "\x00\xda"         /* add dl, bl          */
  /* 002A */ "\x86\x1c\x16"     /* xchg [esi+edx], bl  */
  /* 002D */ "\x88\x1c\x06"     /* mov [esi+eax], bl   */
  /* 0030 */ "\xe3\xe9"         /* jecxz 0x1b          */
  /* 0032 */ "\x02\x1c\x16"     /* add bl, [esi+edx]   */
  /* 0035 */ "\x8a\x1c\x1e"     /* mov bl, [esi+ebx]   */
  /* 0038 */ "\x30\x5f\x14"     /* xor [edi+0x14], bl  */
  /* 003B */ "\x47"             /* inc edi             */
  /* 003C */ "\xe2\xe5"         /* loop 0x23           */
  /* 003E */ "\x61"             /* popad               */
  /* 003F */ "\xc3"             /* ret                 */
};

Results

The x64 version was surprisingly smaller than the 32-bit version, mostly because it uses the fastcall convention. The 32-bit could be modified to use fastcall instead of cdecl which would save more space but only the tiny version has this option.

architecture size
x86 single call 64
x86 101
x64 100

Source code for x86 and x86-64 here

Advertisements
This entry was posted in assembly, cryptography, encryption, programming, security and tagged , , , , , , , . Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s