## RC4 Stream Cipher

### Introduction

RC4 is an infamous stream cipher designed in 1987 by Ron Rivest. It was an RSA trade secret up until September 1994 when an anonymous user posted source code to the cypherpunks mailing list. Having been used extensively in commercial applications and protocols, it gained notoriety among the security community as a result of a research paper published in 2001 that helped develop the first WEP key recovery tools. Weaknesses in the Key Scheduling Algorithm of RC4 by Fluhrer, Mantin and Shamir quickly saw the demise of RC4 as a secure cipher.

Matthew Green from John Hopkins University has a great post called What’s the deal with RC4? if you want more background into the cipher itself. What follows is a tiny implementation in x86 assembler, optimized for size. You really shouldn’t be using RC4 anymore, but if you require a version optimized for speed, consider using a library such as OpenSSL

### Key-scheduling algorithm (KSA)

The KSA is very simple and consists of mixing key bytes with 256 8-bit element array we call sbox (substitution box). In order to optimize the KSA, some implementations use 32-bit elements (OpenSSL for example) but of course, this requires more code and my goal is to optimize for size.

```for i from 0 to 255
S[i] := i
endfor
j := 0
for i from 0 to 255
j := (j + S[i] + key[i mod keylength]) mod 256
swap values of S[i] and S[j]
endfor
```

The structure I’m using for key context is similar to those you see in many open source libraries. x and y are 32-bit integers because when loading/storing we only use 1 byte each which also happens to zero out the 32-bit register. You could load a byte but then have to zero the upper 24-bits for indexing the sbox array.

```typedef struct _RC4_KEY {
uint32_t x, y;
uint8_t s[256];
} RC4_KEY;
```

Written in C, the above pseudocode looks like the following which is essentially the code posted to cipherpunks mailing list with some minor modifications.

```void RC4_set_key (RC4_KEY *c, uint32_t len, void *key)
{
int i, j, t;
uint8_t *k=(uint8_t*)key;

// Initialize ctx with identity sbox
for (i=0; i<256; i++) {
c->s[i] = (uint8_t)i;
}

c->x = 0;
c->y = 0;

// Randomize the sbox using key data
for (i=0, j=0; i<256; i++) {
j = (j + (c->s[i] + k[i % len])) % 256;

t = c->s[i];
c->s[i] = c->s[j];
c->s[j] = t;
}
}
```

And the x86 assembler version where we attempt to reduce space.

```_rc4_setkey:
rc4_setkey:
lea    esi, [esp+4]
lodsd
xchg   eax, ebp
lodsd
push   eax
lodsd
xchg   eax, esi
pop    esi
xor    eax, eax               ; i=0
xor    ecx, ecx
cdq                           ; j=0
stosd                         ; x=0
stosd                         ; y=0
r4_l0:
mov    [edi+eax], al          ; s[i] = i
inc    al                     ; i++
jnz    r4_l0
r4_l1:
cmp    ecx, ebp               ; if (key_idx == key_len)
jne    r4_l2
xor    ecx, ecx               ; key_idx = 0
r4_l2:
mov    bl, [edi+eax]          ; s[i]
add    dl, bl                 ; j += s[i]
add    dl, [esi+ecx]          ; j += key[key_idx]
xchg   bl, [edi+edx]          ; s[j] = s[i]
mov    [edi+eax], bl          ; s[i] = s[j]
inc    ecx                    ; key_idx++
inc    al                     ; i++
jnz    r4_l1                  ;

ret
```

### Pseudo-random generation algorithm (PRGA)

A PRNG originally part of OpenBSD until version 5.5 (released in may 2014) used RC4. It now uses ChaCha20 instead.

```i := 0
j := 0
while GeneratingOutput:
i := (i + 1) mod 256
j := (j + S[i]) mod 256
swap values of S[i] and S[j]
K := S[(S[i] + S[j]) mod 256]
output K
endwhile
```

The implementation in C is similar to the KSA.

```void RC4 (RC4_KEY *c, uint32_t len, void *in, void *out)
{
uint32_t i;
uint8_t  x=(uint8_t)c->x, y=(uint8_t)c->y, j=0, t;

for (i=0; i<len; i++) {
// Update x and y
x = (x + 1) % 256;
y = (y + c->s[x]) % 256;

// Modify sbox
t = c->s[x];
c->s[x] = c->s[y];
c->s[y] = t;

// Encrypt/decrypt next byte
j = (c->s[x] + c->s[y]) % 256;
((uint8_t*)in)[i] ^= c->s[j];
}
c->x = x;
c->y = y;
}
```

The assembler function.

```_rc4:
rc4:
lea    esi, [esp+4]
lodsd
xchg   eax, ecx
jecxz  r4_l1
lodsd
xchg   eax, edi
lodsd
xchg   eax, esi
push   esi               ; save pointer to ctx
lodsd                    ; eax = x
xchg   eax, ebx
lodsd                    ; ebx = y
xchg   eax, ebx
cdq
r4_l0:
inc    al                ; x++

mov    dl, [esi+eax]     ; dl = s[x]
add    bl, dl            ; y += dl
xchg   dl, [esi+ebx]     ; swap s[y], s[x]
mov    [esi+eax], dl     ; s[x] = s[y]

add    dl, [esi+ebx]     ; dl = s[x] + s[y]
mov    dl, [esi+edx]     ; dl = s[ dl ]
xor    [edi], dl         ; *p ^= (s[ s[x] + s[y] ])
inc    edi               ; p++
loop   r4_l0
pop    edi               ; edi = rc4key
stosd                    ; save x
xchg   eax, ebx
stosd                    ; save y
r4_l1:
ret
```

### Encryption in one call

I had an idea to shrink rc4 and the reason it wasn’t implemented was because you couldn’t save x+y.

However, since this version written by Peter Ferrie doesn’t save x+y either, I decided to revisit the initial idea and here it is in 64 bytes.

The size of buffer is BUFSIZ or whatever you define it to be.

```;
; Single call RC4 in x86 assembly (for 128-bit keys)
; Odzhan
;
; 64 bytes

;rc4_ctx struct
;  sbox db 256 dup (?)   ; uint8_t s[256];
;  key  db 16 dup (?)    ; uint8_t k[16];
;  len  dd ?             ; uint32_t len;
;  buf  db BUFSIZ dup(?) ; uint8_t buf[BUFSIZ];
;rc4_ctx ends

bits 32

%ifndef BIN
global RC4Z
global _RC4Z
%endif

_RC4Z:
RC4Z:
mov    edi, [esp+32+4]   ; edi=rc4_key
mov    esi, edi          ; esi=rc4_key
; x=0, y=0, i=0
xor    ecx, ecx          ;
mul    ecx
; --------------------------------------------
; KSA. edi=s + key, esi=s, eax=i, edx=j
; --------------------------------------------
init_sbox:
stosb                    ; s[i]=i
inc    al                ; i++
jnz    init_sbox
; edi now points to 128-bit key
init_key:
movzx  ebx, al           ; ebx = k_idx
and    bl, 15            ; %= 16

add    dl, [edi+ebx]     ; j += k[k_idx % 16]
jmp    swap
upd_idx:
inc    al                ; i++
jnz    init_key          ; i<256
cdq                      ; y=0
mov    ecx, [edi+16]     ; ecx = len
; -----------------------------------------------
; PRNG. edi=buf, esi=s, eax=x, edx=y, ecx=len
; -----------------------------------------------
crypt_data:
inc    al                ; x++
swap:
mov    bl, [esi+eax]     ; bl = s[i] or bl=s[x]
add    dl, bl            ; j += bl or y += bl
xchg   bl, [esi+edx]     ; XCHG(bl, s[j]) or XCHG(bl, s[y])
mov    [esi+eax], bl
jecxz  upd_idx           ; if ecx == 0 goto upd_idx

add    bl, [esi+edx]     ; bl = s[x] + s[y]
mov    bl, [esi+ebx]     ; bl = s[ bl ]
xor    [edi+16+4], bl    ; *p ^= (s[ s[x] + s[y] ])
inc    edi               ; p++
loop   crypt_data
ret
```

Below is just a C string with the RC4 assembly code which uses cdecl calling convention. It’s only for x86 architecture. If you wanted to change buf to void* and use malloc() instead, it would require changes to the code.

```#define RC4Z_SIZE 64

char RC4Z[] = {
/* 0000 */ "\x60"             /* pushad              */
/* 0001 */ "\x8b\x7c\x24\x24" /* mov edi, [esp+0x24] */
/* 0005 */ "\x89\xfe"         /* mov esi, edi        */
/* 0007 */ "\x31\xc9"         /* xor ecx, ecx        */
/* 0009 */ "\xf7\xe1"         /* mul ecx             */
/* 000B */ "\xaa"             /* stosb               */
/* 000C */ "\xfe\xc0"         /* inc al              */
/* 000E */ "\x75\xfb"         /* jnz 0xb             */
/* 0010 */ "\x0f\xb6\xd8"     /* movzx ebx, al       */
/* 0013 */ "\x80\xe3\x0f"     /* and bl, 0xf         */
/* 0016 */ "\x02\x14\x1f"     /* add dl, [edi+ebx]   */
/* 0019 */ "\xeb\x0a"         /* jmp 0x25            */
/* 001B */ "\xfe\xc0"         /* inc al              */
/* 001D */ "\x75\xf1"         /* jnz 0x10            */
/* 001F */ "\x99"             /* cdq                 */
/* 0020 */ "\x8b\x4f\x10"     /* mov ecx, [edi+0x10] */
/* 0023 */ "\xfe\xc0"         /* inc al              */
/* 0025 */ "\x8a\x1c\x06"     /* mov bl, [esi+eax]   */
/* 0028 */ "\x00\xda"         /* add dl, bl          */
/* 002A */ "\x86\x1c\x16"     /* xchg [esi+edx], bl  */
/* 002D */ "\x88\x1c\x06"     /* mov [esi+eax], bl   */
/* 0030 */ "\xe3\xe9"         /* jecxz 0x1b          */
/* 0032 */ "\x02\x1c\x16"     /* add bl, [esi+edx]   */
/* 0035 */ "\x8a\x1c\x1e"     /* mov bl, [esi+ebx]   */
/* 0038 */ "\x30\x5f\x14"     /* xor [edi+0x14], bl  */
/* 003B */ "\x47"             /* inc edi             */
/* 003C */ "\xe2\xe5"         /* loop 0x23           */
/* 003E */ "\x61"             /* popad               */
/* 003F */ "\xc3"             /* ret                 */
};
```

### Summary

The x64 version was surprisingly smaller than the 32-bit version, mostly because it uses the fastcall convention. The 32-bit could be modified to use fastcall instead of cdecl which would save more space but only the tiny version has this option.

Source code for x86 and x86-64 here

This entry was posted in assembly, cryptography, encryption, programming, security and tagged , , , , , , , . Bookmark the permalink.