Asmcodes: Salsa20 Stream Cipher

Introduction

Salsa20 or Snuffle 2005 is a stream cipher designed, written and published in 2005 by mathematician D.J Bernstein. It supports 128/256-bit keys and 64-bit nonce values.

Another version XSalsa20 supports nonces up to 192-bits if 64-bits is insufficient to what you need.

It was selected for the final portfolio of the eStream (ECRYPT Stream Cipher Project) so we can presume it’s undergone extensive cryptanalysis during the selection process.

The implementation here only supports 256-bit keys.

About the assembly code

ChaCha20 was already implemented and is based on Salsa20 although ChaCha20 is apparently a better design for performance without losing security. The x86 code was a joint effort between Peter Ferrie and myself while writing ChaCha20 and so few changes were made to implement Salsa20.

Context/State

The keystream buffer is 64-bytes or 512-bits which gets updated for each time the Salsa20 core is invoked.

#define S20_BLK_LEN 64

typedef union _s20_blk_t {
  uint8_t  v8[S20_BLK_LEN];
  uint16_t v16[S20_BLK_LEN/2];
  uint32_t v32[S20_BLK_LEN/4];
  uint64_t v64[S20_BLK_LEN/8];
} s20_blk;

typedef struct _s20_ctx_t {
  s20_blk s;
} s20_ctx;

Key generation

This function expects 256-bit keys. The ChaCha20 stream cipher (Snuffle 2008) has a better design IMHO but hasn’t received as much scrutiny as Salsa20 according to the author.

// setup the key, must be 256-bits with 64-bit nonce
void s20_setkey(s20_ctx *c, void *key, void *nonce)
{
  s20_blk *iv=(s20_blk*)nonce;
  s20_blk *k=(s20_blk*)key;
  
  c->s.v32[ 0] = 0x61707865;
  c->s.v32[ 1] = k->v32[0];
  c->s.v32[ 2] = k->v32[1];
  c->s.v32[ 3] = k->v32[2];
  c->s.v32[ 4] = k->v32[3];
  c->s.v32[ 5] = 0x3320646E;
  c->s.v32[ 6] = iv->v32[0];
  c->s.v32[ 7] = iv->v32[1];
  c->s.v32[ 8] = 0;
  c->s.v32[ 9] = 0;
  c->s.v32[10] = 0x79622D32;
  c->s.v32[11] = k->v32[4];
  c->s.v32[12] = k->v32[5];
  c->s.v32[13] = k->v32[6];
  c->s.v32[14] = k->v32[7];
  c->s.v32[15] = 0x6B206574;
}
_s20_setkeyx:
s20_setkeyx:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   edi, eax
    lodsd
    push   eax
    lodsd
    xchg   ebx, eax
    pop    esi
    mov    eax, 061707865h
    cdq
    stosd
    ; copy 16 bytes of 256-bit key
    movsd    
    movsd    
    movsd    
    movsd    
    mov    eax, 03320646Eh
    stosd
    ; copy 64-bit iv to 6,7
    xchg   esi, ebx
    movsd
    movsd
    ; zero 64-bit counter at 8,9
    xchg   eax, edx
    stosd
    stosd
    ; store 32-bits at 10
    mov    eax, 079622D32h
    stosd
    ; store remainder of key at 11-14
    xchg   esi, ebx
    movsd    
    movsd    
    movsd    
    movsd
    ; store last 32-bits
    mov    eax, 06B206574h
    stosd
    popad
    ret

Salsa20 Core

The 64-byte input x to Salsa20 is viewed in little-endian form as 16 words x0, x1, x2, …, x15 in {0,1,…,2^32-1}. These 16 words are fed through 320 invertible modifications, where each modification changes one word. The resulting 16 words are added to the original x0, x1, x2, …, x15 respectively modulo 2^32, producing, in little-endian form, the 64-byte output Salsa20(x).

Each modification involves xor’ing into one word a rotated version of the sum of two other words modulo 2^32. Thus the 320 modifications involve, overall, 320 additions, 320 xor’s, and 320 rotations. The rotations are all by constant distances.

The entire series of modifications is a series of 10 identical double-rounds. Each double-round is a series of 2 rounds. Each round is a set of 4 parallel quarter-rounds. Each quarter-round modifies 4 words.

// transforms block using ARX instructions
void QUARTERROUND(s20_blk *blk, uint16_t idx) 
{
    uint32_t a, b, c, d;
    uint32_t *x=(uint32_t*)&blk->v8;
    
    a = (idx         & 0xF);
    b = ((idx >>  4) & 0xF);
    c = ((idx >>  8) & 0xF);
    d = ((idx >> 12) & 0xF);

    x[b] ^= ROTL32((x[a] + x[d]), 7);
    x[c] ^= ROTL32((x[b] + x[a]), 9);
    
    x[d] ^= ROTL32((x[c] + x[b]),13);
    x[a] ^= ROTL32((x[d] + x[c]),18);
}

// encrypt/decrypt 64 byte block
// do not call directly, use s20_encrypt instead
void s20_core (s20_ctx *c, uint8_t *in, uint32_t len)
{
    s20_blk x;
    int     i, j;

    // 16-bit integers of each index
    uint16_t idx16[8]=
    { 0xC840, 0x1D95, 0x62EA, 0xB73F, 
      0x3210, 0x4765, 0x98BA, 0xEDCF };
    
    // copy state to local space
    for (i=0; i<16; i++) { 
      x.v32[i] = c->s.v32[i];
    }
    // apply 20 rounds
    for (i=0; i<20; i+=2) {
      for (j=0; j<8; j++) {
        QUARTERROUND(&x, idx16[j]);
      }
    }
    // add state to x
    for (i=0; i<16; i++) {
      x.v32[i] += c->s.v32[i];
    }
    // xor input with x
    for (i=0; i<len; i++) {
      ((uint8_t*)in)[i] ^= x.v8[i];
    }
    // update block counter
    c->s.v64[4]++;
    // stopping at 2^70 bytes per nonce is user's responsibility
}
%define a eax
%define b ebx
%define c edx
%define d esi
%define x edi

%define t0 ebp

; void QUARTERROUND(s20_blk *blk, uint16_t index)
; expects edi points to x array
QUARTERROUND:
    pushad
    push   a                ; save eax
    xchg   ah, al
    aam    16
    
    movzx  d, ah
    movzx  c, al

    pop    a
    aam    16
    
    movzx  b, ah
    movzx  a, al

    lea    a, [x+a*4]
    lea    b, [x+b*4]
    lea    c, [x+c*4]
    lea    d, [x+d*4]

    ; load ecx with rotate values
    ; 16, 12, 8, 7
    mov    ecx, 0120D0907h
s20_q0:
    ; x[b] ^= ROTL32((x[a] + x[d]), 7);
    mov    t0, [a]           ; ebp=x[a]
    add    t0, [d]
    rol    t0, cl
    xor    [b], t0
    
    xchg   cl, ch
    
    mov    t0, [b]           ; ebp=x[b]
    add    t0, [a]
    rol    t0, cl
    xor    [c], t0
    
    xchg   a, c
    xchg   b, d
    
    shr    ecx, 16
    jnz    s20_q0
    
    popad
    ret
  
; void s20_corex (s20_ctx *ctx, void *in, uint32_t len)
; do not call directly
; expects state in ebx, length in eax, input in edx
_s20_corex:
s20_corex:
    pushad

    ; allocate 64-bytes local space, x
    push   64
    pop    ecx
    sub    esp, ecx
    
    ; copy state to x
    mov    edi, esp
    mov    esi, ebx
    rep    movsb

    ; move x into edi
    mov    edi, esp
    push   eax
    push   20/2  ; 20 rounds
    pop    ebp
s20_c0:
    ; load indexes
    call   s20_c1
    dw 0C840h, 01D95h, 062EAh, 0B73Fh
    dw 03210h, 04765h, 098BAh, 0EDCFh
s20_c1:
    pop    esi  ; pointer to indexes
    mov    cl, 8
s20_c2:
    lodsw
    call   QUARTERROUND
    loop   s20_c2
    dec    ebp
    jnz    s20_c0
    
    ; add state to x
    mov    cl, 16
s20_c3:
    mov    eax, [ebx+ecx*4-4]
    add    [edi+ecx*4-4], eax
    loop   s20_c3

    ; xor input with x
    pop    ecx               ; ecx=len
s20_c4:
    mov    al, byte[edi+ecx-1]
    xor    byte[edx+ecx-1], al
    loop   s20_c4
    
    ; update block counter
    stc
    adc    dword[ebx+8*4], ecx
    adc    dword[ebx+9*4], ecx
    
    ; free stack
    popad
    popad
    ; restore registers
    popad
    ret

Encryption and Decryption

The same function is used to encrypt/decrypt providing the same key and nonce are used during key setup.

// encrypt stream of bytes
void s20_crypt (s20_ctx *c, void *in, uint32_t len) 
{
    uint32_t r;
    uint8_t  *p=(uint8_t*)in;
    
    while (len) {
      r=(len>64) ? 64 : len;
      s20_core(c, p, r);
      len -= r;
      p += r;
    }
}
_s20_cryptx:
s20_cryptx:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   ebx, eax
    lodsd
    xchg   edx, eax
    lodsd
    xchg   ecx, eax
    jecxz  s20_l1            ; exit if len==0
    xor    eax, eax          ; r=0
s20_l0:
    mov    al, 64
    cmp    ecx, eax          ; eax=len < 64 ? len : 64
    cmovb  eax, ecx
    call   s20_corex
    add    edx, eax          ; p += r;
    sub    ecx, eax          ; len -= r;
    jnz    s20_l0
s20_l1:
    popad
    ret

Summary

architecture size
x86 245
x64 n/a

For source code and any future updates, please refere to code here.

Advertisements
This entry was posted in cryptography, encryption, programming, security and tagged , , , , . Bookmark the permalink.

One Response to Asmcodes: Salsa20 Stream Cipher

  1. Pingback: Asmcodes: Rabbit Stream Cipher | x86 crypto

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s