Asmcodes: Threefish-256 block cipher

Introduction

Threefish is a symmetric block cipher designed and published in 2008 by Niels Ferguson, Stefan Lucks, Bruce Schneier, Doug Whiting, Mihir Bellare, Tadayoshi Kohno, Jon Callas and Jesse Walker.

Only the 256-bit key version is presented here which also processes 256-bit blocks of data. Threefish is an ARX(Add-Rotate-Xor) based cipher which makes it suitable for wide range of architectures.

This code is optimized for size so please consider another implementation if speed is required.

Key schedule

#define K(s) (((uint64_t*)key)[(s)])
#define T(s) (((uint64_t*)tweak)[(s)])

void threefish_setkey(
    threefish_ctx_t *c,
    const void *key, 
    const void *tweak)
{
  uint8_t i;
  
  memcpy((void*)c->k, key,   32);
  memcpy((void*)c->t, tweak, 16);
  
  c->k[4] = 0x1BD11BDAA9FC1A22ULL;
  
  for(i=0; i<4; i++){
    c->k[4] ^= K(i);
  }
  c->t[2] = T(0) ^ T(1);  
}
threefish_setkeyx:
_threefish_setkeyx:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   eax, edi          ; edi = ctx   
    lodsd
    xchg   eax, esi          ; esi = key
    push   edi               ; save ctx 
    
    ; memcpy((void*)c->k, key,   32);
    push   8
    pop    ecx
    rep    movsd
    scasd
    scasd
    
    ; memcpy((void*)c->t, tweak, 16);
    xchg   eax, esi
    lodsd
    xchg   eax, esi
    mov    cl, 4
    rep    movsd

    pop    edi 
    ; c->k[4] = 0x1BD11BDAA9FC1A22ULL;
    mov    eax, 0xA9FC1A22
    mov    ebx, 0x1BD11BDA
    mov    cl, 4    
tf_sk:
    xor    eax, [edi]
    scasd
    xor    ebx, [edi]
    scasd
    loop   tf_sk
    
    stosd
    xchg   eax, ebx
    stosd
    movq   t0, [edi]
    pxor   t0, [edi+8]
    movq   [edi+16], t0
    popad
    ret

Add Key

// perform both addition and subtraction on data
// enc should be 0 for addition or 1 for subtraction
void addkey(
    const threefish_ctx_t *c, 
    void *data, uint8_t s, 
    uint64_t enc)
{
  int i;
  uint64_t x0, x1, x2;
  uint64_t *x=(uint64_t*)data;
  
  for (i=0; i<4; i++) {
    x0 = x[i];
    x1 = c->k[(s + i) % 5];
    x2 = 0;
    
    if (i==1) x2 = c->t[s % 3];
    if (i==2) x2 = c->t[(s+1) % 3];
    if (i==3) x2 = s;
    
    // add or subtract depending on enc
    x[i] = ((x0 ^ -enc) + x1 + x2) ^ -enc;
  }
}
; void addkey(const threefish_ctx_t *c, 
;    void *data, uint8_t s, 
;    uint64_t enc)
addkey:
    pushad

    pxor    x3, x3           ; x3 = 0 
    jecxz   ak_l0
    
    pcmpeqb x3, x3           ; x3 = ~0
    dec     ecx              ; ecx = 0
ak_l0:  
    push    5
    pop     ebp  
    
    ; x0 = x[i];
    movq    x0, [esi+ecx*8]   
    
    ; x1 = c->k[(s + i) % 5];
    lea     eax, [ebx+ecx]    
    cdq    
    idiv    ebp        
    movq    x1, [edi+edx*8]
    
    dec     ebp
    dec     ebp    
    
    pxor    x2, x2    
    jecxz   ak_lx
    
    ; if (i==3) x2 = s;
    movd    x2, ebx
    cmp     cl, 3
    je      ak_lx

    mov     eax, ebx
    cdq
    cmp     cl, 1
    je      ak_lxx
    inc     eax
ak_lxx:    
    idiv    ebp
    
    ; if (i==1) x2 = c->t[s % 3];
    ; if (i==2) x2 = c->t[(s+1) % 3];    
    movq    x2, [edi+8*edx+40]
ak_lx:    
    ; x[i] = ((x0 ^ -enc) + x1 + x2) ^ -enc;
    pxor    x0, x3
    paddq   x0, x1
    paddq   x0, x2
    pxor    x0, x3
    
    movq    [esi+ecx*8], x0
    
    inc     ecx
    cmp     cl, 4
    jnz     ak_l0    
    popad
    ret

Mix Function

Although for the inverse function we’re supposed to rotate right, subtracting the rotation constant from 64 and rotating left does the same thing.

void mix(void *data, 
    uint8_t rc[], 
    int rnd, int enc)
{
  int     i;
  uint64_t *x;
  uint8_t  r;
  
  x = (uint64_t*)data;
  
  for (i=0; i<4; i += 2)
  {
    r = rc[(rnd & 7) + (i << 2)];

    if (enc==THREEFISH_DECRYPT)
    {
      x[i+1] ^= x[i];
      x[i+1]  = ROTL64(x[i+1], -(r-64));
      x[i]   -= x[i+1];      
    } else {
      x[i]   += x[i+1];
      x[i+1]  = ROTL64(x[i+1], r);
      x[i+1] ^= x[i];    
    }
  }
}

Assembly uses MMX to reduce code

; void mix(void *data, 
;    uint8_t rc[], 
;    int rnd, int enc)
mix:
    pushad
    xor    eax, eax
m_l0:    
    movq   x0, [esi+eax*8]
    movq   x1, [esi+eax*8+8]
    
    ; r = rc[(rnd & 7) + (i << 2)];
    mov    edi, edx
    and    edi, 7
    lea    edi, [edi+eax*4] 
    movzx  edi, byte[ebp+edi]
    jecxz  m_l1
    
    sub    edi, 64
    neg    edi
    
    pxor   x1, x0
    call   rotl64
    psubq  x0, x1
    jmp    m_l2
m_l1    
    paddq  x0, x1
    call   rotl64
    pxor   x1, x0
m_l2:
    movq   [esi+eax*8  ], x0    
    movq   [esi+eax*8+8], x1    
    add    al, 2
    cmp    al, 4
    jnz    m_l0
    popad
    ret

Permute

void permute(void *data)
{
  uint64_t t;
  uint64_t *x=(uint64_t*)data;
  
  t    = x[1];
  x[1] = x[3];
  x[3] = t;
}
; void permute(void *data)    
permute:
    movq   x0, [esi+1*8]
    movq   x1, [esi+3*8]

    movq   [esi+1*8], x1 
    movq   [esi+3*8], x0 
    ret

Encryption/Decryption

void threefish_encrypt(
    const threefish_ctx_t *c, 
    void *data, 
    uint32_t enc)
{
  uint8_t i, s=0, ofs=1;
  uint32_t x0, x1;
  
  uint8_t rc[16] = 
  { 14, 52, 23,  5, 25, 46, 58, 32, 
    16, 57, 40, 37, 33, 12, 22, 32};

  if (enc == THREEFISH_DECRYPT) {
    s   = 18;
    ofs = ~0;
    
    // swap rotation constants if
    // decrypting
    for (i=0; i<4; i += 2) {
      x0 = ((uint32_t*)rc)[i+0];
      x1 = ((uint32_t*)rc)[i+1];

      ((uint32_t*)rc)[i+0] = SWAP32(x1);
      ((uint32_t*)rc)[i+1] = SWAP32(x0);
    }
  }
  // apply 72 rounds
  for (i=0; i<72; i++)
  {
    // add key every 4 rounds
    if((i & 3) == 0) {
      addkey(c, data, s, enc);
      s += ofs;
    }

    // permute if decrypting
    if (enc == THREEFISH_DECRYPT) {
      permute(data);
    }
    
    // apply mixing function
    mix (data, rc, i, enc);
    
    // permute if encrypting
    if (enc == THREEFISH_ENCRYPT) {
      permute(data);
    }
  }
  // add key
  addkey(c, data, s, enc);
}
; void threefish_encrypt(
;    const threefish_ctx_t *c, 
;    void *data, 
;    uint32_t enc)
threefish_encryptx:
_threefish_encryptx
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   edi, eax    
    lodsd
    push   eax
    lodsd
    cdq
    xchg   eax, ecx
    xor    ebx, ebx
    pop    esi
    push   1
    pop    eax
    ; load rotation constants
    push   0x20160C21
    push   0x25283910
    push   0x203A2E19
    push   0x0517340E
    mov    ebp, esp
    jecxz  tf_l1
    
    neg    eax               ; ofs = -1
    mov    bl, 18            ; s   = 18
    pushad
    mov    esi, ebp
    mov    edi, ebp
tf_l0:    
    lodsd
    xchg   eax, ebx
    lodsd
    bswap  eax
    stosd
    xchg   eax, ebx
    bswap  eax
    stosd
    dec    edx
    jp     tf_l0
    popad
    ; apply 72 rounds
tf_l1:    
    ; add key every 4 rounds
    test  dl, 3
    jnz   tf_l2
    call  addkey
    add   ebx, eax
tf_l2:    
    ; permute if decrypting
    jecxz tf_l3    
    call  permute
tf_l3:    
    ; apply mixing function
    call  mix
    ; permute if encrypting
    test  ecx, ecx
    jnz   tf_l4
    call  permute
tf_l4:    
    inc   edx                 
    cmp   dl, 72
    jnz   tf_l1 
    ; add key
    call  addkey
    add   esp, 16
    popad
    ret

Summary

Using /Os /O2 flags with MSVC 2010 results in 747 bytes. The assembly version is currently 371 bytes but may shrink in future. See the full sources here.

Advertisements
This entry was posted in assembly, cryptography, encryption, programming and tagged , , . Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s