Asmcodes: Threefish-256 block cipher

Introduction

Threefish is a symmetric block cipher designed and published in 2008 by Niels Ferguson, Stefan Lucks, Bruce Schneier, Doug Whiting, Mihir Bellare, Tadayoshi Kohno, Jon Callas and Jesse Walker.

Only the 256-bit key version is presented here, which also processes 256-bit blocks of data. Threefish is an ARX (Add-Rotate-Xor) based cipher, which makes it suitable for a wide range of architectures.

This code is optimized for size so please consider another implementation if speed is required.

Key schedule

#define K(s) (((uint64_t*)key)[(s)])
#define T(s) (((uint64_t*)tweak)[(s)])

// Load the 256-bit key and 128-bit tweak into the context and
// derive the extended words: k[4] is the key parity constant
// C240 xored with all four key words, t[2] = t[0] ^ t[1].
void threefish_setkey(
    threefish_ctx_t *c,
    const void *key, 
    const void *tweak)
{
  uint64_t parity;
  uint8_t  idx;

  memcpy((void*)c->k, key,   32);
  memcpy((void*)c->t, tweak, 16);

  // key parity word: C240 ^ k[0] ^ k[1] ^ k[2] ^ k[3]
  parity = 0x1BD11BDAA9FC1A22ULL;
  for (idx = 0; idx < 4; idx++) {
    parity ^= K(idx);
  }
  c->k[4] = parity;

  // tweak parity word
  c->t[2] = T(0) ^ T(1);
}
; -----------------------------------------------------------------
; void threefish_setkey(threefish_ctx_t *c, const void *key,
;                       const void *tweak)
;
; cdecl wrapper: after PUSHAD the stack args sit at [esp+36].
; Copies the 256-bit key and 128-bit tweak into the context, then
; derives k[4] = C240 ^ k[0]^k[1]^k[2]^k[3] and t[2] = t[0]^t[1].
; NOTE(review): t0 is an MMX register alias %define'd elsewhere.
; -----------------------------------------------------------------
threefish_setkeyx:
_threefish_setkeyx:
    pushad
    lea    esi, [esp+32+4]   ; esi -> first stack argument
    lodsd
    xchg   eax, edi          ; edi = ctx   
    lodsd
    xchg   eax, esi          ; esi = key
    push   edi               ; save ctx 
    
    ; memcpy((void*)c->k, key,   32);
    push   8
    pop    ecx               ; ecx = 8 dwords
    rep    movsd
    scasd                    ; skip the 8 bytes of k[4]
    scasd                    ; edi now -> c->t
    
    ; memcpy((void*)c->t, tweak, 16);
    xchg   eax, esi          ; esi -> third argument slot (eax held it)
    lodsd                    ; eax = tweak pointer
    xchg   eax, esi          ; esi = tweak
    mov    cl, 4             ; 4 dwords
    rep    movsd

    pop    edi               ; edi = ctx (c->k)
    ; c->k[4] = 0x1BD11BDAA9FC1A22ULL;
    ; C240 kept as ebx:eax (hi:lo) and xored with k[0..3]
    mov    eax, 0xA9FC1A22
    mov    ebx, 0x1BD11BDA
    mov    cl, 4    
tf_sk:
    xor    eax, [edi]        ; fold in low dword of k[i]
    scasd
    xor    ebx, [edi]        ; fold in high dword of k[i]
    scasd
    loop   tf_sk
    
    stosd                    ; k[4] low dword
    xchg   eax, ebx
    stosd                    ; k[4] high dword; edi -> t[0]
    ; c->t[2] = t[0] ^ t[1]
    movq   t0, [edi]
    pxor   t0, [edi+8]
    movq   [edi+16], t0
    popad
    ret

Add Key

// perform both addition and subtraction on data
// enc should be 0 for addition or 1 for subtraction
// perform both addition and subtraction on data
// enc should be 0 for addition or 1 for subtraction:
// xoring with the all-ones mask before and after the add turns
// it into a two's-complement subtraction.
void addkey(
    const threefish_ctx_t *c, 
    void *data, uint8_t s, 
    uint64_t enc)
{
  uint64_t *x   = (uint64_t*)data;
  uint64_t mask = -enc;        // 0 for add, ~0 for subtract
  int      i;

  for (i = 0; i < 4; i++) {
    // subkey word, plus the tweak/round-number injections
    uint64_t sum = c->k[(s + i) % 5];

    switch (i) {
      case 1: sum += c->t[s % 3];       break;
      case 2: sum += c->t[(s + 1) % 3]; break;
      case 3: sum += s;                 break;
      default:                          break;
    }

    x[i] = ((x[i] ^ mask) + sum) ^ mask;
  }
}
; void addkey(const threefish_ctx_t *c, 
;    void *data, uint8_t s, 
;    uint64_t enc)
;
; Internal register convention:
;   edi = ctx (k[] at +0, t[] at +40), esi = data,
;   ebx = subkey index s, ecx = enc (0 = add, nonzero = subtract)
; x0..x3 are MMX register aliases %define'd elsewhere.
; Computes x[i] = ((x[i] ^ -enc) + k[(s+i)%5] + x2) ^ -enc.
addkey:
    pushad

    pxor    x3, x3           ; x3 = 0 
    jecxz   ak_l0            ; enc == 0 -> keep zero mask (add)
    
    pcmpeqb x3, x3           ; x3 = ~0 (subtract mask)
    dec     ecx              ; ecx = 0, reused as loop index i
ak_l0:  
    push    5
    pop     ebp              ; ebp = 5 (subkey modulus, reloaded each pass)
    
    ; x0 = x[i];
    movq    x0, [esi+ecx*8]   
    
    ; x1 = c->k[(s + i) % 5];
    lea     eax, [ebx+ecx]   ; eax = s + i
    cdq    
    idiv    ebp              ; edx = (s + i) % 5
    movq    x1, [edi+edx*8]
    
    dec     ebp
    dec     ebp              ; ebp = 3 (tweak modulus)
    
    pxor    x2, x2           ; x2 = 0 (nothing injected for i == 0)
    jecxz   ak_lx
    
    ; if (i==3) x2 = s;
    movd    x2, ebx
    cmp     cl, 3
    je      ak_lx

    ; i is 1 or 2: pick a tweak word instead
    mov     eax, ebx         ; eax = s
    cdq
    cmp     cl, 1
    je      ak_lxx           ; i == 1 -> use s % 3
    inc     eax              ; i == 2 -> use (s + 1) % 3
ak_lxx:    
    idiv    ebp              ; edx = eax % 3
    
    ; if (i==1) x2 = c->t[s % 3];
    ; if (i==2) x2 = c->t[(s+1) % 3];    
    movq    x2, [edi+8*edx+40]
ak_lx:    
    ; x[i] = ((x0 ^ -enc) + x1 + x2) ^ -enc;
    pxor    x0, x3
    paddq   x0, x1
    paddq   x0, x2
    pxor    x0, x3
    
    movq    [esi+ecx*8], x0
    
    inc     ecx              ; i++
    cmp     cl, 4
    jnz     ak_l0    
    popad
    ret

Mix Function

Although for the inverse function we’re supposed to rotate right, subtracting the rotation constant from 64 and rotating left does the same thing.

// Threefish MIX layer over two word pairs (x[0],x[1]) and
// (x[2],x[3]). The rotation constant comes from rc, indexed by
// the round number mod 8 plus the pair offset. Decryption undoes
// the forward MIX; rotating left by (64 - r) equals rotating
// right by r.
void mix(void *data, 
    uint8_t rc[], 
    int rnd, int enc)
{
  uint64_t *x = (uint64_t*)data;
  int      i;

  for (i = 0; i < 4; i += 2)
  {
    uint8_t r = rc[(rnd & 7) + (i << 2)];

    if (enc == THREEFISH_DECRYPT) {
      // inverse: unxor, rotate back, unadd
      x[i+1] ^= x[i];
      x[i+1]  = ROTL64(x[i+1], 64 - r);
      x[i]   -= x[i+1];
    } else {
      // forward: add, rotate, xor
      x[i]   += x[i+1];
      x[i+1]  = ROTL64(x[i+1], r);
      x[i+1] ^= x[i];
    }
  }
}

Assembly uses MMX to reduce code

; -----------------------------------------------------------------
; void mix(void *data, uint8_t rc[], int rnd, int enc)
;
; Internal register convention:
;   esi = data, ebp = rc table, edx = rnd, ecx = enc
;   (0 = encrypt, nonzero = decrypt); eax = pair index i.
; x0/x1 are MMX register aliases %define'd elsewhere; rotl64 is an
; external helper -- presumably rotates x1 left by edi bits
; (TODO confirm against its definition).
;
; Fix: label m_l1 was missing its colon (orphan-label typo).
; -----------------------------------------------------------------
mix:
    pushad
    xor    eax, eax           ; i = 0
m_l0:    
    movq   x0, [esi+eax*8]    ; x0 = x[i]
    movq   x1, [esi+eax*8+8]  ; x1 = x[i+1]
    
    ; r = rc[(rnd & 7) + (i << 2)];
    mov    edi, edx
    and    edi, 7
    lea    edi, [edi+eax*4] 
    movzx  edi, byte[ebp+edi]
    jecxz  m_l1               ; enc == 0 -> encrypt path
    
    ; decrypt: rotate left by (64 - r) == rotate right by r
    sub    edi, 64
    neg    edi
    
    pxor   x1, x0             ; x[i+1] ^= x[i]
    call   rotl64             ; x1 = ROTL64(x1, 64 - r)
    psubq  x0, x1             ; x[i]  -= x[i+1]
    jmp    m_l2
m_l1:
    paddq  x0, x1             ; x[i]  += x[i+1]
    call   rotl64             ; x1 = ROTL64(x1, r)
    pxor   x1, x0             ; x[i+1] ^= x[i]
m_l2:
    movq   [esi+eax*8  ], x0    
    movq   [esi+eax*8+8], x1    
    add    al, 2              ; i += 2
    cmp    al, 4
    jnz    m_l0
    popad
    ret

Permute

// Threefish word permutation for 256-bit blocks: exchange the
// second and fourth 64-bit words in place (words 0 and 2 stay).
void permute(void *data)
{
  uint64_t *x   = (uint64_t*)data;
  uint64_t  tmp = x[3];

  x[3] = x[1];
  x[1] = tmp;
}
; void permute(void *data)    
; Swap the 64-bit words x[1] and x[3] in place; esi = data.
; x0/x1 are MMX register aliases %define'd elsewhere, so no
; general-purpose registers are clobbered (no pushad needed).
permute:
    movq   x0, [esi+1*8]      ; x0 = x[1]
    movq   x1, [esi+3*8]      ; x1 = x[3]

    movq   [esi+1*8], x1      ; x[1] = old x[3]
    movq   [esi+3*8], x0      ; x[3] = old x[1]
    ret

Encryption/Decryption

// Encrypt or decrypt one 256-bit block in place depending on enc
// (THREEFISH_ENCRYPT / THREEFISH_DECRYPT). s indexes the subkey
// schedule and advances by ofs every 4 rounds: +1 when
// encrypting, -1 (via uint8_t wraparound of ~0) when decrypting.
void threefish_encrypt(
    const threefish_ctx_t *c, 
    void *data, 
    uint32_t enc)
{
  uint8_t i, s=0, ofs=1;
  uint32_t x0, x1;
  
  // rc[0..7]: rotation constants for the first word pair,
  // rc[8..15]: for the second pair; indexed by (round & 7)
  uint8_t rc[16] = 
  { 14, 52, 23,  5, 25, 46, 58, 32, 
    16, 57, 40, 37, 33, 12, 22, 32};

  if (enc == THREEFISH_DECRYPT) {
    s   = 18;   // start from the last subkey
    ofs = ~0;   // step the subkey index backwards
    
    // swap rotation constants if
    // decrypting: each 8-byte half of rc is byte-reversed so that
    // rc[rnd & 7] yields the constants in reverse round order
    // (NOTE(review): relies on little-endian uint32_t access)
    for (i=0; i<4; i += 2) {
      x0 = ((uint32_t*)rc)[i+0];
      x1 = ((uint32_t*)rc)[i+1];

      ((uint32_t*)rc)[i+0] = SWAP32(x1);
      ((uint32_t*)rc)[i+1] = SWAP32(x0);
    }
  }
  // apply 72 rounds
  for (i=0; i<72; i++)
  {
    // add key every 4 rounds
    if((i & 3) == 0) {
      addkey(c, data, s, enc);
      s += ofs;
    }

    // permute if decrypting (inverse order of operations)
    if (enc == THREEFISH_DECRYPT) {
      permute(data);
    }
    
    // apply mixing function
    mix (data, rc, i, enc);
    
    // permute if encrypting
    if (enc == THREEFISH_ENCRYPT) {
      permute(data);
    }
  }
  // final key addition (19th subkey when encrypting)
  addkey(c, data, s, enc);
}
; -----------------------------------------------------------------
; void threefish_encrypt(const threefish_ctx_t *c, void *data,
;                        uint32_t enc)
;
; cdecl wrapper: args at [esp+36] after PUSHAD.
; Internal registers: edi = ctx, esi = data, ecx = enc,
;   ebx = subkey index s, eax = ofs (+1 encrypt / -1 decrypt),
;   edx = round counter i, ebp -> rotation constants on the stack.
;
; Fix: label _threefish_encryptx was missing its colon.
; -----------------------------------------------------------------
threefish_encryptx:
_threefish_encryptx:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   edi, eax          ; edi = ctx
    lodsd
    push   eax               ; save data pointer
    lodsd                    ; eax = enc
    cdq                      ; edx = 0 (round counter; enc < 2^31)
    xchg   eax, ecx          ; ecx = enc
    xor    ebx, ebx          ; s = 0
    pop    esi               ; esi = data
    push   1
    pop    eax               ; ofs = 1
    ; load rotation constants (rc[16] packed as 4 dwords,
    ; lowest address = 14,52,23,5,...)
    push   0x20160C21
    push   0x25283910
    push   0x203A2E19
    push   0x0517340E
    mov    ebp, esp          ; ebp = rc
    jecxz  tf_l1             ; enc == 0 -> encrypt
    
    neg    eax               ; ofs = -1
    mov    bl, 18            ; s   = 18
    ; decryption: byte-reverse each 8-byte half of rc by swapping
    ; and BSWAPing dword pairs; two iterations of tf_l0
    pushad
    mov    esi, ebp
    mov    edi, ebp
tf_l0:    
    lodsd
    xchg   eax, ebx
    lodsd
    bswap  eax
    stosd
    xchg   eax, ebx
    bswap  eax
    stosd
    dec    edx               ; edx: 0 -> -1 -> -2
    jp     tf_l0             ; PF set for dl=0xFF, clear for 0xFE
    popad
    ; apply 72 rounds
tf_l1:    
    ; add key every 4 rounds
    test  dl, 3
    jnz   tf_l2
    call  addkey
    add   ebx, eax           ; s += ofs
tf_l2:    
    ; permute if decrypting (skipped when ecx == 0)
    jecxz tf_l3    
    call  permute
tf_l3:    
    ; apply mixing function
    call  mix
    ; permute if encrypting
    test  ecx, ecx
    jnz   tf_l4
    call  permute
tf_l4:    
    inc   edx                 
    cmp   dl, 72
    jnz   tf_l1 
    ; final key addition
    call  addkey
    add   esp, 16            ; discard rc from the stack
    popad
    ret

Summary

Using /Os /O2 flags with MSVC 2010 results in 747 bytes. The assembly version is currently 371 bytes but may shrink in future. See the full sources here.

Posted in assembly, cryptography, encryption, programming | Tagged , , | Leave a comment

Asmcodes: Half SipHash

Introduction

SipHash: a fast short-input Pseudo-Random-Function(PRF) was designed and published in 2012 by Jean-Philippe Aumasson and Daniel J. Bernstein.

Last month, Mr. Aumasson posted to the kernel-hardening mailing list a link to a “Half SipHash” implementation which is intended to boost performance. What I show here is Half SipHash but only generates 32-bit hashes.

I already looked at the original code and felt it didn’t fit well onto a 32-bit architecture. Incidentally ChasKey by Nicky Mouha is derived from SipHash, designed specifically for 32-bit architectures and provides 128-bit security.

So this code is really just half of the Half SipHash function returning 32-bits which might not be adequate for some people. The assembly code is presumably slower than a compiler generated version due to the choice of instructions.

Round function

The round function consists of simple ARX (Add-Rotate-Xor) operations which makes it ideal for almost any CPU.

Some of the more common operations on the 128-bit state before and after ARX rounds are now built into the round function itself and work depending on additional parameter: flag which is either zero or one.

After the message, key and length have been absorbed into the state, it performs the following for 32-bit hashes.

v[2] ^= 0xff;

This is simply flipping the lower 8 bits so the additional parameter to SIPROUND is either zero or one which when negated can either flip bits or have no effect at all. Xor’ing by zero does nothing but if we negate 1 and Xor the result, that is the same as performing a logical not.

Since our cROUNDS are 2 and dROUNDS for the end are 4, we can multiply last flag by 2 and add to rnds counter thus avoiding the need for any conditional testing.

// One absorption step of Half SipHash: xor the word w into v[3],
// run the ARX rounds, then xor w into v[0]. When `last` is
// nonzero this is the finalization call: the low byte of v[2] is
// flipped (the v[2] ^= 0xff step) and the round count grows from
// cROUNDS to cROUNDS + 2 (= dROUNDS).
void SIPROUND(
    uint32_t v[4], 
    uint32_t w, 
    uint8_t last)
{
  int rnds = cROUNDS + last * 2;

  v[2] ^= (uint8_t)-last;   // -0 = no-op, -1 = flip low 8 bits
  v[3] ^= w;

  while (rnds-- > 0)
  {
    v[0] += v[1];
    v[1]  = ROTL(v[1], 5);
    v[1] ^= v[0];
    v[0]  = ROTL(v[0], 16);
    v[2] += v[3];
    v[3]  = ROTL(v[3], 8);
    v[3] ^= v[2];
    v[0] += v[3];
    v[3]  = ROTL(v[3], 7);
    v[3] ^= v[0];
    v[2] += v[1];
    v[1]  = ROTL(v[1], 13);
    v[1] ^= v[2];
    v[2]  = ROTL(v[2], 16);
  }
  v[0] ^= w;
}

// Half SipHash truncated to a 32-bit result.
// data : message, inlen : message length, key : 8 bytes
// (two 32-bit words k0, k1).
// NOTE(review): reads the message 4 bytes at a time through a
// uint32_t* cast -- assumes little-endian, alignment-tolerant
// loads; confirm on the target architecture.
uint32_t hsh32(
    const uint8_t *data, 
    const size_t inlen, 
    const uint8_t *key) 
{
    uint32_t v[4];     // 128-bit internal state
    uint8_t i;
    uint32_t m;
    
    uint8_t *in = (uint8_t*)data;
    uint32_t *k = (uint32_t*)key;
    
    uint32_t k0 = k[0];
    uint32_t k1 = k[1];
    
    int left = inlen & 3;   // bytes left over after whole words
    const uint8_t *end = in + inlen - left;
    // final word carries the length in its top byte
    uint32_t b = ((uint32_t)inlen) << 24;
    
    v[0] = k0;
    v[1] = k1;
    v[2] = k0 ^ 0x6c796765;
    v[3] = k1 ^ 0x74656462;

    // absorb whole 4-byte words
    for (; in != end; in += 4) {
      m = ((uint32_t*)in)[0];
      SIPROUND(v, m, 0);
    }

    // fold the 0..3 trailing bytes into b below the length byte
    while (--left >= 0) {
      b |= ((uint32_t)in[left]) << (8 * left);   
    }

    // absorb b, then finalize (i == 1 selects the `last` path)
    for (i=0; i<2; i++) {
      SIPROUND(v, b, i);
      b = 0;
    }
    
    return v[1] ^ v[3];
}

With the above code in C, it’s much easier to implement assembly. Instead of using a flag parameter, we clear or set the Carry Flag (CF) using Clear Carry CLC or Set Carry STC. You can see in the last loop, EFLAGS register is saved with PUSHFD/POPFD and then flipped using Complement Carry CMC.

Inside the round function, we avoid flipping bits using Jump If No Carry JNC

; don't change these    
%define cROUNDS 2
%define dROUNDS 4

; state register aliases (v2 MUST stay edx -- see SIPROUND)
%define v0 ebx    
%define v1 ebp    
%define v2 edx    
%define v3 edi

; uint32_t hsh32x(const uint8_t *data, size_t inlen,
;                 const uint8_t *key)
; cdecl: args at [esp+36] after PUSHAD; the result is returned in
; eax by patching the saved-eax slot ([esp+28]) before POPAD.
; SIPROUND takes the message word in eax and CF=1 for the final
; permutation.
_hsh32x:
hsh32x:
    pushad
    lea    esi, [esp+32+ 4]
    lodsd
    push   eax               ; save data
    lodsd
    push   eax               ; save inlen
    lodsd
    xchg   eax, esi          ; esi = key 
    
    ; initialize state
    lodsd                    ; eax = k0
    mov    v0, eax           ; v0  = k0 
    mov    v2, eax           ; v2  = k0
    lodsd                    ; eax = k1
    mov    v1, eax           ; v1  = k1
    mov    v3, eax           ; v3  = k1
    xor    v2, 0x6c796765
    xor    v3, 0x74656462
    
    ; update state in 4-byte blocks
    pop    ecx               ; ecx = inlen
    pop    esi               ; esi = data 
    push   ecx               ; save inlen
    shr    ecx, 2            ; inlen /= 4 
    jz     shx_l2            ; no whole words
shx_l0:
    lodsd                    ; eax = next message word
shx_l1:
    clc                      ; CF=0: normal update
    call   SIPROUND
    loop   shx_l0
shx_l2:    
    pop    ecx               ; restore inlen
    mov    eax, ecx
    push   edx               ; save edx (holds state word v2)
    cdq                      ; edx = 0    
    shl    eax, 24           ; eax = inlen << 24
    and    ecx, 3            ; inlen &= 3 (trailing bytes)
shx_l3:
    ; gather trailing bytes: b |= in[left] << (8 * left)
    dec    ecx               ; if (--left >= 0) 
    js     shx_l4            ;   goto shx_l4
    shl    edx, 8            ; t <<= 8
    mov    dl, byte[esi+ecx] ; t |= in[left]
    jmp    shx_l3
shx_l4:
    or     eax, edx          ; b |= t
    pop    edx               ; restore edx (v2)
    clc                      ; CF=0  
shx_l5:    
    pushfd                   ; save flags
    call   SIPROUND
    popfd                    ; restore flags
    cmc                      ; CF=1 for last round
    jc     shx_l5            ; runs exactly twice: CF 0, then 1
    xor    v1, v3            ; v[1] ^= v[3]
    mov    [esp+28], v1      ; pushad_t._eax = v1
    popad
    ret

; CF=0 for normal update    
; CF=1 for final update    
;
; In:  eax = message word w, state in v0..v3.
; Normal: v3 ^= w, cROUNDS inner rounds, v0 ^= w.
; Final:  w forced to 0, round count doubled to dROUNDS, and the
;         low byte of v2 flipped (the v[2] ^= 0xff step).
SIPROUND:
    push   ecx               ; save ecx
    push   cROUNDS 
    pop    ecx               ; ecx = cROUNDS
    jnc    sr_l0             ; skip if no carry
    
    xor    eax, eax          ; w=0 if last round
    add    ecx, ecx          ; assumes: cROUNDS=2, dROUNDS=4
    not    dl                ; requires edx to be v2
sr_l0:
    xor    v3, eax           ; v3 ^= w    
sr_l1:    
    add    v0, v1            ; v[0] += v[1];
    rol    v1, 5             ; v[1] = ROTL(v[1], 5);
    xor    v1, v0            ; v[1] ^= v[0];
    rol    v0, 16            ; v[0] = ROTL(v[0], 16);
    add    v2, v3            ; v[2] += v[3]; 
    rol    v3, 8             ; v[3] = ROTL(v[3], 8); 
    xor    v3, v2            ; v[3] ^= v[2]; 
    add    v0, v3            ; v[0] += v[3];
    rol    v3, 7             ; v[3] = ROTL(v[3], 7);
    xor    v3, v0            ; v[3] ^= v[0];
    add    v2, v1            ; v[2] += v[1];
    rol    v1, 13            ; v[1] = ROTL(v[1], 13);
    xor    v1, v2            ; v[1] ^= v[2];
    rol    v2, 16            ; v[2] = ROTL(v[2], 16);
    loop   sr_l1
    xor    v0, eax           ; v0 ^= w
    pop    ecx               ; restore ecx
    ret

Summary

The code generated by MSVC compiler using /Os /O2 switches resulted in 261 bytes. The assembly version is 148 bytes. See sources here for any future updates.

Posted in assembly, cryptography, programming, security | Tagged , , , , | Leave a comment

Asmcodes: Poly1305 Message Authentication Code (MAC)

Introduction

Poly1305 is a cryptographic Message Authentication Code (MAC) designed and published in 2004 by Daniel J. Bernstein. It can be used to verify the data integrity and authenticity of a message.

Adam Langley has published details in RFC 7539 which uses ChaCha20 and Poly1305 for Authenticated Encryption Associated Data (AEAD) as an alternative to AES-GCM (Galois Counter Mode)

The reasons to use ChaCha20-Poly1305 instead of AES-GCM are not for lack of confidence in AES but due to its performance on ARM architectures that do not have hardware acceleration.

Dan’s ciphers

Many of Dan’s designs have become popular in the last number of years and there are some good observations on the reasons why in an essay titled On the Impending Crypto Monoculture by Peter Gutmann.

It’s not a case of Dan being a crypto king (although he does awesome work). IMHO the choice of using his algorithms comes down to a number of reasons.

  • Provably secure
  • Patent free
  • Low overhead in hardware and software
  • Simple and secure implementations already provided by author
  • Not tainted by government agencies

Dan’s designs are intended to be resistant to timing attacks, side channel analysis which AES was never intended to be.

CCM, EAX and GCM can be challenging to implement correctly. Poly1305 is simple with just addition, modular multiplication and therefore less prone to error.

Vlad Krasnov from Cloudflare offers a sound review of both poly1305 and chacha20 in It takes two to ChaCha (Poly)

About C code

As most of you might know, the codes presented here in all entries are optimized for size so if you want an implementation optimized for speed, this will certainly disappoint you.

As per usual with these entries, I’ll show code in C first and then x86 assembly. If you can spot ways to optimize the code for size, feel free to fork the project and submit a pull request with your name included in source code as author.

The C code here is derived from a reference implementation provided by Dan on his site except I’m following the design presented by Langley in RFC 7539 for ChaCha20. The RFC is easy to follow and if you want to better understand implementation, I highly recommend it.

Addition

The addition is performed on 8-bit bytes stored as 32-bit words. Every number added to the accumulator is followed by a final byte: 0, 1 or 252. So rather than expecting the caller to pad a buffer with zeros, poly1305_add() takes care of this.

/**********************************************
 *
 * poly1305 addition
 *
 * Adds the number in[0..inlen-1] (base-256 digits, least
 * significant first), followed by the terminating byte `last`
 * (0, 1 or 252), into the accumulator out[0..16], which stores
 * one base-256 digit per 32-bit word; carries are propagated.
 *
 * Fix: the original read *p unconditionally each iteration,
 * touching in[inlen] even when the buffer is exactly inlen bytes
 * long (out-of-bounds read). The input is now only read while
 * i < inlen.
 *
 **********************************************/
void poly1305_add(
    uint32_t out[], 
    const uint8_t in[], 
    int inlen, 
    uint8_t last)
{
  uint32_t c, i, x = 0;
  const uint8_t *p = in;

  // add in to out
  for (i=0; i<17; i++) {
    if ((int)i < inlen) {
      c = *p++;           // next input digit
    } else if ((int)i == inlen) {
      c = last;           // terminating pad byte
    } else {
      c = 0;              // past the input: add nothing
    }

    x += out[i] + c;      // digit sum plus incoming carry
    out[i] = x & 255;     // keep the low byte
    x >>= 8;              // carry out
  }
}

Even though ADC (Add with Carry) is available and in fact works fine, I've stuck with the original reference provided by Mr. Bernstein. The function expects the accumulator in EDI, the input bytes in ESI, inlen in ECX and the last byte in AL. ADC only increments p if i is less than inlen.

; ***************************************
;
; poly1305 addition
;
; Adds the base-256 number in[0..inlen-1], followed by the byte
; `last`, into the 17-digit accumulator out[] (one digit per
; 32-bit word), propagating carries.
;
; al  = last 
; ecx = inlen
; edi = out
; esi = p
;
; _ecx is the pushad_t offset of the saved ecx, i.e. inlen.
;
; ***************************************
_poly1305_addx:
poly1305_addx:
    pushad
    movzx  edx, al            ; edx = last
    xor    ebx, ebx           ; x = 0
    xor    ecx, ecx           ; ecx = 0    
    xor    ebp, ebp           ; i = 0
add_l0:
    movzx  eax, byte[esi]     ; c = *p 
    cmp    ebp, [esp+_ecx]    ; EFLAGS = (i - inlen)
    cmovz  eax, edx           ; c = (i == inlen) ? last : c;
    cmova  eax, ecx           ; c = (i  > inlen) ? 0    : c;
    adc    esi, ecx           ; p = (i  < inlen) ? p+1  : p; (ecx=0, CF from cmp)
    add    ebx, [edi]         ; x += out[i]
    add    ebx, eax           ; x += c
    mov    al, bl             ; al = x & 255 (upper 24 bits of eax already 0)
    stosd                     ; out[i] = x & 255
    shr    ebx, 8             ; x >>= 8
    inc    ebp                ; i++
    cmp    ebp, 17 
    jb     add_l0
    popad
    ret

Modular Multiplication

The modulus is 2^130-5 or 3fffffffffffffffffffffffffffffffb and Dan explains in his paper the reason for choosing this number specifically.

I chose 2^130-5 because its sparse form makes divisions particularly easy in both software and hardware. My encoding of messages as polynomials takes advantage of the gap between 2^128 and 2^130-5.

/**********************************************
 *
 * poly1305 modular multiplication
 *
 * "P" is 2^130-5 or 3fffffffffffffffffffffffffffffffb
 *
 * Computes acc = (acc * r) mod P. Both operands hold one
 * base-256 digit per 32-bit word. Digits that wrap past 2^130
 * re-enter multiplied by 5*64 = 320 (one factor of 64 accounts
 * for the shift back from digit 17, and 2^130 = 5 mod P).
 *
 **********************************************/
void poly1305_mulmod(
    uint32_t acc[17],
    const uint32_t r[17])
{
  uint32_t hr[17];
  uint32_t i, j, u;

  // snapshot the accumulator; acc[] receives the product
  memcpy(hr, acc, sizeof(hr));

  // schoolbook multiply with wrap-around reduction folded in
  for (i = 0; i < 17; i++) {
    u = 0;
    for (j = 0; j < 17; j++) {
      if (j <= i) {
        u += hr[j] * r[i - j];
      } else {
        u += hr[j] * r[i + (17 - j)] * 320;
      }
    }
    acc[i] = u;
  }

  // two carry passes; after the first, fold the bits above 2^130
  // back in multiplied by 5
  u = 0;
  for (j = 0; j < 2; j++) {
    for (i = 0; i < 16; i++) {
      u += acc[i];
      acc[i] = u & 255;
      u >>= 8;
    }
    if (j == 0) {
      u += acc[16];
      acc[16] = u & 3;        // keep 2 bits of the top digit
      u = (u >> 2) * 5;       // overflow times 5 re-enters below
    }
  }
  acc[16] += u;
}

The assembly sticks close to the original C code provided. I attempted to use a different implementation of modular multiplication but the reduction in code was negligible.

; ***************************************
;
; poly1305 modular multiplication
;
; "P" is 2^130-5 or 3fffffffffffffffffffffffffffffffb
;
; edi = acc   (NOTE(review): comment previously said esi = acc,
;              but the code below reads acc from edi)
; ebx = r
;
; Computes acc = (acc * r) mod P over 17 base-256 digits, using a
; 68-byte stack buffer hr[] for the snapshot of acc.
;
; ***************************************
poly1305_mulmodx:
_poly1305_mulmodx:
    pushad
    ; memcpy ((uint8_t*)hr, (uint8_t*)acc, 17*4); 
    push   17*4
    pop    ecx
    sub    esp, ecx            ; allocate hr[17] on the stack
    mov    esi, edi            ; esi = acc
    mov    edi, esp            ; edi = hr
    push   esi                 ; save acc
    rep    movsb
    pop    edi                 ; edi = acc
    
    ; multiply: u = sum(hr[j]*r[i-j], j<=i)
    ;             + sum(320*hr[j]*r[i+17-j], j>i)
    mov    esi, ebx            ; esi = r
    xor    ebx, ebx            ; i = 0
mm_l0:
    xor    ecx, ecx            ; j = 0
    xor    edx, edx            ; u = 0
mm_l1:
    mov    eax, [esp+ecx*4]    ; eax = hr[j]
    mov    ebp, ebx            ; ebp = i
    sub    ebp, ecx            ; ebp = i - j
    imul   eax, [esi+ebp*4]    ; eax *= r[i - j]
    add    edx, eax            ; u += eax
    inc    ecx                 ; j++
    cmp    ecx, ebx            ; j <= i
    jbe    mm_l1
mm_l2:
    push   17
    pop    ebp
    
    cmp    ecx, ebp            ; loop while j < 17
    jae    mm_l3

    sub    ebp, ecx            ; ebp = 17 - j          
    add    ebp, ebx            ; ebp += i
    
    push   64
    pop    eax
    inc    ah                  ; eax = 320 (64 + 256)
    imul   eax, [esi+ebp*4]    ; eax *= r[i+17-j]
    imul   eax, [esp+ecx*4]    ; eax *= hr[j]
    add    edx, eax            ; u += eax 
    inc    ecx                 ; j++
    jmp    mm_l2
mm_l3:
    mov    [edi+ebx*4], edx    ; acc[i] = u
    inc    ebx
    cmp    ebx, 17
    jb     mm_l0
    
    ; reduce: two carry passes; after the first, fold the bits
    ; above 2^130 back in multiplied by 5 (2^130 = 5 mod P)
    xor    ebx, ebx
    xor    edx, edx            ; u = 0
f_lx:    
    mov    cl, 16
    mov    esi, edi            ; esi = acc
    push   edi    
f_l0:
    lodsd                      ; eax = acc[i]
    add    edx, eax            ; u += acc[i]
    movzx  eax, dl             ; eax = u & 255
    stosd                      ; acc[i] = eax
    shr    edx, 8              ; u >>= 8
    loop   f_l0
    dec    ebx                 ; bl: 0xFF pass 1 (PF=1), 0xFE pass 2 (PF=0)
    jnp    f_l1                ; second pass -> done
    ; ---------------
    lodsd                      ; eax = acc[16]
    add    edx, eax            ; u += eax
    movzx  eax, dl             ; 
    and    al, 3               ; al &= 3 (keep 2 bits of top digit)
    stosd                      ; acc[16] = eax
    shr    edx, 2              ; u >>= 2
    lea    edx, [edx+edx*4]    ; u = (u >> 2) * 5
    pop    edi
    jmp    f_lx
f_l1:    
    add    [edi], edx          ; acc[16] += u; (edi -> acc[16] here)
    add    esp, 17*4 + 4       ; free hr[] plus the edi pushed at f_lx
    popad
    ret

MAC function

This is the main function called to generate 128-bit MAC of message. First r is “clamped”

I constrained r to simplify and accelerate implementations of Poly1305-AES in various contexts. A wider range of r — e.g., all 128-bit integers — would allow a quantitatively better security bound, but the current bound DL/2^106 will be perfectly satisfactory for the foreseeable future, whereas slower authenticator computations would not be perfectly satisfactory.

/**********************************************
 *
 * poly1305 mac function
 *
 * Computes the 16-byte tag of `in` under the 32-byte key k:
 * the first 16 key bytes form the clamped multiplier r, the
 * last 16 the value s added at the end. The accumulator and r
 * hold one base-256 digit per 32-bit word.
 *
 **********************************************/
void poly1305_mac (
    uint8_t *out, 
    const uint8_t *in, 
    uint32_t inlen, 
    const uint8_t *k)
{
  uint32_t i, len, neg;
  uint32_t r[17], acc[17];
  // digits 5,0,...,0 with last byte 252: adding this plus the
  // pad equals adding 2^136 - P (i.e. subtracting P mod 2^136)
  uint8_t  minusp[16]={5};
  
  // copy r
  for (i=0; i<16; i++) {
    r[i] = k[i];
  }
  // clamp r: clear top 4 bits of bytes 3,7,11,15 and the
  // bottom 2 bits of bytes 4,8,12
  r[ 3] &= 15;
  r[ 7] &= 15;
  r[11] &= 15;
  r[15] &= 15;
  
  r[ 4] &= 252;
  r[ 8] &= 252;
  r[12] &= 252;
  r[16]  = 0;

  // zero initialize accumulator
  memset ((uint8_t*)acc, 0, 17*4);

  for (;;) 
  {
    // if zero length, break
    if (inlen==0) break;
    
    // process 16 bytes or remaining
    len = inlen < 16 ? inlen : 16;
    
    // add bytes to acc (poly1305_add appends the 0x01 pad byte)
    poly1305_add (acc, in, len, 1);

    // multiply accumulator by r mod p
    poly1305_mulmod (acc, r);

    // update length and buffer position
    in    += len;
    inlen -= len;
  }

  // r is no longer needed; reuse it as a copy of acc
  memcpy (r, acc, sizeof(r));

  // acc -= P; if that borrowed (top bit of digit 16 set),
  // select the saved copy instead -- branch-free
  poly1305_add (acc, minusp, 16, 252);
  neg = -(acc[16] >> 7);
  
  for (i=0; i<17; i++) {
    acc[i] ^= neg & (r[i] ^ acc[i]);
  }
  
  // add s
  poly1305_add (acc, &k[16], 16, 0);
  
  // return tag (low byte of each digit)
  for (i=0; i<16; i++) {
    out[i] = acc[i];
  }
}

Summary

Assembly generated by MSVC 2010 with /O2 /Os flags results in 507 bytes. The assembly implementation is currently 339 bytes but may shrink in the future.

See here for future updates or if you want to contribute.

Posted in assembly, cryptography, encryption, programming, security | Tagged , , , | 1 Comment

Asmcodes: Bel-T Block Cipher

Introduction

Bel-T is a block cipher that was adopted as a standard by the Republic of Belarus in 2011 and defined in STB 34.101.31-2011. It takes 128-bits of data and encrypts using an n-bit key (where n can be 128, 192 or 256).

It uses 8 rounds of a function based on the Lai-Massey scheme also used by IDEA and IDEA-NXT. There’s a key schedule procedure however because it’s only relevant for 128 and 192-bit keys, it’s not included here. If you’re interested in the key scheduling function, you can find it in the code this version was derived from.

The C code here is derived from this code written by Philipp Jovanovic however they look very different to one another after much tweaking by myself. Essentially what I’ve done is reduce the amount of code at the expense of performance. Moreover, the C code here is unlikely to run on big endian architectures without some modification.

Structures used

I’ve assigned a number of registers for each variable while using stack for everything else. The work_t structure defines each position on stack for some variables. For example, ‘flgs’ holds copy of flags register saved with pushfd and ‘key’ points to 32-byte area of memory containing encryption/decryption key.

; Mirrors the order PUSHAD stores registers, so [esp + _reg]
; reaches a saved register after a PUSHAD.
struc pushad_t
  _edi dd ?
  _esi dd ?
  _ebp dd ?
  _esp dd ?
  _ebx dd ?
  _edx dd ?
  _ecx dd ?
  _eax dd ?
  .size:
endstruc

%define BELT_ENCRYPT 0
%define BELT_DECRYPT 1

; register aliases for the Bel-T state words
%define t   eax
%define a   ebx
%define b   ecx
%define c   edx
%define d   esi
%define e   edi
%define G   ebp

; workspace on stack
struc work_t
  flgs dd ?    ; saved EFLAGS (pushfd)
  j    dd ?    ; key-schedule index, bumped by the G function
  i    dd ?    ; round counter
  key  dd ?    ; -> 32-byte encryption/decryption key
  blk  dd ?    ; -> 128-bit data block
endstruc

For the C code, I try to instruct the compiler to use the stack for all variables, although any good compiler will likely decide itself what's best depending on parameters and target architecture. At least for my own compiler options, using structures for both key and data resulted in less code being generated.

// 32-bit type: one word viewable as 4 bytes (used for the
// per-byte sbox substitution in G)
typedef union {
    uint8_t  b[4];
    uint32_t w;
} w32_t;

// 128-bit type: one data block, addressable as bytes, words,
// or the four named state words a..d
typedef struct {
    union {
      uint8_t  v8[16];
      uint32_t w[4];
      struct {
        uint32_t a, b, c, d;
      };
    };
} w128_t;

// 256-bit type: a key, addressable as bytes, words, or eight
// named words a..h
typedef struct {
    union {
      uint8_t  v8[32];
      uint32_t w[8];
      struct {
        uint32_t a, b, c, d;
        uint32_t e, f, g, h;
      };
    };
} w256_t;

Substitution box

Simply maps 32-bits of input to 32-bits of output using fixed 256-byte array. I don’t know how the sbox was generated although if anyone does know, please respond in comments. If it’s possible to generate at runtime, it might also be possible to save additional space for this code in ROM.

// sbox: the fixed 8-bit substitution table H defined by the
// Bel-T standard STB 34.101.31 (origin of the values unknown)
const uint8_t H[256] =
{ 0xB1, 0x94, 0xBA, 0xC8, 0x0A, 0x08, 0xF5, 0x3B,
  0x36, 0x6D, 0x00, 0x8E, 0x58, 0x4A, 0x5D, 0xE4,
  0x85, 0x04, 0xFA, 0x9D, 0x1B, 0xB6, 0xC7, 0xAC,
  0x25, 0x2E, 0x72, 0xC2, 0x02, 0xFD, 0xCE, 0x0D,
  0x5B, 0xE3, 0xD6, 0x12, 0x17, 0xB9, 0x61, 0x81,
  0xFE, 0x67, 0x86, 0xAD, 0x71, 0x6B, 0x89, 0x0B,
  0x5C, 0xB0, 0xC0, 0xFF, 0x33, 0xC3, 0x56, 0xB8,
  0x35, 0xC4, 0x05, 0xAE, 0xD8, 0xE0, 0x7F, 0x99,
  0xE1, 0x2B, 0xDC, 0x1A, 0xE2, 0x82, 0x57, 0xEC,
  0x70, 0x3F, 0xCC, 0xF0, 0x95, 0xEE, 0x8D, 0xF1,
  0xC1, 0xAB, 0x76, 0x38, 0x9F, 0xE6, 0x78, 0xCA,
  0xF7, 0xC6, 0xF8, 0x60, 0xD5, 0xBB, 0x9C, 0x4F,
  0xF3, 0x3C, 0x65, 0x7B, 0x63, 0x7C, 0x30, 0x6A,
  0xDD, 0x4E, 0xA7, 0x79, 0x9E, 0xB2, 0x3D, 0x31,
  0x3E, 0x98, 0xB5, 0x6E, 0x27, 0xD3, 0xBC, 0xCF,
  0x59, 0x1E, 0x18, 0x1F, 0x4C, 0x5A, 0xB7, 0x93,
  0xE9, 0xDE, 0xE7, 0x2C, 0x8F, 0x0C, 0x0F, 0xA6,
  0x2D, 0xDB, 0x49, 0xF4, 0x6F, 0x73, 0x96, 0x47,
  0x06, 0x07, 0x53, 0x16, 0xED, 0x24, 0x7A, 0x37,
  0x39, 0xCB, 0xA3, 0x83, 0x03, 0xA9, 0x8B, 0xF6,
  0x92, 0xBD, 0x9B, 0x1C, 0xE5, 0xD1, 0x41, 0x01,
  0x54, 0x45, 0xFB, 0xC9, 0x5E, 0x4D, 0x0E, 0xF2,
  0x68, 0x20, 0x80, 0xAA, 0x22, 0x7D, 0x64, 0x2F,
  0x26, 0x87, 0xF9, 0x34, 0x90, 0x40, 0x55, 0x11,
  0xBE, 0x32, 0x97, 0x13, 0x43, 0xFC, 0x9A, 0x48,
  0xA0, 0x2A, 0x88, 0x5F, 0x19, 0x4B, 0x09, 0xA1,
  0x7E, 0xCD, 0xA4, 0xD0, 0x15, 0x44, 0xAF, 0x8C,
  0xA5, 0x84, 0x50, 0xBF, 0x66, 0xD2, 0xE8, 0x8A,
  0xA2, 0xD7, 0x46, 0x52, 0x42, 0xA8, 0xDF, 0xB3,
  0x69, 0x74, 0xC5, 0x51, 0xEB, 0x23, 0x29, 0x21,
  0xD4, 0xEF, 0xD9, 0xB4, 0x3A, 0x62, 0x28, 0x75,
  0x91, 0x14, 0x10, 0xEA, 0x77, 0x6C, 0xDA, 0x1D };

// substitute 32-bits: add the selected key word to x, push each
// of the four bytes through the sbox H, then rotate left by r
uint32_t G(uint32_t x, w256_t *key, int idx, int r) {
    w32_t u;
    int   n;

    u.w = x + key->w[idx & 7];

    for (n = 0; n < 4; n++) {
      u.b[n] = H[u.b[n]];
    }
    return ROTL32(u.w, r);
}

Since this function is called quite a lot, the address is loaded into register.

; ------------------------
    ; G function
    ;
    ; Called with x and r pushed by the caller (retn 2*4 pops
    ; them); the caller's eflags/j/key sit above them on the
    ; stack per work_t, and j is incremented in place each call.
    ; Returns ROTL32(sbox-substituted (x + key[j & 7]), r) in eax
    ; via the saved-eax slot restored by popad.
    ; ------------------------
    pushad
    mov    esi, [esp+_esp]   ; esi=esp before pushad -> ret addr
    lodsd                    ; eax=return address
    lodsd                    ; eax=x
    xchg   eax, edx          ; edx=x
    lodsd                    ; eax=r
    push   eax               ; save r
    lodsd                    ; eax=eflags
    lodsd                    ; eax=j
    inc    dword[esi-4]      ; j++ (in place, for the next call)
    xchg   eax, ebx          ; ebx=j
    lodsd                    ; eax=key
    xchg   eax, edi          ; edi=key
    and    ebx, 7            ; j &= 7
    add    edx, [edi+ebx*4]  ; x += key[j & 7]
    xchg   eax, edx          ; eax = u.w = x + key[j & 7]
    push   4
    pop    ecx               ; ecx=4 (one pass per byte)
    call   g_l0              ; pushes the sbox address as "return"
    ; sbox
    db 0xB1, 0x94, 0xBA, 0xC8, 0x0A, 0x08, 0xF5, 0x3B,
    db 0x36, 0x6D, 0x00, 0x8E, 0x58, 0x4A, 0x5D, 0xE4,
    db 0x85, 0x04, 0xFA, 0x9D, 0x1B, 0xB6, 0xC7, 0xAC,
    db 0x25, 0x2E, 0x72, 0xC2, 0x02, 0xFD, 0xCE, 0x0D,
    db 0x5B, 0xE3, 0xD6, 0x12, 0x17, 0xB9, 0x61, 0x81,
    db 0xFE, 0x67, 0x86, 0xAD, 0x71, 0x6B, 0x89, 0x0B,
    db 0x5C, 0xB0, 0xC0, 0xFF, 0x33, 0xC3, 0x56, 0xB8,
    db 0x35, 0xC4, 0x05, 0xAE, 0xD8, 0xE0, 0x7F, 0x99,
    db 0xE1, 0x2B, 0xDC, 0x1A, 0xE2, 0x82, 0x57, 0xEC,
    db 0x70, 0x3F, 0xCC, 0xF0, 0x95, 0xEE, 0x8D, 0xF1,
    db 0xC1, 0xAB, 0x76, 0x38, 0x9F, 0xE6, 0x78, 0xCA,
    db 0xF7, 0xC6, 0xF8, 0x60, 0xD5, 0xBB, 0x9C, 0x4F,
    db 0xF3, 0x3C, 0x65, 0x7B, 0x63, 0x7C, 0x30, 0x6A,
    db 0xDD, 0x4E, 0xA7, 0x79, 0x9E, 0xB2, 0x3D, 0x31,
    db 0x3E, 0x98, 0xB5, 0x6E, 0x27, 0xD3, 0xBC, 0xCF,
    db 0x59, 0x1E, 0x18, 0x1F, 0x4C, 0x5A, 0xB7, 0x93,
    db 0xE9, 0xDE, 0xE7, 0x2C, 0x8F, 0x0C, 0x0F, 0xA6,
    db 0x2D, 0xDB, 0x49, 0xF4, 0x6F, 0x73, 0x96, 0x47,
    db 0x06, 0x07, 0x53, 0x16, 0xED, 0x24, 0x7A, 0x37,
    db 0x39, 0xCB, 0xA3, 0x83, 0x03, 0xA9, 0x8B, 0xF6,
    db 0x92, 0xBD, 0x9B, 0x1C, 0xE5, 0xD1, 0x41, 0x01,
    db 0x54, 0x45, 0xFB, 0xC9, 0x5E, 0x4D, 0x0E, 0xF2,
    db 0x68, 0x20, 0x80, 0xAA, 0x22, 0x7D, 0x64, 0x2F,
    db 0x26, 0x87, 0xF9, 0x34, 0x90, 0x40, 0x55, 0x11,
    db 0xBE, 0x32, 0x97, 0x13, 0x43, 0xFC, 0x9A, 0x48,
    db 0xA0, 0x2A, 0x88, 0x5F, 0x19, 0x4B, 0x09, 0xA1,
    db 0x7E, 0xCD, 0xA4, 0xD0, 0x15, 0x44, 0xAF, 0x8C,
    db 0xA5, 0x84, 0x50, 0xBF, 0x66, 0xD2, 0xE8, 0x8A,
    db 0xA2, 0xD7, 0x46, 0x52, 0x42, 0xA8, 0xDF, 0xB3,
    db 0x69, 0x74, 0xC5, 0x51, 0xEB, 0x23, 0x29, 0x21,
    db 0xD4, 0xEF, 0xD9, 0xB4, 0x3A, 0x62, 0x28, 0x75,
    db 0x91, 0x14, 0x10, 0xEA, 0x77, 0x6C, 0xDA, 0x1D
g_l0:
    pop    ebx        ; ebx=H 
g_l1:
    xlatb             ; u.b[i] = H[u.b[i]]; (al = H[al])
    ror    eax, 8     ; u.w = ROTR(u.w, 8): next byte into al
    loop   g_l1
    pop    ecx        ; ecx=r
    rol    eax, cl    ; return ROTL32(u.w, r);
    mov    [esp+_eax], eax   ; patch saved eax for popad
    popad
    retn   2*4        ; pop the x and r arguments
    ; -----------------------

Encryption/Decryption

I’ve written this so it handles both operations based on parameter passed to the function. This makes it much more compact than having 2 separate functions.

// perform encryption and decryption based on enc parameter
// blk : 16-byte block, transformed in place
// ks  : 32-byte (256-bit) key
// enc : BELT_ENCRYPT or BELT_DECRYPT
// NOTE(review): G, XCHG and the BELT_* constants are defined outside
// this excerpt — comments below assume the usual Bel-T definitions.
void belt_encrypt(void *blk, const void *ks, int enc)
{
    w256_t   v;      // working state; only the first 16 bytes (a..d) appear used
    w256_t   key;
    uint32_t i, j, t, e;
    uint32_t *x=(uint32_t*)blk;

    // load 256-bit key into local space
    memcpy (key.v8, (uint8_t*)ks, 32);

    // if decryption, reverse the order of the eight 32-bit key words
    // (the loop swaps w[i] <-> w[7-i] for i=0..3)
    if (enc==BELT_DECRYPT) {
      for (i=0; i<4; i++) {
        XCHG (key.w[i], key.w[7-i], t);
      }
    }
    
    // load 128-bit data into local space
    memcpy (v.v8, (uint8_t*)blk, 16);

    // apply 8 rounds; j advances by 7 since each round uses 7 subkey slots
    for (i=0, j=0; i<8; i++, j += 7)
    {
      v.b ^= G(v.a,       &key, j+0, 5);
      v.c ^= G(v.d,       &key, j+1,21);
      v.a -= G(v.b,       &key, j+2,13);
      e    = G(v.b + v.c, &key, j+3,21);
      t    = i + 1;          // encryption round constant

      if (enc==BELT_ENCRYPT) 
          goto b_l3;
      
      t = (7 - i) + 1;       // decryption uses the reversed round constant
b_l3:
      e   ^= t;
      v.b += e;
      v.c -= e;
      v.d += G(v.c,     &key, j+4,13);
      v.b ^= G(v.a,     &key, j+5,21);
      v.c ^= G(v.d,     &key, j+6, 5);

      // end-of-round word rotation
      XCHG(v.a, v.b, t);
      XCHG(v.c, v.d, t);
      XCHG(v.b, v.c, t);

      if (enc==BELT_ENCRYPT)
          continue;
      
      // swap for decryption
      XCHG(v.b, v.c, t);
      XCHG(v.a, v.d, t);
    }
    // save data for encryption
    x[0] = v.b; x[1] = v.d;
    x[2] = v.a; x[3] = v.c;

    if (enc == BELT_ENCRYPT)
        return;
      
    // save data for decryption (overwrites the words stored above)
    x[0] = v.c; x[1] = v.a;
    x[2] = v.d; x[3] = v.b;
}

There might be a better way to implement this in x86 assembly although I don’t see how at the moment.

; void belt_encrypt(void *blk, const void *ks, int enc)
; cdecl: args at [esp+32+4] after PUSHAD
; NOTE(review): a, b, c, d, t, e, G, i and j are macros (register /
; stack-offset aliases) defined earlier in the article, outside this
; excerpt; the comments below assume those definitions.
belt_encryptx:
_belt_encryptx:
    pushad
    lea    esi, [esp+32+4]
    pushad            ; allocate 32-bytes for key
    mov    edi, esp
    lodsd
    xchg   ebx, eax   ; ebx=blk
    lodsd
    push   eax        ; save ks 
    lodsd             ; eax=enc
    cdq               ; edx=0
    ; memcpy (key.v8, (uint8_t*)ks, 32);
    push   32
    pop    ecx
    pop    esi        ; esi=ks
    push   edi
    rep    movsb      ; copy key to local buffer
    pop    edi
    
    push   ebx        ; save blk ptr
    push   edx        ; save i (0)
    push   edi        ; save key
    push   edx        ; save j (0)
    dec    eax        ; eax=enc-1: SF=1 only when enc==0 (BELT_ENCRYPT)
    pushfd            ; save flags; SF is re-read every round via POPFD
    js     b_l1       ; encryption: keep key order
    
    ; decryption: reverse the eight 32-bit key words
    ; (swap key.w[i] <-> key.w[7-i] for i=0..3)
    lea    esi, [edi+7*4]
    mov    cl, 4
rotate_key:
    mov    eax, [edi]  ; eax=key[i]
    xchg   eax, [esi]  ; key[7-i]=key[i], eax=old key[7-i]
    stosd              ; key[i]=old key[7-i]
    sub    esi, 4
    loop   rotate_key
b_l1:
    mov    esi, ebx   ; esi=blk
    lodsd
    xchg   a, eax     ; a = blk[0]
    lodsd
    xchg   b, eax     ; b = blk[1]
    lodsd
    xchg   c, eax     ; c = blk[2]
    lodsd
    xchg   d, eax     ; d = blk[3]
    call   b_l2       ; pushes the address of the G code placed below
    ; ------------------------
    ; G function goes here
    ; -----------------------
b_l2:
    pop    G          ; G = address of the G function above
b_l3:                 ; top of the 8-round loop
    push   5
    push   a
    call   G   
    xor    b, t       ; b ^= G(a, key, j+0, 5);
    
    push   21
    push   d
    call   G     
    xor    c, t       ; c ^= G(d, key, j+1,21);
    
    push   13
    push   b
    call   G       
    sub    a, t       ; a -= G(b, key, j+2,13);
    
    push   21
    lea    t, [b + c]
    push   t
    call   G          ; e  = G(b + c, key, j+3,21);
    xchg   t, e
    
    inc    dword[esp+i] ; i++
    
    mov    t, [esp+i] ; t = i (post-increment value)

    popfd             ; set flags (SF=1 when encrypting)
    pushfd            ; save flags
    js     b_l4
    
    sub    t, 9       ; decryption round constant:
    neg    t          ; t = (7 - i) + 1 for the pre-increment i
b_l4:
    xor    e, t      ; e   ^= t;
    add    b, e      ; v.b += e;
    sub    c, e      ; v.c -= e;
    
    push   13
    push   c
    call   G
    add    d, t      ; v.d += G(v.c, &key, j+4,13);
    
    push   21
    push   a
    call   G
    xor    b, t      ; v.b ^= G(v.a, &key, j+5,21);
    
    push   5
    push   d
    call   G
    xor    c, t      ; v.c ^= G(v.d, &key, j+6, 5);
    
    ; end-of-round word rotation
    xchg   a, b
    xchg   c, d
    xchg   b, c
    
    popfd            ; set flags
    pushfd           ; save flags
    js     b_l5      ; if (enc==BELT_ENCRYPT) 
                     ;     continue;
    xchg   b, c      ; extra swaps for decryption
    xchg   a, d
b_l5:
    cmp    dword[esp+i], 8
    jnz    b_l3
    
    popfd            ; restore flags; SF still selects enc/dec below
    pop    eax       ; remove j
    pop    eax       ; remove key
    pop    eax       ; remove i
    pop    edi       ; restore blk
    push   edi       ; save blk

    xchg   eax, b
    stosd            ; blk[0] = b
    xchg   eax, d
    stosd            ; blk[1] = d
    xchg   eax, a
    stosd            ; blk[2] = a
    xchg   eax, c
    stosd            ; blk[3] = c
    
    pop    edi       ; restore blk
    js     b_l6      ; encryption: the stores above are final
    
    ; decryption: overwrite with {c, a, d, b}
    stosd            ; blk[0] = c 
    xchg   eax, c    
    stosd            ; blk[1] = a
    xchg   eax, a
    stosd            ; blk[2] = d
    xchg   eax, d
    stosd            ; blk[3] = b
b_l6:
    popad            ; release stack for key
    popad            ; restore registers
    ret

Summary

Some people have expressed concerns about the low number of rounds but the round function itself is strong so 8 should be enough for now. It’s still bigger than AES but that’s simply because of the fixed sbox. I’ve not measured performance between AES and Bel-T.

architecture size
x86 490
x64 n/a

Please refer to here for sources, documentation and any updates in future.

Posted in assembly, cryptography, encryption, security | Tagged , , , , ,

Asmcodes: Kuznyechik “Grasshopper” Block cipher

Introduction

Kuznyechik (Grasshopper in Russian) is a 128-bit block cipher supporting 256-bit keys defined in the National Standard of the Russian Federation GOST R 34.12-2015 and RFC 7801 published in 2015 to supersede the GOST 28147-89 cipher, also referred to as Magma, that was declassified in 1994.

Kuznyechik uses an SPN (Substitution-permutation network) design the same as AES (Rijndael) and like the 128-bit key version of Rijndael uses 10 rounds. There’s Key Whitening, Linear transformation and byte substitution, design properties all used by AES finalists.

About the assembly code

Initially, I didn’t intend to implement this cipher in x86 assembly due to the overall size of code generated by C compiler. However, after removing the inverse sbox and computing these values at runtime, it resulted in significantly smaller code than Serpent and Twofish implementations which I didn’t expect. The assembly version is a joint effort between Peter Ferrie and myself resulting in a binary similar to the size of Twofish-256.

Dr. Markku-Juhani O. Saarinen deserves credit for the C code here which I’ve shamelessly copied and modified to be more compact. Markku has a knack for simplifying cryptographic algorithms and making them easier to read.

While I have derived my own implementation from code originally written by Markku, he has not endorsed the changes I’ve made.

Sbox initialization

Although the designers of Kuznyechik asserted the sbox values were chosen “randomly”, Alex Biryukov, Leo Perrin, and Aleksei Udovenko show in their paper Reverse-Engineering the S-Box of Streebog, Kuznyechik and STRIBOBr1 that the S-Boxes of both Kuznyechik and Streebog were not created pseudo-randomly but using a hidden algorithm which they were able to reverse engineer.

The following code will generate the pi array based on their research.

#include <stdio.h>
#include <stdint.h>

// gcc -std=c11 genpi.c -o genpi && ./genpi

// multiplication of a byte (as a GF(2) vector) by an 8x8 bit-matrix;
// bit 7 of 'a' corresponds to column 0 / row 0
uint8_t matmul(uint8_t a, uint8_t mat[8][8]) {
  uint8_t out = 0;
  int row, col;

  for (col = 0; col < 8; col++) {
    if ((a >> (7 - col)) & 1) {
      for (row = 0; row < 8; row++)
        out ^= (uint8_t)(mat[row][col] << (7 - row));
    }
  }
  return out;
}

// multiply in GF(2^4), reducing by x^4 + x^3 + 1 (0x19)
// https://en.wikipedia.org/wiki/Finite_field_arithmetic
uint8_t fieldmul(uint8_t a, uint8_t b) {
  uint8_t acc = 0;

  // classic shift-and-add ("Russian peasant") multiplication
  for (; b != 0; b >>= 1) {
    if (b & 1)
      acc ^= a;
    a = (a & 0x8) ? (uint8_t)((a << 1) ^ 0x19) : (uint8_t)(a << 1);
  }
  return acc;
}

// Print the Kuznyechik/Streebog pi sbox, reconstructed from the hidden
// structure reverse-engineered by Biryukov, Perrin and Udovenko.
int main(int argc, char *argv[]) {
    int i, x, y, l, r, n, z;
    
// multiplicative inverses in GF(2^4) mod x^4 + x^3 + 1 (inv[0] = 0)
uint8_t inv[16]   = 
{0x0,0x1,0xc,0x8,0x6,0xf,0x4,0xe,
 0x3,0xd,0xb,0xa,0x2,0x9,0x7,0x5};

// 4-bit lookup tables from the decomposition (see the paper)
uint8_t nu_0[16]  = 
{0x2,0x5,0x3,0xb,0x6,0x9,0xe,0xa,
 0x0,0x4,0xf,0x1,0x8,0xd,0xc,0x7};

uint8_t nu_1[16]  = 
{0x7,0x6,0xc,0x9,0x0,0xf,0x8,0x1,
 0x4,0x5,0xb,0xe,0xd,0x2,0x3,0xa};

uint8_t sigma[16] = 
{0xc,0xd,0x0,0x4,0x8,0xb,0xa,0xe,
 0x3,0x9,0x5,0x2,0xf,0x1,0x6,0x7};

uint8_t phi[16]   = 
{0xb,0x2,0xb,0x8,0xc,0x4,0x1,0xc,
 0x6,0x3,0x5,0x8,0xe,0x3,0x6,0xb};

// 8x8 bit-matrices over GF(2): input and output linear layers
uint8_t alpha[8][8] = {
    {0,0,0,0,1,0,0,0},
    {0,1,0,0,0,0,0,1},
    {0,1,0,0,0,0,1,1},
    {1,1,1,0,1,1,1,1},
    {1,0,0,0,1,0,1,0},
    {0,1,0,0,0,1,0,0},
    {0,0,0,1,1,0,1,0},
    {0,0,1,0,0,0,0,0},
};

uint8_t omega[8][8] = {
    {0,0,0,0,1,0,1,0},
    {0,0,0,0,0,1,0,0},
    {0,0,1,0,0,0,0,0},
    {1,0,0,1,1,0,1,0},
    {0,0,0,0,1,0,0,0},
    {0,1,0,0,0,1,0,0},
    {1,0,0,0,0,0,1,0},
    {0,0,0,0,0,0,0,1},
};
  
    // pi[x] = omega( core( alpha(x) ) ): linear layer, 4-bit nonlinear
    // core on the two nibbles, then another linear layer
    for(x = 0; x < 256; x++) {
        y = matmul(x, alpha);
        l = (y >> 4) & 0xf;     // high nibble
        r = y & 0xf;            // low nibble
        l = (r == 0) ? nu_0[l] : nu_1[fieldmul(l, inv[r])];
        r = sigma[fieldmul(r, phi[l])];
        y = matmul((l << 4) | r, omega);
        if ((x & 7)==0) putchar('\n');   // 8 values per output row
        printf("0x%02x, ", y);
    }
    return 0;
}

I have only assumed, based on code generated by a C compiler, that implementing this in assembly would not reduce the size of the code, so only the pi array is included in the source and only the inverse sbox is created at runtime. However, I make no claim it can’t be implemented in less than the 256 bytes which pi takes up; I just didn’t feel it was worth the effort.

kuz_init is called the same time as kuz_setkey so if you don’t call kuz_setkey first, encryption/decryption won’t work.

// The S-Box from section 5.1.1
// (a bijection over 0..255; its inverse table is built at runtime by kuz_init)
const uint8_t kuz_pi[0x100] = {
  0xFC, 0xEE, 0xDD, 0x11, 0xCF, 0x6E, 0x31, 0x16,   // 00..07
  0xFB, 0xC4, 0xFA, 0xDA, 0x23, 0xC5, 0x04, 0x4D,   // 08..0F
  0xE9, 0x77, 0xF0, 0xDB, 0x93, 0x2E, 0x99, 0xBA,   // 10..17
  0x17, 0x36, 0xF1, 0xBB, 0x14, 0xCD, 0x5F, 0xC1,   // 18..1F
  0xF9, 0x18, 0x65, 0x5A, 0xE2, 0x5C, 0xEF, 0x21,   // 20..27
  0x81, 0x1C, 0x3C, 0x42, 0x8B, 0x01, 0x8E, 0x4F,   // 28..2F
  0x05, 0x84, 0x02, 0xAE, 0xE3, 0x6A, 0x8F, 0xA0,   // 30..37
  0x06, 0x0B, 0xED, 0x98, 0x7F, 0xD4, 0xD3, 0x1F,   // 38..3F
  0xEB, 0x34, 0x2C, 0x51, 0xEA, 0xC8, 0x48, 0xAB,   // 40..47
  0xF2, 0x2A, 0x68, 0xA2, 0xFD, 0x3A, 0xCE, 0xCC,   // 48..4F
  0xB5, 0x70, 0x0E, 0x56, 0x08, 0x0C, 0x76, 0x12,   // 50..57
  0xBF, 0x72, 0x13, 0x47, 0x9C, 0xB7, 0x5D, 0x87,   // 58..5F
  0x15, 0xA1, 0x96, 0x29, 0x10, 0x7B, 0x9A, 0xC7,   // 60..67
  0xF3, 0x91, 0x78, 0x6F, 0x9D, 0x9E, 0xB2, 0xB1,   // 68..6F
  0x32, 0x75, 0x19, 0x3D, 0xFF, 0x35, 0x8A, 0x7E,   // 70..77
  0x6D, 0x54, 0xC6, 0x80, 0xC3, 0xBD, 0x0D, 0x57,   // 78..7F
  0xDF, 0xF5, 0x24, 0xA9, 0x3E, 0xA8, 0x43, 0xC9,   // 80..87
  0xD7, 0x79, 0xD6, 0xF6, 0x7C, 0x22, 0xB9, 0x03,   // 88..8F
  0xE0, 0x0F, 0xEC, 0xDE, 0x7A, 0x94, 0xB0, 0xBC,   // 90..97
  0xDC, 0xE8, 0x28, 0x50, 0x4E, 0x33, 0x0A, 0x4A,   // 98..9F
  0xA7, 0x97, 0x60, 0x73, 0x1E, 0x00, 0x62, 0x44,   // A0..A7
  0x1A, 0xB8, 0x38, 0x82, 0x64, 0x9F, 0x26, 0x41,   // A8..AF
  0xAD, 0x45, 0x46, 0x92, 0x27, 0x5E, 0x55, 0x2F,   // B0..B7
  0x8C, 0xA3, 0xA5, 0x7D, 0x69, 0xD5, 0x95, 0x3B,   // B8..BF
  0x07, 0x58, 0xB3, 0x40, 0x86, 0xAC, 0x1D, 0xF7,   // C0..C7
  0x30, 0x37, 0x6B, 0xE4, 0x88, 0xD9, 0xE7, 0x89,   // C8..CF
  0xE1, 0x1B, 0x83, 0x49, 0x4C, 0x3F, 0xF8, 0xFE,   // D0..D7
  0x8D, 0x53, 0xAA, 0x90, 0xCA, 0xD8, 0x85, 0x61,   // D8..DF
  0x20, 0x71, 0x67, 0xA4, 0x2D, 0x2B, 0x09, 0x5B,   // E0..E7
  0xCB, 0x9B, 0x25, 0xD0, 0xBE, 0xE5, 0x6C, 0x52,   // E8..EF
  0x59, 0xA6, 0x74, 0xD2, 0xE6, 0xF4, 0xB4, 0xC0,   // F0..F7
  0xD1, 0x66, 0xAF, 0xC2, 0x39, 0x4B, 0x63, 0xB6,   // F8..FF
};

// build the forward and inverse sbox tables inside the key context
void kuz_init(kuz_key_t *key)
{
  int i = 0;

  do {
    uint8_t v = kuz_pi[i];   // forward entry
    key->pi[i]     = v;
    key->pi_inv[v] = i;      // inverse: pi_inv[pi[i]] = i
  } while (++i < 256);
}

Creating the inverse sbox tables with code is smaller than including it as array.

; initialize sbox tables 
; edi = key context (key->pi, the forward 256-byte table)
; esi = key->pi_inv (inverse 256-byte table)
; dl  = 0 (loop counter), and bits 8-31 of eax must be 0 on entry
; Reached via CALL so the kuz_pi bytes that follow the CALL site are
; on the stack as the "return address" — popped into ebx below.
; The final POPAD/RET pair unwinds kuz_init's PUSHAD and returns
; straight to kuz_init's caller.
load_sbox:
    pop    ebx               ; ebx = kuz_pi (address pushed by the CALL)
sbox_loop:
    mov    al, dl            ; al = i
    xlatb                    ; al = kuz_pi[i]
    stosb                    ; key->pi[i] = al
    mov    [esi+eax], dl     ; key->pi_inv[kuz_pi[i]] = i
    inc    dl                ; i++
    jnz    sbox_loop         ; i<256 (stop when dl wraps to 0)
    popad
    ret

; void kuz_init(kuz_key_t *key)
; expects: edi = key context, edx = 0, eax bits 8-31 = 0
; (kuz_setkeyx guarantees these with CDQ before the call)
kuz_init:
    pushad
    inc    dh                ; edx=256 (requires edx==0 on entry)
    add    edi, edx          ; edi = key->pi (256 bytes into the context
                             ; — per the kuz_key_t layout, not shown here)
    lea    esi, [edi+edx]    ; esi = key->pi_inv (256 bytes after pi)
    call   load_sbox         ; CALL pushes &kuz_pi (the bytes below)
; The S-Box from section 5.1.1
kuz_pi:
  db   0xFC, 0xEE, 0xDD, 0x11, 0xCF, 0x6E, 0x31, 0x16,   ; 00..07
  db   0xFB, 0xC4, 0xFA, 0xDA, 0x23, 0xC5, 0x04, 0x4D,   ; 08..0F
  db   0xE9, 0x77, 0xF0, 0xDB, 0x93, 0x2E, 0x99, 0xBA,   ; 10..17
  db   0x17, 0x36, 0xF1, 0xBB, 0x14, 0xCD, 0x5F, 0xC1,   ; 18..1F
  db   0xF9, 0x18, 0x65, 0x5A, 0xE2, 0x5C, 0xEF, 0x21,   ; 20..27
  db   0x81, 0x1C, 0x3C, 0x42, 0x8B, 0x01, 0x8E, 0x4F,   ; 28..2F
  db   0x05, 0x84, 0x02, 0xAE, 0xE3, 0x6A, 0x8F, 0xA0,   ; 30..37
  db   0x06, 0x0B, 0xED, 0x98, 0x7F, 0xD4, 0xD3, 0x1F,   ; 38..3F
  db   0xEB, 0x34, 0x2C, 0x51, 0xEA, 0xC8, 0x48, 0xAB,   ; 40..47
  db   0xF2, 0x2A, 0x68, 0xA2, 0xFD, 0x3A, 0xCE, 0xCC,   ; 48..4F
  db   0xB5, 0x70, 0x0E, 0x56, 0x08, 0x0C, 0x76, 0x12,   ; 50..57
  db   0xBF, 0x72, 0x13, 0x47, 0x9C, 0xB7, 0x5D, 0x87,   ; 58..5F
  db   0x15, 0xA1, 0x96, 0x29, 0x10, 0x7B, 0x9A, 0xC7,   ; 60..67
  db   0xF3, 0x91, 0x78, 0x6F, 0x9D, 0x9E, 0xB2, 0xB1,   ; 68..6F
  db   0x32, 0x75, 0x19, 0x3D, 0xFF, 0x35, 0x8A, 0x7E,   ; 70..77
  db   0x6D, 0x54, 0xC6, 0x80, 0xC3, 0xBD, 0x0D, 0x57,   ; 78..7F
  db   0xDF, 0xF5, 0x24, 0xA9, 0x3E, 0xA8, 0x43, 0xC9,   ; 80..87
  db   0xD7, 0x79, 0xD6, 0xF6, 0x7C, 0x22, 0xB9, 0x03,   ; 88..8F
  db   0xE0, 0x0F, 0xEC, 0xDE, 0x7A, 0x94, 0xB0, 0xBC,   ; 90..97
  db   0xDC, 0xE8, 0x28, 0x50, 0x4E, 0x33, 0x0A, 0x4A,   ; 98..9F
  db   0xA7, 0x97, 0x60, 0x73, 0x1E, 0x00, 0x62, 0x44,   ; A0..A7
  db   0x1A, 0xB8, 0x38, 0x82, 0x64, 0x9F, 0x26, 0x41,   ; A8..AF
  db   0xAD, 0x45, 0x46, 0x92, 0x27, 0x5E, 0x55, 0x2F,   ; B0..B7
  db   0x8C, 0xA3, 0xA5, 0x7D, 0x69, 0xD5, 0x95, 0x3B,   ; B8..BF
  db   0x07, 0x58, 0xB3, 0x40, 0x86, 0xAC, 0x1D, 0xF7,   ; C0..C7
  db   0x30, 0x37, 0x6B, 0xE4, 0x88, 0xD9, 0xE7, 0x89,   ; C8..CF
  db   0xE1, 0x1B, 0x83, 0x49, 0x4C, 0x3F, 0xF8, 0xFE,   ; D0..D7
  db   0x8D, 0x53, 0xAA, 0x90, 0xCA, 0xD8, 0x85, 0x61,   ; D8..DF
  db   0x20, 0x71, 0x67, 0xA4, 0x2D, 0x2B, 0x09, 0x5B,   ; E0..E7
  db   0xCB, 0x9B, 0x25, 0xD0, 0xBE, 0xE5, 0x6C, 0x52,   ; E8..EF
  db   0x59, 0xA6, 0x74, 0xD2, 0xE6, 0xF4, 0xB4, 0xC0,   ; F0..F7
  db   0xD1, 0x66, 0xAF, 0xC2, 0x39, 0x4B, 0x63, 0xB6,   ; F8..FF

Key Whitening

Those of you familiar with previous asmcode entries should know by now the purpose of key whitening which essentially improves resistance to brute force attacks. It involves exclusive OR operation on the data being encrypted/decrypted using subkeys. It’s the exact same as AddRoundKey in AES and was first used by Ron Rivest in 1984 for DES-X.

// XOR the 128-bit round key selected by idx into the state
static void kuz_whiten(w128_t *w, kuz_key_t *key, int idx) {
  uint32_t *rk = (uint32_t*)&key->k[idx].b[0];
  int      j;

  for (j = 3; j >= 0; j--) {
    w->w[j] ^= rk[j];
  }
}

Assembly is same as that used in AES, Serpent and Twofish.

; key whitening: w->w[j] ^= key->k[round].w[j] for j=0..3
; esi = w (16-byte state)
; edi = key context (round keys assumed at offset 0)
; edx = round index
; NOTE(review): assumes the upper bits of ecx are zero on entry
; (mov cl only sets the low byte)
kuz_whiten:
    pushad
    mov    cl, 4              ; 4 dwords per 128-bit round key
    shl    edx, cl            ; edx = round * 16 (byte offset of round key)
w_l0:
    mov    eax, [edi+edx]     ; get 4 bytes of key
    xor    [esi], eax         ; xor state
    cmpsd                     ; esi += 4, edi += 4 (comparison result unused)
    loop   w_l0
    popad
    ret

Linear Transformation

Think of the linear transformation in Serpent, the mix columns operation in Rijndael or the Maximum Distance Separable code in Twofish. They all serve the same purpose which is to create diffusion.

In Diffusion the statistical structure of the plaintext is dissipated into long range statistics of the cipher text. This is achieved by having each plaintext digit affect the value of many cipher text digits. Which is equivalent to saying that each cipher text digit is affected by many plaintext digits.

// poly multiplication mod p(x) = x^8 + x^7 + x^6 + x + 1
// (0xC3 is the low byte of p); NOTE: not constant time —
// branches depend on the operands
static uint8_t kuz_mul_gf256(uint8_t x, uint8_t y)
{
  uint8_t acc = 0;

  // shift-and-add multiply over GF(2^8)
  for (; y != 0; y >>= 1) {
    if (y & 1)
      acc ^= x;
    x = (uint8_t)((x << 1) ^ ((x & 0x80) ? 0xC3 : 0x00));
  }
  return acc;
}

// linear transformation L (sect 5.1.2): 16 steps of a byte LFSR over
// GF(2^8); enc selects the forward (encrypt) or inverse (decrypt) walk
static void kuz_lt (w128_t *w, int enc) {
  int     i, j;
  uint8_t x;

  // Linear vector from sect 5.1.2
  const uint8_t kuz_lvec[16] = {
    0x94, 0x20, 0x85, 0x10, 0xC2, 0xC0, 0x01, 0xFB,
    0x01, 0xC0, 0xC2, 0x10, 0x85, 0x20, 0x94, 0x01 };

  // 16 rounds
  for (j=0; j<16; j++)
  {
    if (enc == KUZ_ENCRYPT)
    {
      // An LFSR with 16 elements from GF(2^8):
      // shift bytes up, feed the linear mix back into b[0]
      x = w->b[15]; // since lvec[15] = 1

      for (i = 14; i >= 0; i--) {
        w->b[i + 1] = w->b[i];
        x ^= kuz_mul_gf256(w->b[i], kuz_lvec[i]);
      }
      w->b[0] = x;
    } else {
      // inverse: shift bytes down, feed the mix back into b[15]
      x = w->b[0];

      for (i = 0; i < 15; i++) {
        w->b[i] = w->b[i + 1];
        x ^= kuz_mul_gf256(w->b[i], kuz_lvec[i]);
      }
      w->b[15] = x;
    }
  }
}

Peter was able to combine both operations in assembly using ECX (crypto flag).

; linear transformation L / its inverse
; esi = w (16-byte state)
; ecx = enc (0 = encrypt/forward, non-zero = decrypt/inverse)
; The CALL in kuz_lt below pushes the address of the lvec bytes,
; popped into edi here.
lt_l0:
    pop    edi               ; edi = kuz_lvec
    push   16
    pop    eax               ; 16 rounds
lt_l1:
    pushad                   ; eax (round counter) survives via popad
    mov    dl, 15            ; encrypt starts at index 15
    jecxz  lt_l2
    cdq                      ; decrypt: edx=0 (eax=16, sign bit clear)

lt_l2:
    mov    ah, [esi+edx]     ; ah = x = w->b[15 (enc) or 0 (dec)]

lt_l3:
    ; encrypt operands (i = edx-1)
    mov    bh, [edi+edx-1]     ; bh = kuz_lvec[i]
    mov    al, [esi+edx-1]     ; al = w->b[i]
    jecxz  lt_l4
    ; decrypt operands overwrite them (i = edx)
    mov    bh, [edi+edx]     ; bh = kuz_lvec[i]
    mov    al, [esi+edx+1]   ; al = w->b[i+1]

lt_l4:
    mov    [esi+edx], al     ; enc: w->b[i+1]=w->b[i]; dec: w->b[i]=w->b[i+1]
    call   kuz_mul_gf256     ; ah ^= al*bh in GF(2^8) (fused into x)
    dec    edx
    jecxz  lt_l5             ; encrypt: walk edx 15..1, store x at b[0]
    inc    edx
    inc    edx               ; decrypt: net edx+1, walk 0..14
    cmp    edx, 15           ; decrypt stops at 15, stores x at b[15]

lt_l5:
    jnz    lt_l3
    mov    [esi+edx], ah     ; w->b[0 (enc) or 15 (dec)] = x
    popad
    dec    eax               ; next of 16 rounds
    jnz    lt_l1
    popad
    ret
kuz_lt:
    pushad
    call   lt_l0             ; pushes &kuz_lvec (sect 5.1.2 vector below)
    db 0x94, 0x20, 0x85, 0x10 
    db 0xC2, 0xC0, 0x01, 0xFB
    db 0x01, 0xC0, 0xC2, 0x10
    db 0x85, 0x20, 0x94, 0x01

; poly multiplication 
; mod p(x) = x^8 + x^7 + x^6 + x + 1 (0xC3 = low byte of p)
; in : al = x, bh = y, ah = running accumulator
; out: ah ^= x*y over GF(2^8) — the product is folded into ah,
;      fusing the "x ^= kuz_mul_gf256(...)" step of kuz_lt
; clobbers: al, bh, flags
mgf_l0:
    shr    bh, 1             ; if (y & 1), y >>= 1
    jnc    mgf_l1
    xor    ah, al            ; z ^= x
mgf_l1:
    add    al, al            ; if (x & 0x80), x <<= 1
    jnc    kuz_mul_gf256
    xor    al, 0xC3          ; x ^= 0xC3 (reduce mod p)
kuz_mul_gf256:               ; entry point: loop while y != 0
    test   bh, bh            ; while (y)
    jnz    mgf_l0
    ret

Byte Substitution

The purpose of an sbox or substituion box is to obscure the relationship between key and ciphertext — Shannon’s property of confusion. Claude E. Shannon proposed ideas related to information security in his 1948 paper Communication Theory of Secrecy Systems

Confusion seeks to make a relationship between the statistics of the cipher text and the value of the encryption key as complex as possible. Thus even if the attacker can get some handle on the statistics of the cipher text, the way in which the key was used to produce that cipher text is so complex as to make it difficult to deduce the key.

// apply the forward or inverse sbox to all 16 state bytes
static void kuz_subbytes(w128_t *w, kuz_key_t *key, int enc) {
  const uint8_t *sbox;
  int           j;

  sbox = (enc == KUZ_ENCRYPT) ? key->pi : key->pi_inv;

  for (j = 15; j >= 0; j--) {
    w->b[j] = sbox[w->b[j]];
  }
}

Key scheduling

Based on Feistel Network design. It would certainly be possible to create subkeys during encryption or decryption but I wanted to keep it simple.

// key setup
// Derive the round keys from a 32-byte master key using the 32-round
// Feistel schedule of sect 5.3: k[0],k[1] come straight from the key;
// every 8 rounds the 256-bit state is stored as the next key pair.
void kuz_setkey(kuz_key_t *kuz, const uint8_t key[32])
{
  uint32_t i, j;
  w128_t   c, z;   // c = round constant C_i, z = Feistel output
  w256_t   x;      // current 256-bit key state (two 128-bit halves)

  kuz_init(kuz);   // build sbox tables first (required before use)
  
  // copy key to context (round keys k[0] and k[1])
  memcpy (&kuz->k[0].b[0], key, 32);

  // copy key to local buffer
  memcpy (&x.b[0], key, 32);
  
  for (i=1; i<=32; i++) {
    // C value: C_i = L(Vec(i))
    memset (&c.b[0], 0, 16);
    c.b[15] = i;    // load round in lsb

    kuz_lt(&c, KUZ_ENCRYPT);

    // z = low half XOR round constant
    for (j=0; j<16; j++) {
      z.b[j] = x.b[j] ^ c.b[j];
    }

    kuz_subbytes(&z, kuz, KUZ_ENCRYPT);
    kuz_lt(&z, KUZ_ENCRYPT);

    // Feistel: z ^= high half
    for (j=0; j<16; j++) {
      z.b[j] ^= x.b[16+j];
    }
    
    // swap halves: high = old low, low = F output
    memcpy (&x.b[16], &x.b[0], 16);
    memcpy (&x.b[0], &z.b[0], 16);
    
    // every 8th round, emit the next round-key pair
    // (i>>2 = 2,4,6,8 for i = 8,16,24,32)
    if ((i & 7) == 0) {
      memcpy (&kuz->k[(i >> 2)].b[0], &x.b[0], 32);
    }
  }
}

Markku’s code uses 64-bit values here which I replaced with REP MOVSB operations. Using MMX code might reduce code further and for 64-bit implementations, it would make more sense to use 64-bit registers instead of stack memory.

; key setup
; void kuz_setkey(kuz_key_t *kuz, const uint8_t key[32])
; Local buffers x (32 bytes) and c,z (16+16 bytes) are carved out of
; the stack with PUSHAD and released with POPAD at ksk_l3.
kuz_setkeyx:
_kuz_setkeyx:
    pushad
    mov    edi, [esp+32+4]   ; edi=kuz context
    mov    esi, [esp+32+8]   ; esi=key
    mov    ebp, edi          ; ebp=kuz, kept for the round-key stores
    
    push   32                ; eax=32
    pop    eax
    cdq                      ; edx=0
    call   kuz_init          ; build pi/pi_inv tables (preserves registers)
    
    xchg   eax, ecx          ; ecx=32 (byte count)
    ; copy key to context (round keys k[0], k[1])
    pushad
    rep    movsb
    popad
    ; copy 256-bit key to local buffer
    pushad                   ; allocate 32 bytes for x
    mov    edi, esp          ; edi=x
    push   edi
    rep    movsb
    pop    edi
    
    pushad                   ; allocate 32 bytes for c,z
    mov    ebx, esp          ; ebx=z
    lea    esi, [ebx+16]     ; esi=c
    ; do 32 rounds; edx = round counter i (0 on entry from CDQ)
ksk_l0:
    inc    edx
    cmp    dl, 32
    ja     ksk_l3
    
    mov    cl, 16
    pushad                   ; save x
    mov    edi, esi
    xor    eax, eax
    rep    stosb             ; memset (&c.b[0], 0, 16);
    mov    [esi+15], dl      ; c.b[15] = i;
    
    mov    edi, esi
    call   kuz_lt            ; kuz_lt(&c, KUZ_ENCRYPT); ecx=0 selects encrypt
    popad

ksk_l1:
    mov    al, [edi+ecx-1]   ; al=x.b[j]
    xor    al, [esi+ecx-1]   ; ^= c.b[j]
    mov    [ebx+ecx-1], al   ; z.b[j] = al
    loop   ksk_l1            ; leaves ecx=0 (encrypt flag for calls below)
    
    xchg   esi, ebx     ; XCHG(c, z): esi=z for the two calls
    xchg   edi, ebp     ; XCHG(x, kuz): edi=key context
    call   kuz_subbytes ; kuz_subbytes(&z, kuz, KUZ_ENCRYPT);
    call   kuz_lt       ; kuz_lt(&z, KUZ_ENCRYPT);
    xchg   esi, ebx     ; XCHG(c, z)
    xchg   edi, ebp     ; XCHG(x, kuz)

ksk_l2:
    mov    al, [edi+ecx+16] ; z.b[j] ^= x.b[16+j];
    xor    [ebx+ecx], al
    inc    ecx
    cmp    cl, 16
    jnz    ksk_l2           ; leaves ecx=16 for the copies below
    
    ; memcpy (&x.b[16], &x.b[0], 16);
    pushad
    mov    esi, edi
    add    edi, ecx
    rep    movsb
    popad
    
    ; memcpy (&x.b[0], &z.b[0], 16);
    pushad
    mov    esi, ebx
    rep    movsb
    popad
    
    test   dl, 7             ; store round keys every 8th round
    jnz    ksk_l0
    
    ; memcpy (&kuz->k[(i >> 2)].b[0], &x.b[0], 32);
    pushad
    add    ecx, ecx            ; ecx = 32 bytes
    mov    esi, edi
    lea    edi, [ebp+edx*4]    ; &kuz->k[i>>2] = kuz + i*4 bytes
    rep    movsb
    popad
    jmp    ksk_l0
ksk_l3:
    popad                    ; release c,z
    popad                    ; release x
    popad                    ; restore registers
    ret

Encryption and Decryption

// encrypt/decrypt a block - 8 bit way
// key : context prepared by kuz_setkey (round keys + sbox tables)
// blk : 16-byte block, transformed in place
// enc : selects direction; also passed down so kuz_subbytes/kuz_lt
//       pick the forward or inverse sbox/LT
void kuz_encrypt(kuz_key_t *key, void *blk, int enc)
{
  int    i, j;   // (j is unused)
  w128_t *x=(w128_t*)blk;

  if (enc==KUZ_ENCRYPT)
  {
    // rounds 0..KUZ_ROUNDS-2: whiten, substitute, mix;
    // final iteration is whitening only
    i=0;
    for (;;)
    {
      kuz_whiten(x, key, i);
      if (++i == KUZ_ROUNDS) break;
      kuz_subbytes(x, key, enc);
      kuz_lt(x, enc);
    }
  } else {
    // decryption walks the rounds backwards with the inverse steps
    i=KUZ_ROUNDS;
    for (;;)
    {
      i--;
      kuz_whiten(x, key, i);
      if (i == 0) break;
      kuz_lt(x, enc);
      kuz_subbytes(x, key, enc);
    }
  }
}

The assembly takes advantage of similarity between both encryption/decryption operations by storing pointer to functions in registers and swapping them depending on enc parameter stored in ecx.

; encrypt/decrypt a block
; stack args after PUSHAD: key ctx, blk, enc (0 appears to mean
; KUZ_ENCRYPT — the jecxz below takes the encrypt path when ecx==0)
kuz_encryptx:
_kuz_encryptx:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   eax, edi          ; edi=key
    lodsd
    push   eax               ; save blk
    lodsd
    cdq                      ; edx=0 (round index)
    xchg   eax, ecx          ; ecx=enc
    pop    esi               ; esi=blk
    call   load_func         ; pushes address of the code placed below
    ; kuz_whiten, kuz_subbytes and kuz_lt functions
    ; are placed here.
load_func:
    pop    ebx                                   ; ebx = &kuz_whiten
    lea    eax, [ebx + (kuz_lt - kuz_whiten)]    ; eax = &kuz_lt
    lea    ebp, [eax + (kuz_subbytes - kuz_lt)]  ; ebp = &kuz_subbytes
    
    jecxz  kde_l2            ; encrypt: start whitening at round 0
    xchg   eax, ebp          ; decrypt: ebp=&kuz_lt, eax=&kuz_subbytes
    mov    dl, KUZ_ROUNDS    ; decrypt counts the round index down
    jmp    kde_l1
kde_l0:
    call   ebp ; kuz_lt or kuz_subbytes
    call   eax ; kuz_subbytes or kuz_lt
kde_l1:
    dec    edx               ; decrypt: next (lower) round index
kde_l2:
    call   ebx ; kuz_whiten (round index in edx)
    inc    ecx               ; ecx stays non-zero from here on
    test   edx, edx          ; ZF=1 when the decrypt index reached 0
    loop   kde_l3            ; decrypt (ecx>0): branch on that ZF below
    inc    edx               ; encrypt: net +1 per round
    inc    edx               ; (compensates the dec at kde_l1)
    cmp    dl, KUZ_ROUNDS+1  ; encrypt: stop after KUZ_ROUNDS whitenings
kde_l3:
    jnz    kde_l0
    popad
    ret

Summary

Please refer to sources here and here for any future updates.

architecture size
x86 615
x64 n/a

I cannot say there’s any advantage to using this cipher compared to Rijndael (AES), Serpent or Twofish. The design properties of all these ciphers are very similar but Kuznyechik is slower than Rijndael. They all use similar functions but the advantage of AES (Rijndael) is that it’s likely to be more scrutinized by the cryptographic community than Kuznyechik, Serpent or Twofish. Having said all that, it’s still nice to have another block cipher people can study and learn from which has been certified by organizations other than NIST.

Posted in assembly, cryptography, encryption, programming, security | Tagged , , , , , , | Leave a comment

Asmcodes: HC-256 Stream Cipher

Introduction

HC-256 is a software-efficient stream cipher designed and written by Wu Hongjun in 2004. In 2008, it was selected for the final portfolio of eSTREAM, the stream cipher project of the European Network of Excellence for Cryptology (ECRYPT, 2004-2008).

HC-256 was designed to demonstrate that strong stream cipher can be built from nonlinear feedback function and nonlinear output function. The large secret states of the two ciphers are updated in a nonlinear way, and table lookup (with changing tables) is used in the generation of keystream.

Stack exceptions problems

One of the negative features of this cipher is the large RAM requirement when generating key dependent sboxes. It requires approx. between 16-20KB but you cannot just allocate 20KB of stack in your procedure using SUB, ADD or ENTER.

The key generating function uses the following code

; allocate ~20KB of stack for the key schedule, touching each new
; page so the OS guard-page mechanism can commit it one page at a time
    xor    ecx, ecx          ; ecx=0
    mul    ecx               ; eax=0, edx=0 (eax*0)
    
    mov    cl, 5             ; ecx=5 pages
    mov    dh, 16            ; edx=4096 (page size)
    ; allocate stack memory in 4096 byte blocks
    ; 4 x 4096 = 16384 bytes, 
    ; additional 4096 bytes just in case (not needed?)
xalloca:
    sub    esp, edx          ; subtract page size
    test   [esp], esp        ; page probe
                             ; causes pages of memory to be 
                             ; allocated via the guard page 
                             ; scheme (if possible)
    loop   xalloca           ; raises exception if 
                             ; unable to allocate

If you don’t perform the page probe after allocating the space required, an exception will occur once you try to use that address of memory. I never imagined this could be an issue… but again, it’s one of those things you need to consider when using lots of stack space.

Key Initialization

The initialization process of HC-256 consists of expanding the key and initialization vector into P and Q (similar to the message setup in SHA-256) and running the cipher 4096 steps without generating output.

Steps 4 and 5 are obviously from the SHA-256 hash algorithm.

// SHA-256 message-schedule sigma functions (used in steps 4-5 below)
#define SIG0(x)(ROTR32((x),  7) ^ ROTR32((x), 18) ^ ((x) >>  3))
#define SIG1(x)(ROTR32((x), 17) ^ ROTR32((x), 19) ^ ((x) >> 10))

// both key and iv must be 32 bytes / 256-bits!
void hc256_setkeyx(hc_ctx *c, void *kiv)
{
    uint32_t W[2560], i;
    uint32_t *pq, *wp;
    
    // 1. set counter
    c->ctr = 0;
    
    // 2. zero initialize key and iv buffers
    for (i=0; i<16; i++) {
      c->kiv[i] = 0;
    }
    
    // 3. setup the key and iv: byte-load kiv into the 16 words,
    // rotating each word left 8 bits per byte
    // NOTE(review): c->key appears to alias c->kiv (written here, read
    // as kiv in step 4) — confirm against the hc_ctx definition
    for (i=0; i<64; i++) {
      c->key[i>>2] = c->key[i>>2] | ((uint8_t*)kiv)[i];
      c->key[i>>2] = ROTL32(c->key[i>>2], 8);
    }
    
    // 4. copy 16 words to work buffer
    for (i=0; i<16; i++) { 
      W[i] = c->kiv[i];
    }

    // 5. expand buffer using SHA-256 macros
    // (the round index i is added in, unlike SHA-256 proper)
    for (i=16; i<2560; i++) {
      W[i] = SIG1(W[i-2])  + W[i-7]  + 
             SIG0(W[i-15]) + W[i-16] + i; 
    }
    
    pq = c->T;
    wp = &W[512];
    
    // 6. set the P and Q tables (1024 words each) from W[512..2559]
    for (i=0; i<2048; i++) { 
      *pq++ = *wp++;
    }
    
    // 7. run cipher 4096 iterations before generating output
    for (i=0; i<4096; i++) {
      generate(c);
    }
}
; void hc256_setkeyx(hc_ctx *c, void *kiv)
; expands the 64-byte key/iv into P and Q and clocks the cipher
; 4096 times; W lives in 5 pages of probed stack memory
hc256_setkeyx:
_hc256_setkeyx:
    pushad
    mov    edi, [esp+32+4]   ; edi=c
    mov    esi, [esp+32+8]   ; esi=kiv
    
    xor    ecx, ecx          ; ecx=0
    mul    ecx               ; eax=0, edx=0
    
    mov    cl, 5             ; ecx=5 pages
    mov    dh, 16            ; edx=4096 (page size)
    ; allocate stack memory in 4096 byte blocks
    ; 4 x 4096 = 16384 bytes, 
    ; additional 4096 bytes just in case (not needed?)
xalloca:
    sub    esp, edx          ; subtract page size
    test   [esp], esp        ; page probe
                             ; causes pages of memory to be 
                             ; allocated via the guard page scheme (if possible)
    loop   xalloca           ; raises exception if unable to allocate
    mov    ebx, esp          ; ebx=W
    
    push   edx               ; save 4096    
    push   edi               ; save ptr to c
    stosd                    ; c->ctr=0 (eax=0), edi advances to c->T
    push   edx               ; save 4096
    push   edi               ; save ptr to c->T
    push   edx               ; save 4096 (word limit for expand_key)
    
    ; 2. copy 512-bits of key/iv to workspace
    mov    cl, 64
    mov    edi, ebx          ; edi=W
    rep    movsb
    
    mov    esi, ebx          ; esi=W (used again after the expansion)
    mov    cl, 16            ; i=16, W[0..15] already filled
expand_key:
    ; eax = SIG0(W[i-15]) = ROTR(w,7) ^ ROTR(w,18) ^ (w >> 3)
    mov    eax, [edi - 15*4]
    mov    edx, eax
    mov    ebp, eax
    ror    eax, 7
    ror    edx, 18
    shr    ebp, 3
    xor    eax, edx
    xor    eax, ebp
    ; ebx = SIG1(W[i-2]) = ROTR(w,17) ^ ROTR(w,19) ^ (w >> 10)
    mov    ebx, [edi - 2*4]
    mov    edx, ebx
    mov    ebp, ebx
    ror    ebx, 17
    ror    edx, 19
    shr    ebp, 10
    xor    ebx, edx
    xor    ebx, ebp
    ; W[i] = ebx + W[i-16] + eax + w[i-7] + i
    add    eax, [edi - 16*4]
    add    ebx, [edi -  7*4]
    add    eax, ebx
    add    eax, ecx          ; + i
    stosd                    ; W[i]=eax, i++
    inc    ecx
    cmp    ecx, [esp]        ; 4096 words
                             ; NOTE(review): the C reference stops at 2560
                             ; and only W[512..2559] are consumed below, so
                             ; the extra iterations look redundant (harmless:
                             ; the 5-page buffer holds 5120 words)
    jnz    expand_key
    
    pop    ecx               ; ecx=4096
    pop    edi               ; edi=c->T
    shr    ecx, 1            ; /=2 for 2048 words (P and Q)
    add    esi, ecx          ; esi = &W[512] (+2048 bytes)
    rep    movsd             ; T[0..2047] = W[512..2559]
    
    pop    ecx               ; ecx=4096
    pop    edi               ; edi=ctx
sk_l3:
    call   hc256_generatex   ; clock the cipher, discarding output
    loop   sk_l3
    
    pop    eax              ; eax=4096
    lea    esp, [esp+eax*4] ; free 16384 bytes of stack
    add    esp, eax         ; plus the extra page (5 pages total)
    popad
    ret

Keystream Generation

At each step, one element of a table is updated and one 32-bit output is generated. An S-box is used to generate only 1024 outputs, then it is updated in the next 1024 steps. The keystream generation process of HC-256 is given below.

// output function: each byte of x selects one entry from each of the
// four 256-word quarters of T; the four entries are summed mod 2^32
uint32_t h(uint32_t T[], uint32_t x) {
  uint32_t sum = 0;
  int      i;

  for (i = 0; i < 4; i++) {
    sum += T[(i << 8) + ((x >> (i << 3)) & 255)];
  }
  return sum;
}

// HC-256 step function: updates one entry of the active table and
// returns one 32-bit keystream word.
// Steps 0..1023 of the counter update P (with Q as the S-box),
// steps 1024..2047 update Q; the counter wraps modulo 2048.
// Fix: removed unused local pointer 't'.
uint32_t generate(hc_ctx* c)
{
    uint32_t r, i, i3, i10, i12, i1023;
    uint32_t *x0, *x1;

    // 1. obtain indices based on counter (all distances mod 1024)
    i     = c->ctr     & 0x3ff;
    i3    = (i - 3)    & 0x3ff;
    i10   = (i - 10)   & 0x3ff;
    i12   = (i - 12)   & 0x3ff;
    i1023 = (i - 1023) & 0x3ff;

    // select table to update (x0) and lookup table (x1)
    if (c->ctr < 1024) {
      x0=c->P;
      x1=c->Q;
    } else {
      x0=c->Q;
      x1=c->P;
    }

    // 2. update one table word
    x0[i] = x0[i]   +
            x0[i10] +
            (ROTR32(x0[i3], 10) ^ ROTR32(x0[i1023], 23)) +
            x1[(x0[i3] ^ x0[i1023]) & 0x3ff];

    // 3. output: h() of an older entry, masked with the new word
    r = h(x1, x0[i12]) ^ x0[i];

    // 4. update counter modulo 2048
    c->ctr = (c->ctr+1) & 0x7ff;

    return r;
}
;--------------------------------------------------------------------
; uint32_t hc256_generatex(hc_ctx *c)
; HC-256 step: updates one word of the active table (P or Q) and
; produces one 32-bit keystream word.
; In:   edi = context (layout: uint32_t ctr; uint32_t P[1024]; Q[1024])
; Out:  keystream word in caller's eax (written into the pushad frame)
; All registers preserved.
;--------------------------------------------------------------------
; expects ctx in edi
hc256_generatex:
_hc256_generatex:
    pushad
    xor    edx, edx
    mov    dh, 8               ; edx = 2048
    mov    esi, edi            ; esi = c
    lodsd                      ; eax = c->ctr, esi -> c->P
    push   eax                 ; save  c->ctr
    inc    eax                 ; c->ctr++
    dec    edx                 ; edx = 2047
    and    eax, edx            ; c->ctr &= 2047
    stosd                      ; save new c->ctr
    pop    eax                 ; restore old c->ctr
    lea    edi, [esi+edx*2+2]  ; x1 = c->Q (esi + 2047*2 + 2 = esi+4096)
    shr    edx, 1              ; edx = 1023 (table index mask)
    push   edx                 ; save 1023
    cmp    eax, edx            ; c->ctr > 1023?
    jbe    gen_l0
    xchg   esi, edi            ; yes: x0 = Q, x1 = P
gen_l0:
    and    eax, edx            ; i &= 1023
    
    lea    ebx, [eax-3]        ; i3 = (i - 3) & 1023;
    and    ebx, edx
    
    lea    ecx, [eax-10]       ; i10 = (i - 10) & 1023;
    and    ecx, edx
    
    lea    ebp, [eax-12]       ; i12 = (i - 12) & 1023;
    and    ebp, edx
    mov    ebp, [esi+ebp*4]
    push   ebp                 ; save x0[i12] (input to h() below)
    
    mov    ebp, eax            ; i1023 = (i - 1023) & 1023;
    sub    ebp, edx
    and    ebp, edx
    
    push   eax                 ; save i
    mov    eax, [esi+eax*4]    ; eax  = x0[i]
    add    eax, [esi+ecx*4]    ; eax += x0[i10]
    
    mov    ebp, [esi+ebp*4]    ; ebp  = x0[i1023]
    mov    ebx, [esi+ebx*4]    ; ebx  = x0[i3]
    mov    ecx, ebx            ; ecx  = x0[i3]
    xor    ecx, ebp            ; ecx ^= x0[i1023]
    and    ecx, edx            ; ecx &= 0x3ff
    add    eax, [edi+ecx*4]    ; eax += x1[(x0[i3] ^ x0[i1023]) & 1023]
    ror    ebx, 10             ; ebx  = ROTR32(x0[i3], 10)
    rol    ebp, 9              ; ebp  = ROTR32(x0[i1023], 23) (= ROTL 9)
    xor    ebx, ebp            ;
    add    eax, ebx            ; 
    pop    ebx                 ; ebx = i
    mov    [esi+ebx*4], eax    ; eax = (x0[i] += eax)
    pop    edx                 ; edx = x0[i12]
    
    pop    ebp                 ; ebp = 1023
    inc    ebp                 ; ebp = 1024 bytes = one 256-word row of x1
    xchg   eax, ecx            ; ecx = new x0[i]
    xor    eax, eax            ; r=0
gen_l1:
    ; h(): r += x1[256*k + byte_k(x0[i12])] for each byte k
    movzx  ebx, dl
    add    eax, [edi+ebx*4]
    add    edi, ebp            ; x1 += 256 words (next table row)
    shr    edx, 8              ; w1 >>= 8
    jnz    gen_l1              ; NOTE(review): exits as soon as the remaining
                               ; bytes are all zero, skipping those lookups -
                               ; differs from the reference h(); confirm intended
    
    xor    eax, ecx            ; r ^= x0[i];
    mov    [esp+_eax], eax     ; return r (_eax = saved-eax slot in pushad frame)
    popad
    ret

Encryption and Decryption

The keystream is XORed with the message for encryption. The decryption is to XOR the keystream with the ciphertext.

// Encrypt or decrypt 'inlen' bytes in place; decryption is the same
// operation as encryption (xor with the keystream).
void hc256_crypt(hc_ctx *c, void *in, uint32_t inlen)
{
  uint8_t  *p = (uint8_t*)in;
  uint32_t idx = 0, w;

  while (idx < inlen) {
    // draw one 32-bit keystream word
    w = generate(c);
    // consume up to 4 keystream bytes, stopping at end of message
    do {
      p[idx++] ^= (w & 255);
      w >>= 8;
    } while (idx < inlen && w != 0);
  }
}
;--------------------------------------------------------------------
; void hc256_cryptx(hc_ctx *c, void *in, uint32_t len)
; Encrypts/decrypts the buffer in place by xoring with keystream.
; cdecl: args at [esp+4..12]; all registers preserved.
;--------------------------------------------------------------------
hc256_cryptx:
_hc256_cryptx:
    pushad
    lea    esi, [esp+32+4]   ; esi -> first stack argument
    lodsd
    xchg   edi, eax          ; edi=ctx (hc256_generatex expects it here)
    lodsd
    xchg   edx, eax          ; edx=in
    lodsd
    xchg   ecx, eax          ; ecx=len
hc_l0:                       ; .repeat
    jecxz  hc_l2             ; .break .if ecx == 0
    call   hc256_generatex   ; eax = next 32-bit keystream word
hc_l1:
    xor    [edx], al         ; *in ^= (w0 & 0xFF)
    inc    edx               ; in++
    shr    eax, 8            ; w0 >>= 8 (sets ZF once word is used up)
    loopnz hc_l1             ; .while ecx != 0 && eax != 0
    jmp    hc_l0
hc_l2:
    popad
    ret

Test Vector results

HC256 test #1 - OK
5b, 07, 89, 85, d8, f6, f3, 0d,
42, c5, c0, 2f, a6, b6, 79, 51,
53, f0, 65, 34, 80, 1f, 89, f2,
4e, 74, 24, 8b, 72, 0b, 48, 18,

HC256 test #2 - OK
af, e2, a2, bf, 4f, 17, ce, e9,
fe, c2, 05, 8b, d1, b1, 8b, b1,
5f, c0, 42, ee, 71, 2b, 31, 01,
dd, 50, 1f, c6, 0b, 08, 2a, 50,

HC256 test #3 - OK
1c, 40, 4a, fe, 4f, e2, 5f, ed,
95, 8f, 9a, d1, ae, 36, c0, 6f,
88, a6, 5a, 3c, c0, ab, e2, 23,
ae, b3, 90, 2f, 42, 0e, d3, a8,

Summary

architecture size
x86 272
x64 n/a

Since it was chosen for the final portfolio of eSTREAM, it’s likely to have undergone extensive cryptanalysis. It’s unpatented, suitable for 32-bit CPUs and the assembly output of hc256x.c is approx. 454 bytes which is just a little bigger than ChaCha20 (424 bytes) a close relative of Salsa20 which was also selected for the final portfolio.

Please refer to here for sources, documentation and any future updates that may not be shown here.

Known attacks

Improved Distinguishing Attacks on HC-256 by Gautham Sekar and Bart Preneel
A Cache Timing Analysis of HC-256 by Erik Zenner

Posted in assembly, cryptography, encryption, programming, security | Tagged , , , , | 1 Comment

Asmcodes: Rabbit Stream Cipher

Introduction

Rabbit is a stream cipher designed in 2003 by Martin Boesgaard, Mette Vesterager, Thomas Pedersen, Jesper Christiansen and Ove Scavenius. It was published in 2005 and selected as a software cipher for the eSTREAM portfolio in 2008.

It’s designed for high performance in software implementations but is also known to work well in hardware environments. Both key setup and encryption are very fast, making the algorithm particularly suited for all applications where large amounts of data or large numbers of data packages have to be encrypted.

Examples include, but are not limited to, server-side encryption, multimedia encryption, hard-disk encryption, and encryption on limited-resource devices. It is currently not known to exhibit any weaknesses and so I decided to implement it in x86 assembly after much tweaking of original C code.

A description by the authors can be found in RFC4503

Technically, Rabbit consists of a pseudorandom bitstream generator that takes a 128-bit key and a 64-bit initialization vector (IV) as input and generates a stream of 128-bit blocks. Encryption is performed by combining this output with the message, using the exclusive-OR operation. Decryption is performed in exactly the same way as encryption.

About the assembly code

This was a joint effort between Peter Ferrie and myself.

It didn’t initially appear like it would be as compact compared with Salsa20 or HC-256 which are also from the eSTREAM portfolio.

However, the only other cipher from eSTREAM software portfolio was SOSEMANUK which is based on Serpent and it seemed pointless implementing a stream cipher based on a block cipher which can already be used with CTR mode to work as a stream cipher.

Optimization trick with PUSHAD/POPAD

Although this has nothing to do with the cipher itself, I think it’s worth noting. While writing the assembly code for RABBIT, I accidentally “discovered” something that I didn’t realize was possible with the PUSHAD instruction. It may only work on Windows so if you use it in your own code, careful consideration should be given to whether or not it will work across all operating systems running on x86 architecture and not just Windows. UPDATE: Peter Ferrie has confirmed this is a feature of the CPU itself and will work on Linux also.

When PUSHAD is executed, it saves all the general purpose registers including ESP on the stack. The total stack space occupied for this operation is 32-bytes. But what I didn’t know is that it’s possible to overwrite ESP and when POPAD is executed, the original value of ESP remains unaffected.

“So what?” you may say… but think about it. This is a neat way to allocate and release 32-bytes of memory without worrying about overwriting ESP.

If the original value of ESP were not restored, only 12 bytes would be available, but that’s still useful for some code. So instead of using SUB, ADD or ENTER..etc to allocate 32-bytes or more in future, consider using PUSHAD since it doesn’t appear to matter if ESP is overwritten.

Try it for yourself with the following code:

;
    pushad                   ; save all registers
    xor    eax, eax          ; eax=0
    push   8                 ; overwrite 8 GP registers
    pop    ecx
    mov    edi, esp
    rep    stosd             ; memset(edi, 0, 32);
    popad                    ; like "magic", ESP is restored

I’ve updated other sources to use this “trick” and they all work fine.

It is used here in RABBIT_setiv and RABBIT_crypt

Key setup

// Key setup: expand a 128-bit key into the master instance c->m
// (8 state words x[], 8 counter words c[], carry bit), iterate the
// next-state function four times, fold the state back into the
// counters, then clone the master instance into the work instance.
// key must point to 16 bytes.
void RABBIT_setkey(RABBIT_ctx *c, const void *key)
{
   uint32_t k0, k1, k2, k3, i;
   rabbit_blk *k=(rabbit_blk*)key;

   // Generate four subkeys
   k0 = k->d[0];
   k1 = k->d[1];
   k2 = k->d[2];
   k3 = k->d[3];

   // Generate initial state variables:
   // even words get the subkeys, odd words merge halves of two subkeys
   c->m.x[0] = k0;
   c->m.x[2] = k1;
   c->m.x[4] = k2;
   c->m.x[6] = k3;

   c->m.x[1] = U32V(k3<<16) | (k2>>16);
   c->m.x[3] = U32V(k0<<16) | (k3>>16);
   c->m.x[5] = U32V(k1<<16) | (k0>>16);
   c->m.x[7] = U32V(k2<<16) | (k1>>16);

   // Generate initial counter values:
   // even counters are half-rotated subkeys
   c->m.c[0] = ROTL32(k2, 16);
   c->m.c[2] = ROTL32(k3, 16);
   c->m.c[4] = ROTL32(k0, 16);
   c->m.c[6] = ROTL32(k1, 16);

   // odd counters merge the high half of one subkey with the low half of the next
   c->m.c[1] = (k0&0xFFFF0000) | (k1&0xFFFF);
   c->m.c[3] = (k1&0xFFFF0000) | (k2&0xFFFF);
   c->m.c[5] = (k2&0xFFFF0000) | (k3&0xFFFF);
   c->m.c[7] = (k3&0xFFFF0000) | (k0&0xFFFF);

   // Clear carry bit
   c->m.carry = 0;

   // Iterate the system four times
   for (i=0; i<4; i++) {
      RABBIT_next_state(&c->m);
   }

   // Modify the counters so they depend on the state
   for (i=0; i<8; i++) {
      c->m.c[i] ^= c->m.x[(i+4) & 7];
   }

   // Copy master instance to work instance
   memcpy (&c->w.x[0], &c->m.x[0], sizeof(RABBIT_state));
}

This assembly…

;--------------------------------------------------------------------
; void RABBIT_setkeyx(RABBIT_ctx *c, const void *key)
; Expands a 128-bit key into the master instance c->m (x[8] state
; words, c[8] counters, carry), iterates the next-state function four
; times, folds the state back into the counters, then clones c->m
; into the work instance c->w at offset 68.
; cdecl: args at [esp+4..8]; all registers preserved (pushad/popad).
;--------------------------------------------------------------------
; Key setup
RABBIT_setkeyx:
_RABBIT_setkeyx:
    pushad
    mov    edi, [esp+32+4]   ; edi = ctx (c->m at offset 0)
    mov    esi, [esp+32+8]   ; esi = key
    ; ---------------------
    ; Generate four subkeys
    ; after this: eax=k0, ebx=k1, ecx=k2, edx=k3
    ; ---------------------
    lodsd
    xchg   eax, edx          ; edx = k0 (temporarily)
    lodsd
    xchg   eax, ebx          ; ebx = k1
    lodsd
    xchg   eax, ecx          ; ecx = k2
    lodsd
    xchg   eax, edx          ; eax = k0, edx = k3
    ; --------------------------------
    ; Generate initial state variables
    ; (inner pushad: popad below restores edi = ctx)
    ; --------------------------------
    pushad
    stosd                    ; c->m.x[0] = k0

    ; c->m.x[1] = U32V(k3<<16) | (k2>>16);
    mov    ebp, edx
    shld   ebp, ecx, 16      ; ebp = (k3<<16)|(k2>>16)
    xchg   ebp, eax          ; eax = x[1], ebp = k0
    stosd

    xchg   ebx, eax          ; eax = k1
    stosd                    ; c->m.x[2] = k1

    ; c->m.x[3] = U32V(k0<<16) | (k3>>16);
    mov    ebx, ebp
    shld   ebx, edx, 16
    xchg   ebx, eax
    stosd

    xchg   ecx, eax          ; eax = k2
    stosd                    ; c->m.x[4] = k2

    ; c->m.x[5] = U32V(k1<<16) | (k0>>16);
    mov    ecx, ebx
    shld   ecx, ebp, 16
    xchg   ecx, eax
    stosd

    xchg   edx, eax          ; eax = k3
    stosd                    ; c->m.x[6] = k3

    ; c->m.x[7] = U32V(k2<<16) | (k1>>16);
    mov    edx, ecx
    shld   edx, ebx, 16
    xchg   edx, eax
    stosd
    
    ; -------------------------------
    ; Generate initial counter values
    ; at this point: ebp=k0, ebx=k1, ecx=k2, edx=k3
    ; -------------------------------
    rol    ecx, 16           ; c->m.c[0] = ROTL32(k2, 16)
    xchg   ecx, eax
    stosd

    ; c->m.c[1] = (k0&0xFFFF0000) | (k1&0xFFFF);
    xchg   ebp, eax         ; eax = k0
    xchg   bx, ax           ; merge k1.lo into eax low half
    stosd

    rol    edx, 16          ; c->m.c[2] = ROTL32(k3, 16)
    xchg   edx, eax
    stosd

    ; c->m.c[3] = (k1&0xFFFF0000) | (k2&0xFFFF);
    rol    ecx, 16          ; rebuild [k1.hi:k2.lo] from saved halves
    xchg   ecx, eax
    stosd

    xchg   edx, eax         ; c->m.c[4] = ROTL32(k0, 16)
    xchg   bx, ax           ; reassemble k0 from the two saved halves
    rol    eax, 16
    stosd

    ; c->m.c[5] = (k2&0xFFFF0000) | (k3&0xFFFF);
    xchg   ecx, eax         ; k3.lo
    xchg   bp, ax
    rol    eax, 16
    stosd

    xchg   edx, eax         ; c->m.c[6] = ROTL32(k1, 16)
    xchg   bx, ax
    rol    eax, 16
    stosd

    ; c->m.c[7] = (k3&0xFFFF0000) | (k0&0xFFFF);
    xchg   ecx, eax         ; k0.lo
    xchg   bp, ax
    rol    eax, 16
    stosd
    
    xor    eax, eax
    stosd                   ; c->m.carry = 0
    popad                   ; edi = ctx again

    ; -----------------------------
    ; Iterate the system four times
    ; -----------------------------
    push   4
    pop    ecx
rsk_l0:
    push   edi              ; RABBIT_next_state(&c->m)
    call   RABBIT_next_state
    pop    edi
    loop   rsk_l0
    
    ; -------------------
    ; Modify the counters
    ; -------------------
    xchg   eax, ecx         ; eax = 0 (the loop above left ecx = 0)
    cdq                     ; edx = 0
rsk_l1:
    ; c->m.c[i] ^= c->m.x[(i+4) & 7];
    lea    ecx, [eax+edx+4] ; (edx = 0 here)
    and    ecx, 7
    mov    ebx, [edi+ecx*4]
    xor    [edi+eax*4+32], ebx ; counters live at offset 32
    inc    eax
    cmp    al, 8
    jne    rsk_l1
    
    ; -------------------------------------
    ; Copy master instance to work instance
    ; -------------------------------------
    mov    cl, 68           ; 68 = sizeof(RABBIT_state): 8x + 8c + carry
    mov    esi, edi
    add    edi, ecx         ; c->w is at offset 68
    rep    movsb
    popad
    ret

IV setup

// IV setup: derive the per-message work instance c->w from the
// master instance c->m and a 64-bit IV.  Must be called after
// RABBIT_setkey (which fills c->m and c->w.carry).
void RABBIT_setiv(RABBIT_ctx *c, const void* iv)
{
   rabbit_blk *v=(rabbit_blk*)iv;
   uint32_t   sv[4];        // four 32-bit subvectors derived from the IV
   int        i;

   // Generate four subvectors
   sv[0] = v->d[0];
   sv[2] = v->d[1];

   sv[1] = (sv[0]>>16) | (sv[2]&0xFFFF0000);
   sv[3] = (sv[2]<<16) | (sv[0]&0x0000FFFF);

   // Modify counter values: work counters = master counters ^ IV words
   for (i=0; i<8; i++) {
      c->w.c[i] = c->m.c[i] ^ sv[i & 3];
   }

   // Copy state variables but not carry flag
   memcpy (&c->w.x[0], &c->m.x[0], 32);

   // Iterate the system four times
   for (i=0; i<4; i++) {
      RABBIT_next_state(&c->w);
   }
}

Assembly…

;--------------------------------------------------------------------
; void RABBIT_setivx(RABBIT_ctx *c, const void *iv)
; Derives four subvectors from the 64-bit IV, xors them into the
; master counters to form the work counters, copies the master state
; words, then iterates the work instance four times.
; The sv[] array lives inside a PUSHAD frame: POPAD ignores the saved
; ESP slot, so [esp+4] and [esp+12] can be overwritten freely.
; cdecl; all registers preserved.
;--------------------------------------------------------------------
; IV setup
RABBIT_setivx:
_RABBIT_setivx:
    ; save registers
    pushad
    mov    edx, [esp+32+4]   ; ctx
    mov    esi, [esp+32+8]   ; iv
    ; Generate four subvectors
    lodsd
    xchg   edi, eax          ; sv[0] = v->d[0];
    lodsd
    xchg   ebp, eax          ; sv[2] = v->d[1];

    pushad                   ; create sv variable (32 bytes):
                             ; [esp]=edi=sv[0], [esp+8]=ebp=sv[2]

    ror    edi, 16           ; sv[0] >> 16
    
    xchg   ebp, eax          ; eax = sv[2]
    xchg   di, ax            ; eax = (sv[0]>>16) | (sv[2]&0xFFFF0000)
    mov    [esp+4], eax      ; sv[1]
    
    ror    edi, 16           ; edi = (sv[2]<<16) | (sv[0]&0x0000FFFF)
    mov    [esp+12], edi     ; sv[3] (saved-ESP slot; POPAD skips it)
    ; Modify counter values
    xor    ecx, ecx
rsv_l0:
    mov    eax, [edx+ecx*4+32]   ; eax = c->m.c[i] (m.c at offset 32)
    mov    ebx, ecx
    and    ebx, 3
    xor    eax, [esp+ebx*4]      ; eax ^= sv[i & 3]
    mov    [edx+ecx*4+100], eax  ; c->w.c[i] = eax (68 + 32)
    inc    ecx
    cmp    cl, 8
    jnz    rsv_l0
    ; Copy master state variables to work (ecx = 8 dwords = 32 bytes)
    pushad
    mov    esi, edx
    lea    edi, [esi+68]         ; c->w.x[0] at offset 68
    rep    movsd
    popad
    ; Iterate the system four times
    mov    cl, 4
rsv_l1:
    lea    eax, [edx+68]
    push   eax
    call   RABBIT_next_state ; RABBIT_next_state(&c->w);
    pop    eax
    loop   rsv_l1
    
    ; release sv of 32 bytes
    popad
    ; restore registers
    popad
    ret

Encryption/Decryption

// Encrypt or decrypt 'inlen' bytes in place (xor with keystream).
// Each iteration of the next-state function yields one 128-bit
// keystream block; the final block may be used only partially.
// Fix: removed unused local 'len'.
void RABBIT_crypt(RABBIT_ctx *c, void* input, uint32_t inlen)
{
   uint32_t   i;
   rabbit_blk x;
   uint8_t    *in=(uint8_t*)input;

   while (inlen != 0)
   {
     // update working state
     RABBIT_next_state(&c->w);

     // extract keystream block from the even state words
     for (i=0; i<4; i++) {
       x.d[i] = c->w.x[i<<1];
     }

     // mix in halves of the odd state words
     x.d[0] ^= (c->w.x[5]>>16) ^ (c->w.x[3]<<16);
     x.d[1] ^= (c->w.x[7]>>16) ^ (c->w.x[5]<<16);
     x.d[2] ^= (c->w.x[1]>>16) ^ (c->w.x[7]<<16);
     x.d[3] ^= (c->w.x[3]>>16) ^ (c->w.x[1]<<16);

     // xor up to 16 keystream bytes with the message
     for (i=0; i<16 && inlen!=0; i++) {
       *in++ ^= x.b[i];
       inlen--;
     }
   }
}

Assembly

;--------------------------------------------------------------------
; void RABBIT_cryptx(RABBIT_ctx *c, void *input, uint32_t inlen)
; Encrypts/decrypts the buffer in place by xoring with the keystream.
; Each round of the next-state function yields 16 keystream bytes,
; built in a 32-byte PUSHAD-allocated scratch area (only 16 used).
; cdecl; all registers preserved.
;--------------------------------------------------------------------
; encrypt/decrypt a message of any size
RABBIT_cryptx:
_RABBIT_cryptx:
    pushad                   ; save registers
    lea    esi, [esp+32+4]   ; esi = parameters
    pushad                   ; alloc 32-bytes on stack for x
    lodsd
    xchg   ebx, eax          ; ebx=c
    lodsd
    xchg   ecx, eax          ; ecx=input
    lodsd
    xchg   ecx, eax          ; ecx=inlen, eax=input
    xchg   eax, esi          ; esi=input
    add    ebx, 68           ; ebx=&c->w.x[0]
rc_l0:
    jecxz  rc_l3             ; break if ecx==0
    push   ebx
    call   RABBIT_next_state ; RABBIT_next_state(&c->w)
    pop    eax
    
    xor    eax, eax          ; i = 0
    cdq                      ; edx = 0 (restored by popad below, then dl=16)
    mov    edi, esp          ; edi = x (scratch area)
    pushad
    ; for (i=0; i<4; i++) {
    ;   x.d[i] = c->w.x[i<<1];
    ; }
rc_l1:
    mov    ecx, [ebx+eax*8]
    mov    [edi+eax*4], ecx
    inc    eax
    cmp    al, 4
    jnz    rc_l1
    
    mov    esi, [ebx+1*4]    ; esi = c->w.x[1]
    mov    ecx, [ebx+3*4]    ; ecx = c->w.x[3]
    mov    edx, [ebx+5*4]    ; edx = c->w.x[5]
    mov    ebp, [ebx+7*4]    ; ebp = c->w.x[7]
    mov    eax, ecx          ; keep x[3] for the last line
    ; x.d[0] ^= (c->w.x[5]>>16) ^ (c->w.x[3]<<16);
    shld   ecx, edx, 16  ; (x3 << 16) | (x5 >> 16)
    xor    [edi+4*0], ecx
    ; x.d[1] ^= (c->w.x[7]>>16) ^ (c->w.x[5]<<16);
    shld   edx, ebp, 16
    xor    [edi+4*1], edx
    ; x.d[2] ^= (c->w.x[1]>>16) ^ (c->w.x[7]<<16);
    shld   ebp, esi, 16
    xor    [edi+4*2], ebp
    ; x.d[3] ^= (c->w.x[3]>>16) ^ (c->w.x[1]<<16);
    shld   esi, eax, 16
    xor    [edi+4*3], esi
    popad                    ; restore ecx=inlen, esi=in, edx=0
    
    ; for (i=0; i<16 && inlen!=0; i++) {
    ;   *in++ ^= x.b[i];
    ;   inlen--;
    ; }
    mov    dl, 16            ; do 16 bytes or remaining
rc_l2:
    mov    al, [edi]         ; al = x.b[i]
    inc    edi
    xor    [esi], al         ; *in ^= al
    inc    esi               ; in++
    dec    edx               ; i-- (sets ZF when block used up)
    loopnz rc_l2             ; break if --len==0 or i==0
    inc    ecx               ; cancel out the extra dec from LOOP below
    loop   rc_l0             ; continue while bytes remain
rc_l3:
    popad                    ; free stack
    popad                    ; restore registers
    ret

Summary

architecture size
x86 458
x64 n/a

Please refer to here and here for sources, documentation and any future updates.

Salsa20 which was also chosen for eSTREAM portfolio, ChaCha20 and HC-256 are all significantly smaller than Rabbit and probably just as secure since they support 256-bit keys. I have not tested each algorithm for speed but with multiplication/division used here, it may be slower than optimized versions of Salsa20 and ChaCha20.

The final size for hand written assembly is approx. 450 bytes and approx. 700 for code generated by MSVC

Posted in assembly, cryptography, encryption, security | Tagged , , , , | 1 Comment

Asmcodes: Salsa20 Stream Cipher

Introduction

Salsa20 or Snuffle 2005 is a stream cipher designed, written and published in 2005 by mathematician D. J. Bernstein. It supports 128/256-bit keys and 64-bit nonce values.

Another version XSalsa20 supports nonces up to 192-bits if 64-bits is insufficient to what you need.

It was selected for the final portfolio of the eStream (ECRYPT Stream Cipher Project) so we can presume it’s undergone extensive cryptanalysis during the selection process.

The implementation here only supports 256-bit keys.

About the assembly code

ChaCha20 was already implemented and is based on Salsa20 although ChaCha20 is apparently a better design for performance without losing security. The x86 code was a joint effort between Peter Ferrie and myself while writing ChaCha20 and so few changes were made to implement Salsa20.

Context/State

The keystream buffer is 64-bytes or 512-bits which gets updated for each time the Salsa20 core is invoked.

// size of the Salsa20 state / keystream block in bytes
#define S20_BLK_LEN 64

// 64-byte block viewed as bytes, halfwords, words or doublewords
typedef union _s20_blk_t {
  uint8_t  v8[S20_BLK_LEN];
  uint16_t v16[S20_BLK_LEN/2];
  uint32_t v32[S20_BLK_LEN/4];
  uint64_t v64[S20_BLK_LEN/8];
} s20_blk;

// cipher context: the 16-word Salsa20 state
typedef struct _s20_ctx_t {
  s20_blk s;
} s20_ctx;

Key generation

This function expects 256-bit keys. The ChaCha20 stream cipher (Snuffle 2008) has a better design IMHO but hasn’t received as much scrutiny as Salsa20 according to the author.

// Key setup: build the initial 16-word Salsa20 state from a 256-bit
// key and a 64-bit nonce ("expand 32-byte k" constants on words
// 0, 5, 10, 15; key in words 1-4 and 11-14; nonce in 6-7; counter
// in 8-9 starts at zero).
void s20_setkey(s20_ctx *c, void *key, void *nonce)
{
  s20_blk *n=(s20_blk*)nonce;
  s20_blk *k=(s20_blk*)key;
  uint32_t i;

  // sigma constants
  c->s.v32[ 0] = 0x61707865;
  c->s.v32[ 5] = 0x3320646E;
  c->s.v32[10] = 0x79622D32;
  c->s.v32[15] = 0x6B206574;

  // 256-bit key split across words 1..4 and 11..14
  for (i=0; i<4; i++) {
    c->s.v32[i+ 1] = k->v32[i];
    c->s.v32[i+11] = k->v32[i+4];
  }

  // 64-bit nonce
  c->s.v32[ 6] = n->v32[0];
  c->s.v32[ 7] = n->v32[1];

  // 64-bit block counter starts at zero
  c->s.v32[ 8] = 0;
  c->s.v32[ 9] = 0;
}
;--------------------------------------------------------------------
; void s20_setkeyx(s20_ctx *c, void *key, void *nonce)
; Builds the initial 16-word Salsa20 state: "expand 32-byte k"
; constants at words 0,5,10,15, the 256-bit key at 1-4 and 11-14,
; the 64-bit nonce at 6-7 and a zero 64-bit counter at 8-9.
; cdecl; all registers preserved.
;--------------------------------------------------------------------
_s20_setkeyx:
s20_setkeyx:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   edi, eax          ; edi = ctx
    lodsd
    push   eax               ; save key ptr
    lodsd
    xchg   ebx, eax          ; ebx = nonce
    pop    esi               ; esi = key
    mov    eax, 061707865h   ; "expa"
    cdq                      ; edx = 0 (for the counter below)
    stosd
    ; copy 16 bytes of 256-bit key
    movsd    
    movsd    
    movsd    
    movsd    
    mov    eax, 03320646Eh   ; "nd 3"
    stosd
    ; copy 64-bit iv to 6,7
    xchg   esi, ebx          ; esi = nonce, ebx = key+16
    movsd
    movsd
    ; zero 64-bit counter at 8,9
    xchg   eax, edx
    stosd
    stosd
    ; store 32-bits at 10
    mov    eax, 079622D32h   ; "2-by"
    stosd
    ; store remainder of key at 11-14
    xchg   esi, ebx          ; esi = key+16
    movsd    
    movsd    
    movsd    
    movsd
    ; store last 32-bits
    mov    eax, 06B206574h   ; "te k"
    stosd
    popad
    ret

Salsa20 Core

The 64-byte input x to Salsa20 is viewed in little-endian form as 16 words x0, x1, x2, …, x15 in {0,1,…,2^32-1}. These 16 words are fed through 320 invertible modifications, where each modification changes one word. The resulting 16 words are added to the original x0, x1, x2, …, x15 respectively modulo 2^32, producing, in little-endian form, the 64-byte output Salsa20(x).

Each modification involves xor’ing into one word a rotated version of the sum of two other words modulo 2^32. Thus the 320 modifications involve, overall, 320 additions, 320 xor’s, and 320 rotations. The rotations are all by constant distances.

The entire series of modifications is a series of 10 identical double-rounds. Each double-round is a series of 2 rounds. Each round is a set of 4 parallel quarter-rounds. Each quarter-round modifies 4 words.

// One Salsa20 quarter-round using ARX (add-rotate-xor) operations.
// idx packs the four 4-bit state-word indices: a in bits 0-3,
// b in 4-7, c in 8-11, d in 12-15.
void QUARTERROUND(s20_blk *blk, uint16_t idx) 
{
    uint32_t *w=(uint32_t*)&blk->v8;
    uint32_t ia, ib, ic, id;

    // unpack the four word indices
    ia = idx         & 0xF;
    ib = (idx >>  4) & 0xF;
    ic = (idx >>  8) & 0xF;
    id = (idx >> 12) & 0xF;

    // the four ARX modifications with fixed rotation distances
    w[ib] ^= ROTL32((w[ia] + w[id]),  7);
    w[ic] ^= ROTL32((w[ib] + w[ia]),  9);
    w[id] ^= ROTL32((w[ic] + w[ib]), 13);
    w[ia] ^= ROTL32((w[id] + w[ic]), 18);
}

// encrypt/decrypt up to one 64-byte block
// do not call directly, use s20_crypt instead
// len is expected to be 1..64 (s20_crypt guarantees this)
void s20_core (s20_ctx *c, uint8_t *in, uint32_t len)
{
    s20_blk x;
    int     i, j;

    // each 16-bit word packs the four 4-bit state indices used by
    // one quarter-round (a=bits 0-3, b=4-7, c=8-11, d=12-15);
    // first four entries are the column round, last four the row round
    uint16_t idx16[8]=
    { 0xC840, 0x1D95, 0x62EA, 0xB73F, 
      0x3210, 0x4765, 0x98BA, 0xEDCF };
    
    // copy state to local space
    for (i=0; i<16; i++) { 
      x.v32[i] = c->s.v32[i];
    }
    // apply 20 rounds (one double-round per iteration)
    for (i=0; i<20; i+=2) {
      for (j=0; j<8; j++) {
        QUARTERROUND(&x, idx16[j]);
      }
    }
    // add original state to x, producing the keystream block
    for (i=0; i<16; i++) {
      x.v32[i] += c->s.v32[i];
    }
    // xor input with x
    for (i=0; i<len; i++) {
      ((uint8_t*)in)[i] ^= x.v8[i];
    }
    // update 64-bit block counter (state words 8,9)
    c->s.v64[4]++;
    // stopping at 2^70 bytes per nonce is user's responsibility
}
; register aliases for the quarter-round
%define a eax
%define b ebx
%define c edx
%define d esi
%define x edi

%define t0 ebp

; void QUARTERROUND(s20_blk *blk, uint16_t index)
; expects edi points to x array; ax holds the four packed 4-bit word
; indices (a = bits 0-3, b = 4-7, c = 8-11, d = 12-15)
; all registers preserved
QUARTERROUND:
    pushad
    push   a                ; save eax (packed indices)
    xchg   ah, al           ; process the high byte first
    aam    16               ; split byte: ah = al>>4, al = al&15
    
    movzx  d, ah            ; d = idx>>12
    movzx  c, al            ; c = (idx>>8) & 15

    pop    a
    aam    16               ; split the low byte
    
    movzx  b, ah            ; b = (idx>>4) & 15
    movzx  a, al            ; a = idx & 15

    lea    a, [x+a*4]       ; turn word indices into pointers
    lea    b, [x+b*4]
    lea    c, [x+c*4]
    lea    d, [x+d*4]

    ; load ecx with packed rotate values, consumed low byte first:
    ; 7, 9, 13 (0Dh), 18 (12h) - the Salsa20 rotation distances
    mov    ecx, 0120D0907h
s20_q0:
    ; x[b] ^= ROTL32((x[a] + x[d]), 7);
    mov    t0, [a]           ; ebp=x[a]
    add    t0, [d]
    rol    t0, cl
    xor    [b], t0
    
    xchg   cl, ch            ; next rotate amount
    
    ; x[c] ^= ROTL32((x[b] + x[a]), 9);
    mov    t0, [b]           ; ebp=x[b]
    add    t0, [a]
    rol    t0, cl
    xor    [c], t0
    
    ; rename registers so pass 2 performs the d/a updates (13, 18)
    xchg   a, c
    xchg   b, d
    
    shr    ecx, 16           ; expose next rotate pair; zero after pass 2
    jnz    s20_q0
    
    popad
    ret
  
; void s20_corex (s20_ctx *ctx, void *in, uint32_t len)
; Salsa20 core: permutes a copy of the state (20 rounds), adds the
; original state back in, xors len bytes of input with the result
; and increments the 64-bit block counter (state words 8,9).
; do not call directly
; expects state in ebx, length in eax, input in edx
; len must be 1..64 - s20_cryptx guarantees this (len==0 would make
; the xor loop below wrap).  All registers preserved.
_s20_corex:
s20_corex:
    pushad

    ; allocate 64-bytes local space, x
    push   64
    pop    ecx
    sub    esp, ecx
    
    ; copy state to x
    mov    edi, esp
    mov    esi, ebx
    rep    movsb

    ; move x into edi
    mov    edi, esp
    push   eax   ; save len
    push   20/2  ; 20 rounds, 2 per iteration
    pop    ebp
s20_c0:
    ; load indexes: CALL pushes the address of the inline table
    call   s20_c1
    dw 0C840h, 01D95h, 062EAh, 0B73Fh  ; column-round index words
    dw 03210h, 04765h, 098BAh, 0EDCFh  ; row-round index words
s20_c1:
    pop    esi  ; pointer to indexes
    mov    cl, 8
s20_c2:
    lodsw                    ; ax = packed indices for one quarter-round
    call   QUARTERROUND
    loop   s20_c2
    dec    ebp
    jnz    s20_c0
    
    ; add state to x
    mov    cl, 16
s20_c3:
    mov    eax, [ebx+ecx*4-4]
    add    [edi+ecx*4-4], eax
    loop   s20_c3

    ; xor input with x (runs len times, back to front)
    pop    ecx               ; ecx=len (>=1)
s20_c4:
    mov    al, byte[edi+ecx-1]
    xor    byte[edx+ecx-1], al
    loop   s20_c4
    
    ; update block counter: ecx==0 here, so only the carry is added
    stc
    adc    dword[ebx+8*4], ecx   ; low word += 1
    adc    dword[ebx+9*4], ecx   ; propagate carry into high word
    
    ; free the 64-byte x buffer (two 32-byte POPADs)
    popad
    popad
    ; restore registers
    popad
    ret

Encryption and Decryption

The same function is used to encrypt/decrypt providing the same key and nonce are used during key setup.

// Encrypt or decrypt a stream of bytes in place, processing it in
// 64-byte keystream blocks via s20_core.
void s20_crypt (s20_ctx *c, void *in, uint32_t len) 
{
    uint8_t  *p=(uint8_t*)in;
    uint32_t n;
    
    while (len != 0) {
      // size of this block: whole 64-byte block or the remaining tail
      n = (len < 64) ? len : 64;
      s20_core(c, p, n);
      p   += n;
      len -= n;
    }
}
;--------------------------------------------------------------------
; void s20_cryptx(s20_ctx *c, void *in, uint32_t len)
; Encrypts/decrypts in place by calling s20_corex with min(len, 64)
; until the buffer is consumed.
; cdecl; all registers preserved.
;--------------------------------------------------------------------
_s20_cryptx:
s20_cryptx:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   ebx, eax          ; ebx = ctx (s20_corex expects it here)
    lodsd
    xchg   edx, eax          ; edx = in
    lodsd
    xchg   ecx, eax          ; ecx = len
    jecxz  s20_l1            ; exit if len==0
    xor    eax, eax          ; r=0
s20_l0:
    mov    al, 64
    cmp    ecx, eax          ; eax=len < 64 ? len : 64
    cmovb  eax, ecx
    call   s20_corex
    add    edx, eax          ; p += r;
    sub    ecx, eax          ; len -= r;
    jnz    s20_l0
s20_l1:
    popad
    ret

Summary

architecture size
x86 245
x64 n/a

For source code and any future updates, please refer to the code here.

Posted in cryptography, encryption, programming, security | Tagged , , , , | 1 Comment

Asmcodes: CubeHash 1/1-256 cryptographic hash

Introduction

CubeHash is a simple cryptographic hash function designed and written in 2008 by mathematician/cryptographer Daniel J. Bernstein. It was submitted to the NIST SHA-3 competition but was not chosen as a finalist for the reasons stated below.

CubeHash is a simple, well-understood design that received extensive third-party analysis. The design is suitable for constrained environments, as it requires a small area in hardware, and little RAM or ROM in software. However, it has poor performance without the use of a vector unit, resulting in a large overhead for short messages. Moreover, no single variant for the 512-bit digest size achieves the required theoretical security with acceptable performance. Another disadvantage of the design is the existence of the symmetric properties that are arguably a potential avenue for future attacks. NIST felt that an additional year of study would not be enough to determine whether or not the symmetric properties pose a threat. Therefore, NIST felt that CubeHash could not be chosen as SHA-3 with confidence, and thus should not be selected as a finalist.

…continued

In constrained environments, CubeHash has poor performance, but requires less ROM and RAM than most of the second-round candidates.

Having looked at other algorithms and tweaked reference code to reduce space without compromising much of the security, it initially appeared to require less ROM in software than any other algorithm in round 2, which includes Keccak. I was mostly curious to see how far the code could be reduced in assembly.

Choosing parameters

The author states in specification.

CubeHashr/b-h produces an h-bit digest of a variable-length message. The digest depends on two tunable parameters r, b that allow the selection of a range of security/performance tradeoffs

Choosing high r and low b values result in the best security but with higher r would impact performance since if you were to select r=10, b=1 then the permutation function would be invoked 10 times for each byte which is not very efficient.

To minimize size for this implementation, the parameters chosen are: r=1, b=1, h=256

Be advised, the following implementation will only work with those parameters. Changing these would probably provide incorrect hashes but if you want a look at the reference code, see the authors site. or alternatively look at implementations from the SUPERCOP library.

Permutation Function

Unrolled, this is 12 loops and can be vectorized.

// 32-bit rotate left (b must be in 1..31)
#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))

// CubeHash permutation: apply cnt identical rounds to the 32-word
// state.  Each round does two passes (j=2 then j=1):
//   - add the lower 16 words into the upper 16
//   - swap lower-half words at distance j*4 (8 then 4), rotating
//     them left by k (7 then 11)
//   - xor the upper half into the lower half
//   - swap upper-half words at distance j (2 then 1)
void transform(cube_ctx *c, int cnt)
{
  int      i, j, k, n;
  uint32_t y[16];       // scratch buffer for the swap network

  for (n=cnt; n>0; n--) 
  {
    for (k=7, j=2; j>0; k+=4, j--)
    {
      for (i=15; i>=0; --i) c->x[i + 16] += c->x[i];
      for (i=15; i>=0; --i) y[i ^ (j*4)] = c->x[i];
      for (i=15; i>=0; --i) c->x[i] = ROTL32(y[i], k);
    
      for (i=15; i>=0; --i) c->x[i] ^= c->x[i + 16];
      for (i=15; i>=0; --i) y[i ^ j] = c->x[i + 16];
      for (i=15; i>=0; --i) c->x[i + 16] = y[i];
    }
  }
}

Initialization

The original implementation requires the output digest be provided by the callee. Since I’m only interested in the 256-bit output, the code is fixed to this output length.

// Initialize a CubeHash 1/1-256 context: zero the 1024-bit state,
// set the fixed parameters (h=256 bits, b=1, r=1) and apply the
// initial 10 rounds of the permutation.
void cube_init(cube_ctx *c)
{
  int i;

  // clear the 32-word state
  for (i=31; i>=0; --i) {
    c->x[i] = 0;
  }

  // fixed parameters for this implementation
  c->x[0] = 32;            // h/8 = 32 bytes (256-bit digest)
  c->x[1] = 1;             // b=1 (block size in bytes)
  c->x[2] = 1;             // r=1 (rounds per block)

  // initial 10 rounds
  transform(c, 10);
}

Updating

// Absorb a message into the state one byte at a time (b=1); each
// absorbed byte is followed by one round of the permutation (r=1).
// Fix: removed the dead `if (!inlen--) break;` branch - the loop
// guard ensures inlen != 0, so the break could never fire and its
// "don't process last byte" comment was misleading; every byte is
// in fact followed by a transform.  Also removed unused local 'u'.
void cube_update(cube_ctx *c, const void *in, uint32_t inlen)
{
  const uint8_t *data=(const uint8_t*)in;

  while (inlen--) {
    c->x[0] ^= *data++;      // absorb byte into state
    transform(c, 1);         // one round per byte
  }
}

Finalization

// Finalize: absorb the padding byte and final bit, apply the closing
// rounds, then copy out the 256-bit digest.
// Fix: removed unused local 'u'.
void cube_final(void *out, cube_ctx *c)
{
  int i;

  // step 1: absorb padding; with b=1 the length byte is always 128
  c->x[0] ^= 128;
  transform(c, 1);
  
  c->x[31] ^= 1;            // absorb final bit
  transform(c, 10);         // apply final rounds
  
  // return 256-bit hash
  for (i=0; i<32; i++) 
    ((uint8_t*)out)[i] = c->v8[i];
}

The step 1 probably requires some explanation. If you look at the reference implementation, the code is

// from Final
  crypto_uint32 u;

  u = (128 >> (state->pos % 8));
  u <<= 8 * ((state->pos / 8) % 4);
  state->x[state->pos / 32] ^= u;

Because the blocksize is 1, the length byte will always be 128.

Summary

Simplicity and suitability to constrained environments is where this function wins over any other hash algorithm I’ve covered so far. The size of variables are 32-bit and because it only uses ARX instructions, makes it suitable for wide range of 32-bit architectures.

The output assembly size for this implementation is 345 bytes.

Posted in assembly, cryptography, encryption, programming, security | Tagged , , | Leave a comment

Asmcodes: SHA-3 / Keccak

Introduction

Keccak is a cryptographic hash function designed by Guido Bertoni, Joan Daemen, Michaël Peeters, and Gilles Van Assche.

It was selected by NIST to become the SHA-3 standard and is a complement to SHA-2 rather than a replacement.

Currently, SHA-2 is widely used and not known publicly to have any weaknesses. Since SHA-2 is very similar to MD4, MD5 and SHA-1 in design, weaknesses may be identified in the future and SHA-3 is intended to be a drop in replacement.

When considering to choose a MAC function, SHA-3 has the advantage of not requiring HMAC like others mentioned above. This is because SHA-3 by itself is not vulnerable to Length Extension attack. To use SHA-3 as a MAC, you simply prepend the key to each message; just another reason to consider SHA-3 over SHA-2

The following x86 assembly implementation was a joint effort between Peter Ferrie and myself. It doesn’t contain the extensible output functions and is deliberately optimized for size rather than speed.

Initialization

The nice thing about Keccak/SHA-3 (from my own perspective) is that it doesn’t use any kind of initialization values. Prior hash function designs which were mostly derived from MD4, written by Ron Rivest in the early 1990s all use some kind of initial values for the state. The permutation functions also use constants which can be computed on the fly but tend to require a lot more space.

The only steps performed here are to zero intialize the state buffer and set output/input lengths.

/* Prepare a SHA-3 context for hashing.
 * mdlen is the digest length in bytes; the absorb rate (buflen)
 * is the 200-byte Keccak state minus twice the digest length. */
void SHA3_Init (SHA3_CTX *ctx, int mdlen)
{
  uint32_t n = 0;

  ctx->outlen = mdlen;
  ctx->buflen = 200 - (2 * mdlen);
  ctx->index  = 0;

  /* clear every 64-bit word of the state */
  while (n < SHA3_STATE_LEN) {
    ctx->state.v64[n] = 0;
    n++;
  }
}

The assembly code is translation of above and tries to minimize amount of code used.

; void SHA3_Init(SHA3_CTX *ctx, int mdlen) -- x86 (32-bit), cdecl.
; edi walks the context with stosd: outlen, buflen, index, then state.
; After index is stored, ecx still holds 200, so the final "rep stosb"
; (with eax = 0) zeroes the 200-byte state buffer.
_SHA3_Initx:
    pushad
    mov    edi, [esp+32+4]    ; context
    mov    eax, [esp+32+8]    ; outlen
    stosd                     ; ctx->outlen=outlen
    add    eax, eax           ; *= 2
    push   (SHA3_STATE_LEN*8)/2
    pop    ecx                ; ecx = 100
    add    ecx, ecx           ; ecx=200
    neg    eax                ; negate
    add    eax, ecx           ; add 200
    stosd                     ; buflen = 200 - (2 * outlen)
    xor    eax, eax
    stosd                     ; index=0
    rep    stosb              ; zero the state buffer
    popad
    ret

Updating

Sponge construction is used for “absorbing” data into the “sponge” / state. Blocks of data are xor’d with the state buffer before being transformed using the permutation function.

The authors have this to say.

the sponge construction is a mode of operation, based on a fixed-length permutation (or transformation) and on a padding rule, which builds a function mapping variable-length input to variable-length output. Such a function is called a sponge function.

Sponge functions are a very clever design and can be used for many other applications apart from hash functions.

The stream cipher Spritz for example also uses sponge functions and can be used for PRNG and AEAD.

I’m not endorsing Spritz here, but for more information, read Spritz: A New RC4-Like Stream Cipher

/* Absorb inlen bytes from in into the sponge state, running the
 * permutation whenever the rate block (buflen bytes) fills up. */
void SHA3_Update (SHA3_CTX* ctx, void *in, uint32_t inlen)
{
  const uint8_t *src = (const uint8_t*)in;
  uint32_t n;

  for (n = 0; n < inlen; n++) {
    if (ctx->index == ctx->buflen) {
      /* rate block full: permute, then start refilling from 0 */
      SHA3_Transform (ctx);
      ctx->index = 0;
    }
    /* xor the next input byte into the state */
    ctx->state.v8[ctx->index++] ^= src[n];
  }
}

The assembly code again tries to minimize code used.

; void SHA3_Update(SHA3_CTX *ctx, void *in, uint32_t inlen) -- x86, cdecl.
; Register roles: ebx = input pointer, ecx = bytes remaining,
; ebp = ctx->buflen, eax = ctx->index; after the header fields are
; read, esi points at ctx->state (which SHA3_Transform expects).
_SHA3_Updatex:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    push   eax               ; save ctx
    lodsd
    xchg   ebx, eax          ; ebx = input
    lodsd
    xchg   ecx, eax          ; ecx = len
    pop    esi               ; esi = ctx
    jecxz  upd_l2            ; nothing to absorb
    
    lodsd                    ; skip ctx->outlen
    lodsd                    ; eax = ctx->buflen
    xchg   eax, ebp          ; ebp = ctx->buflen
    push   esi               ; save ptr to ctx->index
    lodsd                    ; eax = ctx->index, esi = ctx->state
upd_l0:
    cmp    eax, ebp          ; buffer full?
    jne    upd_l1
    call   SHA3_Transform    ; compress, expects ctx->state in esi
    xor    eax, eax          ; index = 0
upd_l1:
    mov    dl, [ebx]         ; absorb byte
    inc    ebx    
    xor    byte[esi+eax], dl
    inc    eax               ; increase index
    loop   upd_l0
    pop    edi               ; edi = ptr to ctx->index
    stosd                    ; write updated index back
upd_l2:
    popad
    ret

Finalize

/* Apply the SHA-3 domain padding, run the final permutation and
 * copy the first outlen bytes of the state to out as the digest. */
void SHA3_Final (void* out, SHA3_CTX* ctx)
{
  uint8_t *digest = (uint8_t*)out;
  uint32_t n;

  /* SHA-3 domain bits plus first pad bit (0x06); Keccak would use 1 */
  ctx->state.v8[ctx->index]    ^= 6;
  /* closing bit of the pad10*1 padding at the end of the rate block */
  ctx->state.v8[ctx->buflen-1] ^= 0x80;

  SHA3_Transform (ctx);

  /* squeeze: the digest is simply the leading state bytes */
  for (n = 0; n < ctx->outlen; n++) {
    digest[n] = ctx->state.v8[n];
  }
}

The assembly

; void SHA3_Final(void *out, SHA3_CTX *ctx) -- x86, cdecl.
; Loads outlen/buflen/index from the context (leaving esi at the
; state), applies the 0x06 / 0x80 domain padding, permutes once
; more, then copies outlen digest bytes to out with rep movsb.
_SHA3_Finalx:
    pushad
    mov    esi, [esp+32+8]      ; esi=ctx
    lodsd
    xchg   ecx, eax             ; ecx=ctx->outlen
    lodsd
    xchg   edx, eax             ; edx=ctx->buflen
    lodsd                       ; eax=ctx->index
    xor    byte[esi+eax], 6     ; ctx->state.v8[ctx->index] ^= 6;
    xor    byte[esi+edx-1], 80h ; ctx->state.v8[ctx->buflen-1] ^= 0x80;
    call   SHA3_Transform       ; SHA3_Transform (ctx->state);
    mov    edi, [esp+32+4]      ; edi=out
    rep    movsb                ; memcpy (out, ctx->state.v8, ctx->outlen);
    popad
    ret

Since the SHA-3 state uses 25 64-bit words, implementing a version in assembly using only 8 32-bit general purpose registers available is doable but not really necessary unless you were targeting systems released before 1998.

The round constants can be precomputed in advance, but to save space, the LFSR function is used instead.

// Primitive polynomial over GF(2): x^8+x^6+x^5+x^4+1
//
// Generate the next Keccak round constant from the 8-bit LFSR
// state in *LFSR, advancing the state for the following round.
// Step k of the LFSR decides output bit (2^k - 1), k = 0..6.
uint64_t rc (uint8_t *LFSR)
{
  uint64_t acc   = 0;      // round constant being assembled
  uint32_t state = *LFSR;  // working copy of the LFSR state
  uint32_t pos;

  // pos walks 1,2,4,...,64: seven steps selecting bits 0,1,3,7,15,31,63
  for (pos = 1; pos < 128; pos += pos)
  {
    if (state & 1) {
      acc ^= 1ULL << (pos - 1);
    }
    // clock the LFSR: shift left, xor in the polynomial when bit 7 spills
    state = (state & 0x80) ? (state << 1) ^ 0x71 : state << 1;
  }
  *LFSR = (uint8_t)state;
  return acc;
}

Compression

This function was originally implemented by Markku-Juhani O. Saarinen. The only thing I’ve done is remove the round constants and replace with LFSR function above.

// Pi step lane order: the inverse pi permutation walked as a single
// 24-element cycle starting from lane 1 (lane 0 is fixed).
const uint8_t keccakf_piln[24] = 
{ 10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4, 
  15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1  };

// Rotate the 64-bit value x left by y bits; y must be in 1..63
// (y == 0 would shift right by 64, which is undefined in C).
#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))

/* Keccak-f[1600] permutation applied in place to ctx->state for
 * SHA3_ROUNDS rounds. Each round performs Theta, Rho+Pi, Chi and
 * Iota, with round constants generated on the fly by rc(). */
void SHA3_Transform (SHA3_CTX *ctx)
{
  uint32_t round, n, m, shift;
  uint64_t tmp, held, lane[5];
  uint64_t *st=(uint64_t*)ctx->state.v64;
  uint8_t lfsr=1;  /* LFSR seed for the round-constant generator */
  
  for (round=0; round<SHA3_ROUNDS; round++) 
  {
    /* Theta: compute the five column parities ... */
    for (n=0; n<5; n++) {
      lane[n] = st[n] ^ st[n + 5] ^ st[n + 10]
                      ^ st[n + 15] ^ st[n + 20];
    }
    /* ... and xor each column with its two neighbours' parities */
    for (n=0; n<5; n++) {
      tmp = lane[(n + 4) % 5] ^ ROTL64(lane[(n + 1) % 5], 1);
      for (m=n; m<25; m+=5) {
        st[m] ^= tmp;
      }
    }

    /* Rho Pi: rotate each lane and move it to its new position,
     * chasing the single permutation cycle that starts at lane 1 */
    tmp   = st[1];
    shift = 0;
    for (n=0; n<24; n++) {
      shift += n + 1;             /* running triangular number = rho offset */
      m      = keccakf_piln[n];
      held   = st[m];
      st[m]  = ROTL64(tmp, shift & 63);
      tmp    = held;
    }

    /* Chi: nonlinear mixing within each row of five lanes */
    for (m=0; m<25; m+=5) {
      for (n=0; n<5; n++) {
        lane[n] = st[m + n];
      }
      for (n=0; n<5; n++) {
        st[m + n] ^= (~lane[(n + 1) % 5]) & lane[(n + 2) % 5];
      }
    }

    /* Iota: inject the round constant into lane (0,0) */
    st[0] ^= rc(&lfsr);
  }
}

The assembly code uses MMX instructions since using 32-bit code alone would require too much space.

; Keccak-f[1600] permutation over the 200-byte state (x86 + MMX).
; NOTE(review): t, i, j, r, lfsr, ih, _bc and _st are assembler
; aliases defined elsewhere in the file (t behaves as an MMX
; register; the inlined rc step keeps the LFSR state in dl, so
; lfsr presumably maps to edx; ih appears to supply 0 so that
; "adc dl, ih" folds the carry into bit 0) -- confirm against the
; alias definitions before changing any register usage.
; Stack layout: 40-byte bc[5] workspace (_bc), plus the address of
; the embedded tables pushed by "call s3_l2"; both are released by
; the final "add esp, 8*5 + 4".
SHA3_Transform:
    pushad
    
    ; set up workspace
    sub    esp, 8*5
    mov    _bc, esp
    xor    r, r              ; round counter = 0
    
    push   1
    pop    lfsr              ; round-constant LFSR state = 1

    call   s3_l2             ; pushes &sha3_mod5; reached via [esp+...]
; mod-5 table: entry k holds k % 5, used for the (i+n)%5 indexing below
sha3_mod5:
    db 0, 1, 2, 3, 4, 0, 1, 2, 3, 4
; Pi step lane order (same data as keccakf_piln in the C code)
sha3_piln:
    db 10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4 
    db 15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1  
s3_l2:
    push   r                 ; save round counter
    push   lfsr              ; save LFSR state
    ; Theta
    ; for (i = 0; i < 5; i++)     
    ;   bc[i] = st[i + 0 ] ^ 
    ;           st[i + 5 ] ^ 
    ;           st[i + 10] ^ 
    ;           st[i + 15] ^ 
    ;           st[i + 20]; 
    xor    eax, eax
s3_l3:
    movq    t, [_st+8*eax+20*8]
    pxor    t, [_st+8*eax+15*8]
    pxor    t, [_st+8*eax+10*8]
    pxor    t, [_st+8*eax+ 5*8]
    pxor    t, [_st+8*eax     ]
    movq    [_bc+8*eax        ], t
    inc     eax
    cmp     al, 5
    jnz     s3_l3
      
    ; for (i = 0; i < 5; i++) {
    ;   t = bc[(i + 4) % 5] ^ ROTL64(bc[(i+1)%5], 1);
    ;   for (j = 0; j < 25; j += 5)
    ;     st[j + i] ^= t;
    ; }
    ; ************************************
    ; for (i = 0; i < 5; i++)
    xor    i, i
s3_l4:
    ; t = ROTL64(bc[(i + 1) % 5], 1)
    mov    eax, [esp+2*4]    ; eax = &sha3_mod5 (pushed by call s3_l2)
    push   eax
    movzx  eax, byte [eax + i + 1]
    mov    edx, [_bc+8*eax]
    mov    ebp, [_bc+8*eax+4]
    add    edx, edx          ; 64-bit rotl by 1: shift low half...
    adc    ebp, ebp          ; ...and high half through carry...
    adc    dl, ih            ; ...then fold the carry into bit 0
    ; bc[(i + 4) % 5]
    pop    eax      ; keccakf_mod5
    movzx  eax, byte [eax + i + 4]
    xor    edx, [_bc+8*eax]
    xor    ebp, [_bc+8*eax+4]
    ; for (j = 0; j < 25; j += 5)
    xor    j, j
s3_l5:
    ; st[j + i] ^= t;
    lea    eax, [j+i]
    xor    [_st+8*eax], edx
    xor    [_st+8*eax+4], ebp
    add    j, 5
    cmp    j, 25
    jnz    s3_l5
    
    inc    i
    cmp    i, 5
    jnz    s3_l4
            
    ; // Rho Pi
    ; t = st[1];
    ; for (i = 0; i < 24; i++) {
    ;   j = keccakf_piln[i];
    ;   bc[0] = st[j];
    ;   st[j] = ROTL64(t, keccakf_rotc[i]);
    ;   t = bc[0];
    ; }
    ; *************************************
    ; t = st[1]
    mov    edx, [_st+8]
    mov    ebp, [_st+8+4]
    xor    i, i
    ; for (i = 0; i < 24; i++)
    push   i                 ; running rotation total r = 0, kept on stack
s3_l6:
    push   ecx
    ; j = keccakf_piln[i];
    mov    eax, [esp+4*4]    ; eax = &sha3_mod5 (table base)
    movzx  j, byte [eax + i + (sha3_piln - sha3_mod5)]
    mov    eax, [esp+4]      ; eax = rotation total so far
    lea    ecx, [eax+i+1]    ; r += i + 1 (triangular number)
    mov    [esp+4], ecx
    and    ecx, 63           ; rotation count = r & 63
    ;mov    cl, byte [eax + i + (sha3_rotc - sha3_piln)]
    ; st[j] = ROTL64(t, keccakf_rotc[i]);
s3_l06x:
    add    edx, edx          ; rotate ebp:edx left one bit per pass
    adc    ebp, ebp
    adc    dl, ih
    loop   s3_l06x
    ; bc[0] = st[j];
    xchg   [_st+8*j], edx    ; swap: store rotated lane, fetch next t
    mov    [_bc], edx
    xchg   [_st+8*j+4], ebp
    mov    [_bc+4], ebp
    pop    ecx
    inc    i
    cmp    i, 24
    jnz    s3_l6
    pop    eax               ; discard rotation total
      
    ; // Chi
    ; for (j = 0; j < 25; j += 5) {
    ;   for (i = 0; i < 5; i++)
    ;     bc[i] = st[j + i];
    ;   for (i = 0; i < 5; i++)
    ;     st[j + i] ^= (~bc[(i+1)%5]) & bc[(i+2)%5];
    ; }
    ; *********************************
    ; for (j=0; j<25; j+=5)
    xor    j, j
s3_l7:
    ; for (i=0; i<5; i++)
    xor    i, i
s3_l8:
    ; bc[i] = st[j + i];
    lea    eax, [j+i]
    movq   t, [_st+8*eax]
    movq   [_bc+8*i], t
    inc    i
    cmp    i, 5
    jnz    s3_l8
        
    ; for (i=0; i<5; i++)
    xor    i, i
s3_l9:
    ; st[j + i] ^= (~bc[(i+1)%5]) & bc[(i+2)%5];
    mov    eax, [esp+2*4] ; keccakf_mod5
    push   eax 
    movzx  eax, byte [eax + i + 1]
    movq   t, [_bc+8*eax]
    pop    eax            ; keccakf_mod5 
    movzx  eax, byte [eax + i + 2]
    pandn  t, [_bc+8*eax] ; pandn computes (~t) & operand
    lea    eax, [j+i]
    pxor   t, [_st+8*eax]
    movq   [_st+8*eax], t
    inc    i
    cmp    i, 5
    jnz    s3_l9
    
    add    j, 5
    cmp    j, 25
    jnz    s3_l7
           
    pop    lfsr              ; restore LFSR state for this round's rc

    ; // Iota
    ; st[0] ^= keccakf_rndc[round];
    movq    mm4, [_st]
;;    call    rc
; Inlined rc(): builds the round constant in mm0 from the LFSR state
; held in dl (see rc() in the C code for the reference algorithm).
rc:
    pxor   mm0, mm0        ; c=0
    pxor   mm1, mm1        ; 
    push   1
    pop    eax             ; i=1
    movd   mm1, eax
rc_l00:
    test   dl, 1           ; (t & 1)
    jz     rc_l01
    ; ecx = (i - 1)
    lea    ecx, [eax-1]
    movd   mm2, ecx
    movq   mm3, mm1
    ; 1ULL << (i - 1)
    psllq  mm3, mm2
    pxor   mm0, mm3        ; c ^= 1ULL << (i - 1)
rc_l01:
    add    dl, dl          ; t += t
    jnc    rc_l02
    xor    dl, 71h         ; xor in polynomial when bit 7 spilled
rc_l02:
    add    al, al          ; i += i
    jns    rc_l00          ; stop once i reaches 128 (sign bit set)
;;    ret
  
    pxor    mm4, t         ; st[0] ^= round constant (t presumably == mm0)
    movq    [_st], mm4
    
    pop     r
    inc     r
    cmp     r, SHA3_ROUNDS
    jnz     s3_l2
    
    add    esp, 8*5 + 4    ; free bc[] and the pushed table address
    popad
    ret

Results

architecture size
x86 461 (using MMX)
x64 n/a

Conclusion

In my humble opinion, it’s a much better design for implementation than any other cryptographic hash function out there. It can be used for MAC without requiring HMAC algorithm and is therefore much more compact.

I’ve already considered SHA extensions provided by Intel CPU.

Code may be updated in future but not shown here, please refer to sources here and here.

Posted in assembly, cryptography, programming, security | Tagged , , , , , , , , , | 1 Comment