Asmcodes: Half SipHash


SipHash: a fast short-input Pseudo-Random-Function(PRF) was designed and published in 2012 by Jean-Philippe Aumasson and Daniel J. Bernstein.

Last month, MR. Aumasson posted to the kernel-hardening mail list a link to a “Half SipHash” implementation which is intended to boost performance. What I show here is Half SipHash but only generates 32-bit hashes.

I already looked at the original code and felt it didn’t fit well onto a 32-bit architecture. Incidentally ChasKey by Nicky Mouha is derived from SipHash, designed specifically for 32-bit architectures and provides 128-bit security.

So this code is really just half of the Half SipHash function returning 32-bits which might not be adequate for some people. The assembly code is presumably slower than a compiler generated version due to the choice of instructions.

Round function

The round function consists of simple ARX (Add-Rotate-Xor) operations which makes it ideal for almost any CPU.

Some of the more common operations on the 128-bit state before and after ARX rounds are now built into the round function itself and work depending on additional parameter: flag which is either zero or one.

After the message, key and length have been absorbed into the state, it performs the following for 32-bit hashes.

v[2] ^= 0xff;

This is simply flipping the lower 8 bits so the additional parameter to SIPROUND is either zero or one which when negated can either flip bits or have no effect at all. Xor’ing by zero does nothing but if we negate 1 and Xor the result, that is the same as performing a logical not.

Since our cROUNDS are 2 and dROUNDS for the end are 4, we can multiply last flag by 2 and add to rnds counter thus avoiding the need for any conditional testing.

    uint32_t v[4], 
    uint32_t w, 
    uint8_t last)
  int i, rnds=cROUNDS;
  v[2] ^= (uint8_t)-last;
  rnds += (last * 2);
  v[3] ^= w;
  for (i=0; i<rnds; i++)
    v[0] += v[1];                                                              
    v[1] = ROTL(v[1], 5);                                                      
    v[1] ^= v[0];                                                              
    v[0] = ROTL(v[0], 16);                                                     
    v[2] += v[3];                                                              
    v[3] = ROTL(v[3], 8);                                                      
    v[3] ^= v[2];                                                              
    v[0] += v[3];                                                              
    v[3] = ROTL(v[3], 7);                                                      
    v[3] ^= v[0];                                                              
    v[2] += v[1];                                                              
    v[1] = ROTL(v[1], 13);                                                     
    v[1] ^= v[2];                                                              
    v[2] = ROTL(v[2], 16);   
  v[0] ^= w;

uint32_t hsh32(
    const uint8_t *data, 
    const size_t inlen, 
    const uint8_t *key) 
    uint32_t v[4];
    uint8_t i;
    uint32_t m;
    uint8_t *in = (uint8_t*)data;
    uint32_t *k = (uint32_t*)key;
    uint32_t k0 = k[0];
    uint32_t k1 = k[1];
    int left = inlen & 3;        
    const uint8_t *end = in + inlen - left;
    uint32_t b = ((uint32_t)inlen) << 24;
    v[0] = k0;
    v[1] = k1;
    v[2] = k0 ^ 0x6c796765;
    v[3] = k1 ^ 0x74656462;

    for (; in != end; in += 4) {
      m = ((uint32_t*)in)[0];
      SIPROUND(v, m, 0);

    while (--left >= 0) {
      b |= ((uint32_t)in[left]) << (8 * left);   

    for (i=0; i<2; i++) {
      SIPROUND(v, b, i);
      b = 0;
    return v[1] ^ v[3];

With the above code in C, it’s much easier to implement assembly. Instead of using a flag parameter, we clear or set the Carry Flag (CF) using Clear Carry CLC or Set Carry STC. You can see in the last loop, EFLAGS register is saved with PUSHFD/POPFD and then flipped using Complement Carry CMC.

Inside the round function, we avoid flipping bits using Jump If No Carry JNC

; don't change these    
%define cROUNDS 2
%define dROUNDS 4

%define v0 ebx    
%define v1 ebp    
%define v2 edx    
%define v3 edi

    lea    esi, [esp+32+ 4]
    push   eax               ; save data
    push   eax               ; save inlen
    xchg   eax, esi          ; esi = key 
    ; initialize state
    lodsd                    ; eax = k0
    mov    v0, eax           ; v0  = k0 
    mov    v2, eax           ; v2  = k0
    lodsd                    ; eax = k1
    mov    v1, eax           ; v1  = k1
    mov    v3, eax           ; v3  = k1
    xor    v2, 0x6c796765
    xor    v3, 0x74656462
    ; update state in 4-byte blocks
    pop    ecx               ; ecx = inlen
    pop    esi               ; esi = data 
    push   ecx               ; save inlen
    shr    ecx, 2            ; inlen /= 4 
    jz     shx_l2
    call   SIPROUND
    loop   shx_l0
    pop    ecx               ; restore inlen
    mov    eax, ecx
    push   edx               ; save edx
    cdq                      ; edx = 0    
    shl    eax, 24           ; eax = inlen << 24
    and    ecx, 3            ; inlen &= 3
    dec    ecx               ; if (--left >= 0) 
    js     shx_l4            ;   goto shx_l4
    shl    edx, 8            ; t <<= 8
    mov    dl, byte[esi+ecx] ; t |= in[left]
    jmp    shx_l3
    or     eax, edx          ; b |= t
    pop    edx               ; restore edx
    clc                      ; CF=0  
    pushfd                   ; save flags
    call   SIPROUND
    popfd                    ; restore flags
    cmc                      ; CF=1 for last round
    jc     shx_l5
    xor    v1, v3            ; v[1] ^= v[3]
    mov    [esp+28], v1      ; pushad_t._eax = v1

; CF=0 for normal update    
; CF=1 for final update    
    push   ecx               ; save ecx
    push   cROUNDS 
    pop    ecx
    jnc    sr_l0             ; skip if no carry
    xor    eax, eax          ; w=0 if last round
    add    ecx, ecx          ; assumes: cROUNDS=2, dROUNDS=4
    not    dl                ; requires edx to be v2
    xor    v3, eax           ; v3 ^= w    
    add    v0, v1            ; v[0] += v[1];
    rol    v1, 5             ; v[1] = ROTL(v[1], 5);
    xor    v1, v0            ; v[1] ^= v[0];
    rol    v0, 16            ; v[0] = ROTL(v[0], 16);
    add    v2, v3            ; v[2] += v[3]; 
    rol    v3, 8             ; v[3] = ROTL(v[3], 8); 
    xor    v3, v2            ; v[3] ^= v[2]; 
    add    v0, v3            ; v[0] += v[3];
    rol    v3, 7             ; v[3] = ROTL(v[3], 7);
    xor    v3, v0            ; v[3] ^= v[0];
    add    v2, v1            ; v[2] += v[1];
    rol    v1, 13            ; v[1] = ROTL(v[1], 13);
    xor    v1, v2            ; v[1] ^= v[2];
    rol    v2, 16            ; v[2] = ROTL(v[2], 16);
    loop   sr_l1
    xor    v0, eax           ; v0 ^= w
    pop    ecx               ; restore ecx


The code generated by MSVC compiler using /Os /O2 switches resulted in 261 bytes. The assembly version is 148 bytes. See sources here for any future updates.

This entry was posted in assembly, cryptography, programming, security and tagged , , , , . Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in: Logo

You are commenting using your account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s