## Half SipHash

Introduction

SipHash: a fast short-input Pseudo-Random-Function(PRF) was designed and published in 2012 by Jean-Philippe Aumasson and Daniel J. Bernstein.

Last month, MR. Aumasson posted to the kernel-hardening mail list a link to a “Half SipHash” implementation which is intended to boost performance. What I show here is Half SipHash but only generates 32-bit hashes.

I already looked at the original code and felt it didn’t fit well onto a 32-bit architecture. Incidentally ChasKey by Nicky Mouha is derived from SipHash, designed specifically for 32-bit architectures and provides 128-bit security.

So this code is really just half of the Half SipHash function returning 32-bits which might not be adequate for some people. The assembly code is presumably slower than a compiler generated version due to the choice of instructions.

Round function

The round function consists of simple ARX (Add-Rotate-Xor) operations which makes it ideal for almost any CPU.

Some of the more common operations on the 128-bit state before and after ARX rounds are now built into the round function itself and work depending on additional parameter: flag which is either zero or one.

After the message, key and length have been absorbed into the state, it performs the following for 32-bit hashes.

```v[2] ^= 0xff;
```

This is simply flipping the lower 8 bits so the additional parameter to SIPROUND is either zero or one which when negated can either flip bits or have no effect at all. Xor’ing by zero does nothing but if we negate 1 and Xor the result, that is the same as performing a logical not.

Since our cROUNDS are 2 and dROUNDS for the end are 4, we can multiply last flag by 2 and add to rnds counter thus avoiding the need for any conditional testing.

```void SIPROUND(
uint32_t v[4],
uint32_t w,
uint8_t last)
{
int i, rnds=cROUNDS;

v[2] ^= (uint8_t)-last;
rnds += (last * 2);

v[3] ^= w;

for (i=0; i<rnds; i++)
{
v[0] += v[1];
v[1] = ROTL(v[1], 5);
v[1] ^= v[0];
v[0] = ROTL(v[0], 16);
v[2] += v[3];
v[3] = ROTL(v[3], 8);
v[3] ^= v[2];
v[0] += v[3];
v[3] = ROTL(v[3], 7);
v[3] ^= v[0];
v[2] += v[1];
v[1] = ROTL(v[1], 13);
v[1] ^= v[2];
v[2] = ROTL(v[2], 16);
}
v[0] ^= w;
}

uint32_t hsh32(
const uint8_t *data,
const size_t inlen,
const uint8_t *key)
{
uint32_t v[4];
uint8_t i;
uint32_t m;

uint8_t *in = (uint8_t*)data;
uint32_t *k = (uint32_t*)key;

uint32_t k0 = k[0];
uint32_t k1 = k[1];

int left = inlen & 3;
const uint8_t *end = in + inlen - left;
uint32_t b = ((uint32_t)inlen) << 24;

v[0] = k0;
v[1] = k1;
v[2] = k0 ^ 0x6c796765;
v[3] = k1 ^ 0x74656462;

for (; in != end; in += 4) {
m = ((uint32_t*)in)[0];
SIPROUND(v, m, 0);
}

while (--left >= 0) {
b |= ((uint32_t)in[left]) << (8 * left);
}

for (i=0; i<2; i++) {
SIPROUND(v, b, i);
b = 0;
}

return v[1] ^ v[3];
}
```

With the above code in C, it’s much easier to implement assembly. Instead of using a flag parameter, we clear or set the Carry Flag (CF) using Clear Carry CLC or Set Carry STC. You can see in the last loop, EFLAGS register is saved with PUSHFD/POPFD and then flipped using Complement Carry CMC.

Inside the round function, we avoid flipping bits using Jump If No Carry JNC

```; don't change these
%define cROUNDS 2
%define dROUNDS 4

%define v0 ebx
%define v1 ebp
%define v2 edx
%define v3 edi

_hsh32x:
hsh32x:
lea    esi, [esp+32+ 4]
lodsd
push   eax               ; save data
lodsd
push   eax               ; save inlen
lodsd
xchg   eax, esi          ; esi = key

; initialize state
lodsd                    ; eax = k0
mov    v0, eax           ; v0  = k0
mov    v2, eax           ; v2  = k0
lodsd                    ; eax = k1
mov    v1, eax           ; v1  = k1
mov    v3, eax           ; v3  = k1
xor    v2, 0x6c796765
xor    v3, 0x74656462

; update state in 4-byte blocks
pop    ecx               ; ecx = inlen
pop    esi               ; esi = data
push   ecx               ; save inlen
shr    ecx, 2            ; inlen /= 4
jz     shx_l2
shx_l0:
lodsd
shx_l1:
clc
call   SIPROUND
loop   shx_l0
shx_l2:
pop    ecx               ; restore inlen
mov    eax, ecx
push   edx               ; save edx
cdq                      ; edx = 0
shl    eax, 24           ; eax = inlen << 24
and    ecx, 3            ; inlen &= 3
shx_l3:
dec    ecx               ; if (--left >= 0)
js     shx_l4            ;   goto shx_l4
shl    edx, 8            ; t <<= 8
mov    dl, byte[esi+ecx] ; t |= in[left]
jmp    shx_l3
shx_l4:
or     eax, edx          ; b |= t
pop    edx               ; restore edx
clc                      ; CF=0
shx_l5:
pushfd                   ; save flags
call   SIPROUND
popfd                    ; restore flags
cmc                      ; CF=1 for last round
jc     shx_l5
xor    v1, v3            ; v[1] ^= v[3]
mov    [esp+28], v1      ; pushad_t._eax = v1
ret

; CF=0 for normal update
; CF=1 for final update
SIPROUND:
push   ecx               ; save ecx
push   cROUNDS
pop    ecx
jnc    sr_l0             ; skip if no carry

xor    eax, eax          ; w=0 if last round
add    ecx, ecx          ; assumes: cROUNDS=2, dROUNDS=4
not    dl                ; requires edx to be v2
sr_l0:
xor    v3, eax           ; v3 ^= w
sr_l1:
add    v0, v1            ; v[0] += v[1];
rol    v1, 5             ; v[1] = ROTL(v[1], 5);
xor    v1, v0            ; v[1] ^= v[0];
rol    v0, 16            ; v[0] = ROTL(v[0], 16);
add    v2, v3            ; v[2] += v[3];
rol    v3, 8             ; v[3] = ROTL(v[3], 8);
xor    v3, v2            ; v[3] ^= v[2];
add    v0, v3            ; v[0] += v[3];
rol    v3, 7             ; v[3] = ROTL(v[3], 7);
xor    v3, v0            ; v[3] ^= v[0];
add    v2, v1            ; v[2] += v[1];
rol    v1, 13            ; v[1] = ROTL(v[1], 13);
xor    v1, v2            ; v[1] ^= v[2];
rol    v2, 16            ; v[2] = ROTL(v[2], 16);
loop   sr_l1
xor    v0, eax           ; v0 ^= w
pop    ecx               ; restore ecx
ret
```

Summary

The code generated by MSVC compiler using /Os /O2 switches resulted in 261 bytes. The assembly version is 148 bytes. See sources here for any future updates.