Keccak Permutation Function

Introduction

Permutation functions are incredibly useful, and Keccak which is used to implement the SHA-3 standard is a like a swiss army knife of cryptographic primitives.

Designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche, Keccak can be used to construct a stream cipher, a cryptographic hash function, a pseudo random number generator(PRNG), message authentication code (MAC) or authenticated encryption. (AE)

Although SHA-3 (FIPS 202) was covered in a previous post, the underlying permutation function (Keccak-p[1600, 24]) was for the most part overlooked.

A lightweight implementation of Keccak-p[800, 22] for 32-bit architectures will be shown here, specifically the x86 CPU.

Keccak Parameters

F Target Architecture Capacity Rounds
p200 8-bit 200-bits 18
p400 16-bit 400-bits 20
p800 32-bit 800-bits 22
p1600 64-bit 1600-bits 24

Keccak Modules

Module Description
Theta Renders the internal state into a 5-by-5 array of n-bit elements. Computes the parity of each column and combines them with an exclusive-or (XOR) operator.Then it XORs the resulting parity to each state bit as follows.
Rho Rotates each element by a triangular number.
Pi Permutes the elements.
Chi Adds a non-linear aspect to the permutation round. Combines the row elements using only three bitwise operators: AND, NOT, and XOR.
Iota Breaks up any symmetry caused by the other modules. This is done by XORing one of the array elements to a round constant. Without iota, the round mapping would be symmetric. Without iota, all rounds would be the same.

Keccak-p800 in C

```// round constant function
// Primitive polynomial over GF(2): x^8+x^6+x^5+x^4+1
uint32_t rc (uint8_t *LFSR) {
uint32_t c;
int8_t   t;
uint8_t  i;

c = 0;
t = *LFSR;

for (i=1; i<128; i += i)
{
if (t & 1) {
// if shift value is < 32
if ((i-1) < 32) {
c ^= 1UL << (i - 1);
}
}
t = (t & 0x80) ? (t << 1) ^ 0x71 : t << 1;
}
*LFSR = (uint8_t)t;
return c;
}

void k800_permute (void *state) {
uint32_t i, j, r, t, u, bc[5];
uint8_t  lfsr=1;
uint32_t *st=(uint32_t*)state;
uint8_t  *p, *m;
int      rnd;

uint32_t piln[6]=
{ 0x110b070a, 0x10050312, 0x04181508,
0x0d13170f, 0x0e14020c, 0x01060916 };

uint32_t m5[3]=
{ 0x03020100, 0x02010004, 0x00000403 };

p = (uint8_t*)piln;
m = (uint8_t*)m5;

for (rnd=0; rnd<22; rnd++) {
// Theta
for (i=0; i<5; i++) {
t  = st[i];
t ^= st[i +  5];
t ^= st[i + 10];
t ^= st[i + 15];
t ^= st[i + 20];
bc[i] = t;
}
for (i=0; i<5; i++) {
t  = bc[m[(i + 4)]];
t ^= ROTL32(bc[m[(i + 1)]], 1);
for (j=i; j<25; j+=5) {
st[j] ^= t;
}
}
// Rho + Pi
u = st[1];
for (i=0, r=0; i<24; i++) {
r = r + i + 1;
u  = ROTL32(u, r);
XCHG(st[p[i]], u);
bc[0] = u;
}
// Chi
for (i=0; i<25; i+=5) {
memcpy(&bc, &st[i], 5*4);
for (j=0; j<5; j++) {
t  = ~bc[m[(j + 1)]];
t &=  bc[m[(j + 2)]];
st[j + i] ^= t;
}
}
// Iota
st[0] ^= rc(&lfsr);
}
}
```

32-bit x86 Assembly

The following code includes optmizations from Peter Ferrie

```bits   32

struc kws_t
bc1  resd 1 ; edi
bc2  resd 1 ; esi
bc3  resd 1 ; ebp
bc4  resd 1 ; esp
bc5  resd 1 ; ebx
lfsr resd 1 ; edx
; ecx
; eax
.size:
endstruc

%ifndef BIN
global k800_permutex
global _k800_permutex
%endif

; void k800_permutex(void *state);
k800_permutex:
_k800_permutex:
mov    esi, [esp+32+4]      ; esi = st
call   k800_l0
; modulo 5
dd     003020100h, 002010004h, 000000403h
; rho pi
dd     0110b070ah, 010050312h, 004181508h
dd     00d13170fh, 00e14020ch, 001060916h
k800_l0:
pop    ebx                  ; m + p
push   22
pop    eax
cdq
inc    edx                  ; lfsr = 1
mov    edi, esp             ; edi = bc
k800_l1:
push   eax
push   5
pop    ecx
theta_l0:
; Theta
lodsd                       ; t  = st[i     ];
xor    eax, [esi+ 5*4-4]    ; t ^= st[i +  5];
xor    eax, [esi+10*4-4]    ; t ^= st[i + 10];
xor    eax, [esi+15*4-4]    ; t ^= st[i + 15];
xor    eax, [esi+20*4-4]    ; t ^= st[i + 20];
stosd                       ; bc[i] = t;
loop   theta_l0
xor    eax, eax
theta_l1:
movzx  ebp, byte[ebx+eax+4] ; ebp = m[(i + 4)];
mov    ebp, [edi+ebp*4]     ; t   = bc[m[(i + 4)]];
movzx  edx, byte[ebx+eax+1] ; edx = m[(i + 1)];
mov    edx, [edi+edx*4]     ; edx = bc[m[(i + 1)]];
rol    edx, 1               ; t  ^= ROTL32(edx, 1);
xor    ebp, edx
theta_l2:
xor    [esi+eax*4], ebp     ; st[j] ^= t;
cmp    al, 25               ; j<25
jb     theta_l2
sub    al, 24               ; i=i+1
loop   theta_l1             ; i<5
; *************************************
; Rho Pi
; *************************************
mov    ebp, [esi+1*4]       ; t = st[1];
rho_l0:
lea    ecx, [ecx+eax-4]     ; r = r + i + 1;
rol    ebp, cl              ; t = ROTL32(t, r);
movzx  edx, byte[ebx+eax+7] ; edx = p[i];
xchg   [esi+edx*4], ebp     ; XCHG(st[p[i]], t);
inc    eax                  ; i++
cmp    al, 24+5             ; i<24
jnz    rho_l0               ;
; *************************************
; Chi
; *************************************
xor    ecx, ecx             ; i = 0
chi_l0:
; memcpy(&bc, &st[i], 5*4);
lea    esi, [esi+ecx*4]     ; esi = &st[i];
mov    cl, 5
rep    movsd
xor    eax, eax
chi_l1:
movzx  ebp, byte[ebx+eax+1]
movzx  edx, byte[ebx+eax+2]
mov    ebp, [edi+ebp*4]     ; t = ~bc[m[(i + 1)]]
not    ebp
and    ebp, [edi+edx*4]
lea    edx, [eax+ecx]       ; edx = j + i
xor    [esi+edx*4], ebp     ; st[j + i] ^= t;
inc    eax                  ; j++
cmp    al, 5                ; j<5
jnz    chi_l1
cmp    cl, 25               ; i<25
jb     chi_l0
; Iota
mov    al, [esp+kws_t+lfsr+4]; al = t = *LFSR
mov    dl, 1                ; i = 1
xor    ebp, ebp
iota_l0:
test   al, 1                ; t & 1
je     iota_l1
lea    ecx, [edx-1]         ; ecx = (i - 1)
cmp    cl, 32               ; skip if (ecx >= 32)
jae    iota_l1
btc    ebp, ecx             ; c ^= 1UL << (i - 1)
iota_l1:
add    al, al               ; t << 1
sbb    ah, ah               ; ah = (t < 0) ? 0x00 : 0xFF
and    ah, 0x71             ; ah = (ah == 0xFF) ? 0x71 : 0x00
xor    al, ah
add    dl, dl               ; i += i
jns    iota_l0              ; while (i != 128)
mov    [esp+kws_t+lfsr+4], al; save t
xor    [esi], ebp           ; st[0] ^= rc(&lfsr);
pop    eax
dec    eax
jnz    k800_l1              ; rnds<22