Keccak Permutation Function

Introduction

Permutation functions are incredibly useful, and Keccak which is used to implement the SHA-3 standard is a like a swiss army knife of cryptographic primitives.

Designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche, Keccak can be used to construct a stream cipher, a cryptographic hash function, a pseudo random number generator(PRNG), message authentication code (MAC) or authenticated encryption. (AE)

Although SHA-3 (FIPS 202) was covered in a previous post, the underlying permutation function (Keccak-p[1600, 24]) was for the most part overlooked.

A lightweight implementation of Keccak-p[800, 22] for 32-bit architectures will be shown here, specifically the x86 CPU.

Keccak Parameters

F Target Architecture Capacity Rounds
p200 8-bit 200-bits 18
p400 16-bit 400-bits 20
p800 32-bit 800-bits 22
p1600 64-bit 1600-bits 24

Keccak Modules

Module Description
Theta Renders the internal state into a 5-by-5 array of n-bit elements. Computes the parity of each column and combines them with an exclusive-or (XOR) operator.Then it XORs the resulting parity to each state bit as follows.
Rho Rotates each element by a triangular number.
Pi Permutes the elements.
Chi Adds a non-linear aspect to the permutation round. Combines the row elements using only three bitwise operators: AND, NOT, and XOR.
Iota Breaks up any symmetry caused by the other modules. This is done by XORing one of the array elements to a round constant. Without iota, the round mapping would be symmetric. Without iota, all rounds would be the same.

Keccak-p800 in C

// round constant function
// Primitive polynomial over GF(2): x^8+x^6+x^5+x^4+1
uint32_t rc (uint8_t *LFSR) {
    uint32_t c; 
    int8_t   t;
    uint8_t  i;

    c = 0;
    t = *LFSR;
    
    for (i=1; i<128; i += i) 
    {
      if (t & 1) {
        // if shift value is < 32
        if ((i-1) < 32) {
          c ^= 1UL << (i - 1);
        }
      }
      t = (t & 0x80) ? (t << 1) ^ 0x71 : t << 1;
    }
    *LFSR = (uint8_t)t;
    return c;
}

void k800_permute (void *state) {
    uint32_t i, j, r, t, u, bc[5];
    uint8_t  lfsr=1;
    uint32_t *st=(uint32_t*)state;
    uint8_t  *p, *m;
    int      rnd;
    
    uint32_t piln[6]=
    { 0x110b070a, 0x10050312, 0x04181508, 
      0x0d13170f, 0x0e14020c, 0x01060916 };

    uint32_t m5[3]=
    { 0x03020100, 0x02010004, 0x00000403 };
    
    p = (uint8_t*)piln;
    m = (uint8_t*)m5;
    
    for (rnd=0; rnd<22; rnd++) {
      // Theta
      for (i=0; i<5; i++) {     
        t  = st[i]; 
        t ^= st[i +  5]; 
        t ^= st[i + 10]; 
        t ^= st[i + 15]; 
        t ^= st[i + 20];
        bc[i] = t;
      }
      for (i=0; i<5; i++) {
        t  = bc[m[(i + 4)]]; 
        t ^= ROTL32(bc[m[(i + 1)]], 1);
        for (j=i; j<25; j+=5) {
          st[j] ^= t;
        }
      }
      // Rho + Pi
      u = st[1];      
      for (i=0, r=0; i<24; i++) {
        r = r + i + 1;       
        u  = ROTL32(u, r);
        XCHG(st[p[i]], u);
        bc[0] = u;
      }
      // Chi
      for (i=0; i<25; i+=5) {
        memcpy(&bc, &st[i], 5*4);      
        for (j=0; j<5; j++) {
          t  = ~bc[m[(j + 1)]];
          t &=  bc[m[(j + 2)]];        
          st[j + i] ^= t;
        }
      }
      // Iota
      st[0] ^= rc(&lfsr);
    }
}

32-bit x86 Assembly

; -----------------------------------------------
; Keccak-p800 in x86 assembly
;
; size: 254 bytes
;
; global calls use cdecl convention
;
; -----------------------------------------------
    bits   32
  
struc kws_t
  bc1  resd 1 ; edi
  bc2  resd 1 ; esi
  bc3  resd 1 ; ebp
  bc4  resd 1 ; esp
  bc5  resd 1 ; ebx
  lfsr resd 1 ; edx
  ; ecx
  ; eax
  .size:
endstruc
  
    %ifndef BIN
      global k800_permutex
      global _k800_permutex
    %endif
    
k800_permutex:
_k800_permutex:
    pushad
    mov    esi, [esp+32+4]      ; esi = st
    call   k800_l0
    ; modulo 5    
    dd     003020100h, 002010004h, 000000403h
    ; rho pi
    dd     0110b070ah, 010050312h, 004181508h 
    dd     00d13170fh, 00e14020ch, 001060916h
k800_l0:
    pop    ebx                  ; m + p
    push   22
    pop    eax
    cdq
    inc    edx                  ; lfsr = 1    
    pushad                      ; create local space
    mov    edi, esp             ; edi = bc   
k800_l1:    
    push   eax    
    push   5 
    pop    ecx    
    pushad
theta_l0:
    ; Theta
    lodsd                       ; t  = st[i     ];  
    xor    eax, [esi+ 5*4-4]    ; t ^= st[i +  5];
    xor    eax, [esi+10*4-4]    ; t ^= st[i + 10];
    xor    eax, [esi+15*4-4]    ; t ^= st[i + 15];
    xor    eax, [esi+20*4-4]    ; t ^= st[i + 20];
    stosd                       ; bc[i] = t;
    loop   theta_l0    
    
    popad
    xor    eax, eax    
theta_l1:
    movzx  ebp, byte[ebx+eax+4] ; ebp = m[(i + 4)];
    mov    ebp, [edi+ebp*4]     ; t   = bc[m[(i + 4)]];    
    movzx  edx, byte[ebx+eax+1] ; edx = m[(i + 1)];
    mov    edx, [edi+edx*4]     ; edx = bc[m[(i + 1)]];
    rol    edx, 1               ; t ^= ROTL32(edx, 1);
    xor    ebp, edx
    push   eax                  ; save i
theta_l2:
    xor    [esi+eax*4], ebp     ; st[j] ^= t;
    add    al, 5                ; j+=5 
    cmp    al, 25               ; j<25
    jb     theta_l2
    
    pop    eax                  ; restore i    
    inc    eax                  ; i++
    cmp    al, 5                ; i<5
    jnz    theta_l1
    ; *************************************
    ; Rho Pi
    ; *************************************
    mov    ebp, [esi+1*4]       ; t = st[1];
    xor    eax, eax 
    xor    ecx, ecx             ; r = 0;
rho_l0:
    lea    ecx, [ecx+eax+1]     ; r = r + i + 1;
    rol    ebp, cl              ; t = ROTL32(t, r); 
    movzx  edx, byte[ebx+eax+12]; edx = p[i];
    xchg   [esi+edx*4], ebp     ; XCHG(st[p[i]], t);
    mov    [edi+0*4], ebp       ; bc[0] = t;
    inc    eax                  ; i++
    cmp    al, 24               ; i<24
    jnz    rho_l0               ; 
    ; *************************************
    ; Chi
    ; *************************************
    xor    ecx, ecx             ; i = 0   
chi_l0:    
    pushad
    ; memcpy(&bc, &st[i], 5*4);
    lea    esi, [esi+ecx*4]     ; esi = &st[i];
    mov    cl, 5
    rep    movsd
    popad
    xor    eax, eax
chi_l1:
    movzx  ebp, byte[ebx+eax+1]
    movzx  edx, byte[ebx+eax+2]
    mov    ebp, [edi+ebp*4]     ; t = ~bc[m[(i + 1)]] 
    not    ebp            
    and    ebp, [edi+edx*4]
    lea    edx, [eax+ecx]       ; edx = j + i    
    xor    [esi+edx*4], ebp     ; st[j + i] ^= t;  
    inc    eax                  ; j++
    cmp    al, 5                ; j<5
    jnz    chi_l1    
    
    add    cl, 5                ; i+=5;
    cmp    cl, 25               ; i<25
    jnz    chi_l0

    ; Iota
    lea    eax, [esp+kws_t+lfsr+4]; eax = lfsr
    pushad
    xor    esi, esi             ; esi = 0
    xchg   eax, esi             ; esi = &lfsr, eax = 0
    mov    edi, esi             ; edi = &lfsr
    cdq                         ; i = 0
    xchg   eax, ebx             ; c = 0
    inc    edx                  ; i = 1
    lodsb                       ; al = t = *LFSR
iota_l0:    
    test   al, 1                ; t & 1
    je     iota_l1    
    lea    ecx, [edx-1]         ; ecx = (i - 1)
    cmp    cl, 32               ; skip if (ecx >= 32)
    jae    iota_l1    
    btc    ebx, ecx             ; c ^= 1UL << (i - 1)
iota_l1:    
    add    al, al               ; t << 1
    sbb    ah, ah               ; ah = (t < 0) ? 0x00 : 0xFF
    and    ah, 0x71             ; ah = (ah == 0xFF) ? 0x71 : 0x00  
    xor    al, ah  
    add    dl, dl               ; i += i
    jns    iota_l0              ; while (i != 128)
    stosb                       ; save t
    mov    [esp+28], ebx        ; return c & 255
    popad        
    
    xor    [esi], eax           ; st[0] ^= rc(&lfsr);  
    
    pop    eax
    dec    eax
    jnz    k800_l1              ; rnds<22
    
    popad                       ; release bc
    popad                       ; restore registers 
    ret

Summary

The x86 assembly generated by MSVC is approx. 342 bytes while the hand written x86 assembly is currently 254 bytes.

Advertisements
This entry was posted in assembly, cryptography, encryption, programming and tagged , , , . Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s