Xoodoo Permutation Function

Introduction

Xoodoo is a cryptographic permutation function designed by the Keccak team along with Seth Hoffert and Johan De Meulder. It was described by Joan Daemen at the 2017 Workshop on Elliptic Curve Cryptography in a talk: Innovations in permutation-based crypto The design is very similar to the Keccak permutation function, but obviously draws inspiration from the design of Gimli that also uses a 384-bit state, and works efficiently on many platforms. The C code shown here is derived from a python implementation. The Keccak team have posted their own implementations here with further documentation in the Xoodoo cookbook.

C code

When calling this function, the p parameter should point to a 48-byte buffer.

#define R(v,n)(((v)>>(n))|((v)<<(32-(n))))
#define X(u,v)t=s[u],s[u]=s[v],s[v]=t
#define F(n)for(i=0;i<n;i++)
typedef unsigned int W;

void xoodoo(void*p){
  W e[4],a,b,c,t,r,i,*s=p;
  W x[12]={
    0x058,0x038,0x3c0,0x0d0,
    0x120,0x014,0x060,0x02c,
    0x380,0x0f0,0x1a0,0x012};

  for(r=0;r<12;r++){
    F(4)
      e[i]=R(s[i]^s[i+4]^s[i+8],18),
      e[i]^=R(e[i],9);
    F(12)
      s[i]^=e[(i-1)&3];
    X(7,4);X(7,5);X(7,6);
    s[0]^=x[r];
    F(4)
      a=s[i],
      b=s[i+4],
      c=R(s[i+8],21),
      s[i+8]=R((b&~a)^c,24),
      s[i+4]=R((a&~c)^b,31),
      s[i]^=c&~b;
    X(8,10);X(9,11);
  }
}

x86 Assembly

%define x0 eax
%define x1 ebx
%define x2 edx

xoodoo:
_xoodoo:
    pushad
    mov    esi, [esp+32+4]   ; esi = state
    xor    ecx, ecx          ; ecx = 0
    mul    ecx               ; eax = 0, edx = 0
    pushad                   ; allocate 32 bytes of memory
    mov    edi, esp          ; edi = e
    call   ld_rc
    dw     0x58,  0x38, 0x3c0, 0xd0
    dw     0x120, 0x14,  0x60, 0x2c
    dw     0x380, 0xf0, 0x1a0, 0x12
ld_rc:
    pop    ebx                  ; ebx = rc
xoo_main:
    pushad                      ; save all
    movzx  ebp, word[ebx+eax*2] ; ebp = rc[r]
    mov    cl, 4                ; i = 4
    pushad                      ; save all
    ;
    ; Theta
    ;
    ; e[i] = ROTR32(x[i] ^ x[i+4] ^ x[i+8], 18);
    ; e[i]^= ROTR32(e[i], 9);
xd_l1:
    lodsd                    ; eax  = x[i]
    xor    eax, [esi+16-4]   ; eax ^= x[i+4]
    xor    eax, [esi+32-4]   ; eax ^= x[i+8]
    ror    eax, 18
    mov    ebx, eax
    ror    ebx, 9
    xor    eax, ebx
    stosd                    ; e[i] = eax
    loop   xd_l1
    popad                    ; restore all
    ; x[i]^= e[(i - 1) & 3];
    dec    edx               ; edx = -1
    mov    cl, 12
xd_l2:
    mov    eax, edx          ; eax = edx & 3
    and    eax, 3
    mov    eax, [edi+eax*4]  ; eax = e[(i - 1) & 3]
    inc    edx               ; i++
    xor    [esi+edx*4], eax  ; x[i] ^= eax
    loop   xd_l2

    ; XCHG(x[7], x[4]);
    mov    eax, [esi+7*4]
    xchg   eax, [esi+4*4]

    ; XCHG(x[7], x[5]);
    xchg   eax, [esi+5*4]

    ; XCHG(x[7], x[6]);
    xchg   eax, [esi+6*4]
    mov    [esi+7*4], eax

    ; x[0] ^= rc[r];
    xor    [esi], ebp
    mov    cl, 4
    pushad
xd_l6:
    ; x0 = x[i+0];
    lodsd

    ; x1 = x[i+4];
    mov    x1, [esi+16-4]

    ; x2 = ROTR32(x[i+8], 21);
    mov    x2, [esi+32-4]
    ror    x2, 21

    ; x[i+8] = ROTR32((~x0 & x1) ^ x2, 24);
    not    x0
    and    x0, x1
    xor    x0, x2
    ror    x0, 24
    mov    [esi+32-4], x0

    ; x[i+4] = ROTR32((~x2 & x0) ^ x1, 31);
    push   x2
    not    x2
    and    x2, [esi-4]
    xor    x2, x1
    ror    x2, 31
    mov    [esi+16-4], x2
    pop    x2

    ; x[i+0] ^= ~x1 & x2;
    not    x1
    and    x1, x2
    xor    [esi-4], x1
    loop   xd_l6
    popad

    ; XCHG(x[8], x[10]);
    ; XCHG(x[9], x[11]);
    mov    eax, [esi+8*4]
    mov    ebp, [esi+9*4]

    xchg   eax, [esi+10*4]
    xchg   ebp, [esi+11*4]

    mov    [esi+8*4], eax
    mov    [esi+9*4], ebp

    popad

    ; --r
    inc    eax
    cmp    al, 12
    jnz    xoo_main

    ; release memory
    popad
    ; restore registers
    popad
    ret

ARM assembly

ARM offers a nice instruction BIC which is the equivilant of (x & ~y) in C.

This instruction is used in the Rho and Chi modules combined.

.arm
  .arch armv6
  .text
  .align  2

  .global xoodoo

state .req r0
x     .req r1

r     .req r2
i     .req r3

x0    .req r4
x1    .req r5
x2    .req r6
x3    .req r7

rc    .req r8
xt    .req r9

e     .req sp

xoodoo:
    // save registers
    push   {r0-r12, lr}

    mov    r, #12              // 12 rounds
    sub    sp, #16             // allocate 16 bytes
    adr    rc, rc_tab
xoodoo_main:
    mov    i, #0               // i = 0
    mov    x, state
theta_l0:
    ldr    x2, [x, #32]        // x2 = x[i+8]
    ldr    x1, [x, #16]        // x1 = x[i+4]
    ldr    x0, [x], #4         // x0 = x[i+0], advance x by 4

    // e[i] = ROTR32(x[i] ^ x[i+4] ^ x[i+8], 18);
    eor    x0, x1
    eor    x0, x2
    ror    x0, #18

    // e[i]^= ROTR32(e[i], 9);
    eor    x0, x0, ror #9
    str    x0, [sp, i, lsl #2]  // store in e

    add    i, #1               // i++
    cmp    i, #4               // i<4
    bne    theta_l0            //

    // x[i]^= e[(i - 1) & 3];
    mov    i, #0               // i = 0
    mov    x, state            // x = state
theta_l1:
    sub    xt, i, #1
    and    xt, #3               // xt = i & 3
    ldr    xt, [sp, xt, lsl #2] // xt = e[(i - 1) & 3]
    ldr    x0, [x, i, lsl #2]   // x0 = x[i]
    eor    x0, xt               // x0 ^= xt
    str    x0, [x, i, lsl #2]   // x[i] = x0
    add    i, #1                // i++
    cmp    i, #12               // i<12
    bne    theta_l1

    // Rho west
    // XCHG(x[7], x[4]);
    // XCHG(x[7], x[5]);
    // XCHG(x[7], x[6]);
    add    x, state, #16       // x = &state[4]
    ldm    x, {x0, x1, x2, x3}
    mov    xt, x0              // xt = x[4]
    mov    x0, x3              // x[4] = x[7]
    mov    x3, x2              // x[7] = x[6]
    mov    x2, x1              // x[6] = x[5]
    mov    x1, xt              // x[5] = xt
    stm    x, {x0, x1, x2, x3}

    mov    x, state

    // Iota
    // x[0] ^= rc[r];
    ldrh   xt, [rc], #2        // load half-word, advance by 2
    ldr    x0, [x]             // load word
    eor    x0, xt              // xor
    str    x0, [x]             // store word

    mov    i, #4
chi:
    // Chi and Rho east
    // x0 = x[i+0];
    ldr    x0, [x]

    // x1 = x[i+4];
    ldr    x1, [x, #16]

    // x2 = ROTR32(x[i+8], 21);
    ldr    x2, [x, #32]
    ror    x2, #21

    // x[i+8] = ROTR32((x1 & ~x0) ^ x2, 24);
    bic    xt, x1, x0
    eor    xt, x2
    ror    xt, #24
    str    xt, [x, #32]

    // x[i+4] = ROTR32((x0 & ~x2) ^ x1, 31);
    bic    xt, x0, x2
    eor    xt, x1
    ror    xt, #31
    str    xt, [x, #16]

    // x[i+0]^= x2 & ~x1;
    bic    xt, x2, x1
    eor    xt, x0
    str    xt, [x], #4

    // i--
    subs   i, #1
    bne    chi

    add    x, state, #32       // x = &state[8]

    // XCHG(x[8], x[10]);
    ldm    x, {x0, x1, x2, x3}
    push   {x0}
    mov    x0, x2
    pop    {x2}

    // XCHG(x[9], x[11]);
    push   {x1}
    mov    x1, x3
    pop    {x3}
    stm    x, {x0, x1, x2, x3}

    subs   r, #1               // r--
    bne    xoodoo_main         // r>0

    // release stack
    add    sp, #16

    // restore registers, and return
    pop    {r0-r12, pc}

    // round constants
rc_tab:
    .hword 0x058, 0x038, 0x3c0, 0x0d0
    .hword 0x120, 0x014, 0x060, 0x02c
    .hword 0x380, 0x0f0, 0x1a0, 0x012

ARM64 assembly

// Xoodoo in ARM64 assembly
// 268 bytes

    .arch armv8-a
    .text

    .global xoodoo

xoodoo:
    sub    sp, sp, 16          // allocate 16 bytes
    adr    x8, rc
    mov    w9, 12               // 12 rounds
L0:
    mov    w7, 0                // i = 0
    mov    x1, x0
L1:
    ldr    w4, [x1, 32]         // w4 = s[i+8]
    ldr    w3, [x1, 16]         // w3 = s[i+4]
    ldr    w2, [x1], 4          // w2 = s[i+0], advance x1 by 4

    // e[i] = R(s[i] ^ s[i+4] ^ s[i+8], 18);
    eor    w2, w2, w3 
    eor    w2, w2, w4 
    ror    w2, w2, 18

    // e[i] ^= R(e[i], 9);
    eor    w2, w2, w2, ror 9
    str    w2, [sp, x7, lsl 2]  // store in e

    add    w7, w7, 1            // i++
    cmp    w7, 4                // i < 4
    bne    L1                   //

    // s[i]^= e[(i - 1) & 3];
    mov    w7, 0                // i = 0
L2:
    sub    w2, w7, 1
    and    w2, w2, 3            // w2 = i & 3
    ldr    w2, [sp, x2, lsl 2]  // w2 = e[(i - 1) & 3]
    ldr    w3, [x0, x7, lsl 2]  // w3 = s[i]
    eor    w3, w3, w2           // w3 ^= w2 
    str    w3, [x0, x7, lsl 2]  // s[i] = w3 
    add    w7, w7, 1            // i++
    cmp    w7, 12               // i < 12
    bne    L2 

    // Rho west
    // X(s[7], s[4]);
    // X(s[7], s[5]);
    // X(s[7], s[6]);
    ldp    w2, w3, [x0, 16]
    ldp    w4, w5, [x0, 24]
    stp    w5, w2, [x0, 16]
    stp    w3, w4, [x0, 24]

    // Iota
    // s[0] ^= *rc++;
    ldrh   w2, [x8], 2         // load half-word, advance by 2
    ldr    w3, [x0]            // load word
    eor    w3, w3, w2          // xor
    str    w3, [x0]            // store word

    mov    w7, 4
    mov    x1, x0
L3:
    // Chi and Rho east
    // a = s[i+0];
    ldr    w2, [x1]

    // b = s[i+4];
    ldr    w3, [x1, 16]

    // c = R(s[i+8], 21);
    ldr    w4, [x1, 32]
    ror    w4, w4, 21

    // s[i+8] = R((b & ~a) ^ c, 24);
    bic    w5, w3, w2 
    eor    w5, w5, w4 
    ror    w5, w5, 24
    str    w5, [x1, 32]

    // s[i+4] = R((a & ~c) ^ b, 31);
    bic    w5, w2, w4 
    eor    w5, w5, w3 
    ror    w5, w5, 31
    str    w5, [x1, 16]

    // s[i+0]^= c & ~b;
    bic    w5, w4, w3 
    eor    w5, w5, w2 
    str    w5, [x1], 4

    // i--
    subs   w7, w7, 1
    bne    L3 

    // X(s[8], s[10]);
    // X(s[9], s[11]);
    ldp    w2, w3, [x0, 32] // 8, 9
    ldp    w4, w5, [x0, 40] // 10, 11
    stp    w2, w3, [x0, 40]
    stp    w4, w5, [x0, 32]

    subs   w9, w9, 1           // r--
    bne    L0                  // r != 0

    // release stack
    add    sp, sp, 16
    ret
    // round constants
rc:
    .hword 0x058, 0x038, 0x3c0, 0x0d0
    .hword 0x120, 0x014, 0x060, 0x02c
    .hword 0x380, 0x0f0, 0x1a0, 0x012

Summary

Xoodoo with 12 rounds and a 384-bit state is much more compact than Keccak with 22 rounds and an 800-bit state. Presumably Keccak is more secure, but cannot be vectorized and perform as well as Xoodoo or Gimli. Sources here.