Introduction
Xoodoo is a cryptographic permutation function designed by the Keccak team along with Seth Hoffert and Johan De Meulder. It was described by Joan Daemen at the 2017 Workshop on Elliptic Curve Cryptography in a talk: Innovations in permutation-based crypto The design is very similar to the Keccak permutation function, but obviously draws inspiration from the design of Gimli that also uses a 384-bit state, and works efficiently on many platforms. The C code shown here is derived from a python implementation. The Keccak team have posted their own implementations here with further documentation in the Xoodoo cookbook.
C code
When calling this function, the p parameter should point to a 48-byte buffer.
#define R(v,n)(((v)>>(n))|((v)<<(32-(n)))) #define X(u,v)t=s[u],s[u]=s[v],s[v]=t #define F(n)for(i=0;i<n;i++) typedef unsigned int W; void xoodoo(void*p){ W e[4],a,b,c,t,r,i,*s=p; W x[12]={ 0x058,0x038,0x3c0,0x0d0, 0x120,0x014,0x060,0x02c, 0x380,0x0f0,0x1a0,0x012}; for(r=0;r<12;r++){ F(4) e[i]=R(s[i]^s[i+4]^s[i+8],18), e[i]^=R(e[i],9); F(12) s[i]^=e[(i-1)&3]; X(7,4);X(7,5);X(7,6); s[0]^=x[r]; F(4) a=s[i], b=s[i+4], c=R(s[i+8],21), s[i+8]=R((b&~a)^c,24), s[i+4]=R((a&~c)^b,31), s[i]^=c&~b; X(8,10);X(9,11); } }
x86 Assembly
%define x0 eax %define x1 ebx %define x2 edx xoodoo: _xoodoo: pushad mov esi, [esp+32+4] ; esi = state xor ecx, ecx ; ecx = 0 mul ecx ; eax = 0, edx = 0 pushad ; allocate 32 bytes of memory mov edi, esp ; edi = e call ld_rc dw 0x58, 0x38, 0x3c0, 0xd0 dw 0x120, 0x14, 0x60, 0x2c dw 0x380, 0xf0, 0x1a0, 0x12 ld_rc: pop ebx ; ebx = rc xoo_main: pushad ; save all movzx ebp, word[ebx+eax*2] ; ebp = rc[r] mov cl, 4 ; i = 4 pushad ; save all ; ; Theta ; ; e[i] = ROTR32(x[i] ^ x[i+4] ^ x[i+8], 18); ; e[i]^= ROTR32(e[i], 9); xd_l1: lodsd ; eax = x[i] xor eax, [esi+16-4] ; eax ^= x[i+4] xor eax, [esi+32-4] ; eax ^= x[i+8] ror eax, 18 mov ebx, eax ror ebx, 9 xor eax, ebx stosd ; e[i] = eax loop xd_l1 popad ; restore all ; x[i]^= e[(i - 1) & 3]; dec edx ; edx = -1 mov cl, 12 xd_l2: mov eax, edx ; eax = edx & 3 and eax, 3 mov eax, [edi+eax*4] ; eax = e[(i - 1) & 3] inc edx ; i++ xor [esi+edx*4], eax ; x[i] ^= eax loop xd_l2 ; XCHG(x[7], x[4]); mov eax, [esi+7*4] xchg eax, [esi+4*4] ; XCHG(x[7], x[5]); xchg eax, [esi+5*4] ; XCHG(x[7], x[6]); xchg eax, [esi+6*4] mov [esi+7*4], eax ; x[0] ^= rc[r]; xor [esi], ebp mov cl, 4 pushad xd_l6: ; x0 = x[i+0]; lodsd ; x1 = x[i+4]; mov x1, [esi+16-4] ; x2 = ROTR32(x[i+8], 21); mov x2, [esi+32-4] ror x2, 21 ; x[i+8] = ROTR32((~x0 & x1) ^ x2, 24); not x0 and x0, x1 xor x0, x2 ror x0, 24 mov [esi+32-4], x0 ; x[i+4] = ROTR32((~x2 & x0) ^ x1, 31); push x2 not x2 and x2, [esi-4] xor x2, x1 ror x2, 31 mov [esi+16-4], x2 pop x2 ; x[i+0] ^= ~x1 & x2; not x1 and x1, x2 xor [esi-4], x1 loop xd_l6 popad ; XCHG(x[8], x[10]); ; XCHG(x[9], x[11]); mov eax, [esi+8*4] mov ebp, [esi+9*4] xchg eax, [esi+10*4] xchg ebp, [esi+11*4] mov [esi+8*4], eax mov [esi+9*4], ebp popad ; --r inc eax cmp al, 12 jnz xoo_main ; release memory popad ; restore registers popad ret
ARM assembly
ARM offers a nice instruction BIC which is the equivilant of (x & ~y) in C.
This instruction is used in the Rho and Chi modules combined.
.arm .arch armv6 .text .align 2 .global xoodoo state .req r0 x .req r1 r .req r2 i .req r3 x0 .req r4 x1 .req r5 x2 .req r6 x3 .req r7 rc .req r8 xt .req r9 e .req sp xoodoo: // save registers push {r0-r12, lr} mov r, #12 // 12 rounds sub sp, #16 // allocate 16 bytes adr rc, rc_tab xoodoo_main: mov i, #0 // i = 0 mov x, state theta_l0: ldr x2, [x, #32] // x2 = x[i+8] ldr x1, [x, #16] // x1 = x[i+4] ldr x0, [x], #4 // x0 = x[i+0], advance x by 4 // e[i] = ROTR32(x[i] ^ x[i+4] ^ x[i+8], 18); eor x0, x1 eor x0, x2 ror x0, #18 // e[i]^= ROTR32(e[i], 9); eor x0, x0, ror #9 str x0, [sp, i, lsl #2] // store in e add i, #1 // i++ cmp i, #4 // i<4 bne theta_l0 // // x[i]^= e[(i - 1) & 3]; mov i, #0 // i = 0 mov x, state // x = state theta_l1: sub xt, i, #1 and xt, #3 // xt = i & 3 ldr xt, [sp, xt, lsl #2] // xt = e[(i - 1) & 3] ldr x0, [x, i, lsl #2] // x0 = x[i] eor x0, xt // x0 ^= xt str x0, [x, i, lsl #2] // x[i] = x0 add i, #1 // i++ cmp i, #12 // i<12 bne theta_l1 // Rho west // XCHG(x[7], x[4]); // XCHG(x[7], x[5]); // XCHG(x[7], x[6]); add x, state, #16 // x = &state[4] ldm x, {x0, x1, x2, x3} mov xt, x0 // xt = x[4] mov x0, x3 // x[4] = x[7] mov x3, x2 // x[7] = x[6] mov x2, x1 // x[6] = x[5] mov x1, xt // x[5] = xt stm x, {x0, x1, x2, x3} mov x, state // Iota // x[0] ^= rc[r]; ldrh xt, [rc], #2 // load half-word, advance by 2 ldr x0, [x] // load word eor x0, xt // xor str x0, [x] // store word mov i, #4 chi: // Chi and Rho east // x0 = x[i+0]; ldr x0, [x] // x1 = x[i+4]; ldr x1, [x, #16] // x2 = ROTR32(x[i+8], 21); ldr x2, [x, #32] ror x2, #21 // x[i+8] = ROTR32((x1 & ~x0) ^ x2, 24); bic xt, x1, x0 eor xt, x2 ror xt, #24 str xt, [x, #32] // x[i+4] = ROTR32((x0 & ~x2) ^ x1, 31); bic xt, x0, x2 eor xt, x1 ror xt, #31 str xt, [x, #16] // x[i+0]^= x2 & ~x1; bic xt, x2, x1 eor xt, x0 str xt, [x], #4 // i-- subs i, #1 bne chi add x, state, #32 // x = &state[8] // XCHG(x[8], x[10]); ldm x, {x0, x1, x2, x3} push {x0} mov x0, x2 pop {x2} // XCHG(x[9], x[11]); push {x1} mov x1, x3 pop {x3} stm x, {x0, x1, x2, x3} subs r, #1 // r-- bne xoodoo_main // r>0 // release stack add sp, #16 // restore registers, and return pop {r0-r12, pc} // round constants rc_tab: .hword 0x058, 0x038, 0x3c0, 0x0d0 .hword 0x120, 0x014, 0x060, 0x02c .hword 0x380, 0x0f0, 0x1a0, 0x012
ARM64 assembly
// Xoodoo in ARM64 assembly // 268 bytes .arch armv8-a .text .global xoodoo xoodoo: sub sp, sp, 16 // allocate 16 bytes adr x8, rc mov w9, 12 // 12 rounds L0: mov w7, 0 // i = 0 mov x1, x0 L1: ldr w4, [x1, 32] // w4 = s[i+8] ldr w3, [x1, 16] // w3 = s[i+4] ldr w2, [x1], 4 // w2 = s[i+0], advance x1 by 4 // e[i] = R(s[i] ^ s[i+4] ^ s[i+8], 18); eor w2, w2, w3 eor w2, w2, w4 ror w2, w2, 18 // e[i] ^= R(e[i], 9); eor w2, w2, w2, ror 9 str w2, [sp, x7, lsl 2] // store in e add w7, w7, 1 // i++ cmp w7, 4 // i < 4 bne L1 // // s[i]^= e[(i - 1) & 3]; mov w7, 0 // i = 0 L2: sub w2, w7, 1 and w2, w2, 3 // w2 = i & 3 ldr w2, [sp, x2, lsl 2] // w2 = e[(i - 1) & 3] ldr w3, [x0, x7, lsl 2] // w3 = s[i] eor w3, w3, w2 // w3 ^= w2 str w3, [x0, x7, lsl 2] // s[i] = w3 add w7, w7, 1 // i++ cmp w7, 12 // i < 12 bne L2 // Rho west // X(s[7], s[4]); // X(s[7], s[5]); // X(s[7], s[6]); ldp w2, w3, [x0, 16] ldp w4, w5, [x0, 24] stp w5, w2, [x0, 16] stp w3, w4, [x0, 24] // Iota // s[0] ^= *rc++; ldrh w2, [x8], 2 // load half-word, advance by 2 ldr w3, [x0] // load word eor w3, w3, w2 // xor str w3, [x0] // store word mov w7, 4 mov x1, x0 L3: // Chi and Rho east // a = s[i+0]; ldr w2, [x1] // b = s[i+4]; ldr w3, [x1, 16] // c = R(s[i+8], 21); ldr w4, [x1, 32] ror w4, w4, 21 // s[i+8] = R((b & ~a) ^ c, 24); bic w5, w3, w2 eor w5, w5, w4 ror w5, w5, 24 str w5, [x1, 32] // s[i+4] = R((a & ~c) ^ b, 31); bic w5, w2, w4 eor w5, w5, w3 ror w5, w5, 31 str w5, [x1, 16] // s[i+0]^= c & ~b; bic w5, w4, w3 eor w5, w5, w2 str w5, [x1], 4 // i-- subs w7, w7, 1 bne L3 // X(s[8], s[10]); // X(s[9], s[11]); ldp w2, w3, [x0, 32] // 8, 9 ldp w4, w5, [x0, 40] // 10, 11 stp w2, w3, [x0, 40] stp w4, w5, [x0, 32] subs w9, w9, 1 // r-- bne L0 // r != 0 // release stack add sp, sp, 16 ret // round constants rc: .hword 0x058, 0x038, 0x3c0, 0x0d0 .hword 0x120, 0x014, 0x060, 0x02c .hword 0x380, 0x0f0, 0x1a0, 0x012
Summary
Xoodoo with 12 rounds and a 384-bit state is much more compact than Keccak with 22 rounds and an 800-bit state. Presumably Keccak is more secure, but cannot be vectorized and perform as well as Xoodoo or Gimli. Sources here.