## Xoodoo Permutation Function

### Introduction

Xoodoo is a new permutation function designed by the Keccak team along with Seth Hoffert and Johan De Meulder.

It was described by Joan Daemen at the 2017 Workshop on Elliptic Curve Cryptography in a talk: Innovations in permutation-based crypto

The design is very similar to the Keccak permutation function, but obviously draws inspiration from the design of Gimli which also uses a 384-bit state, and works efficiently on many platforms.

The C code shown here is derived from a python implementation here

September 2018 UPDATE: The Keccak team have posted their own implementations here with further documentation in the Xoodoo cookbook.

### C code

When calling this function, the p parameter should point to a 48-byte buffer.

```#define R(v,n)(((v)>>(n))|((v)<<(32-(n))))
#define X(u,v)t=s[u],s[u]=s[v],s[v]=t
#define F(n)for(i=0;i<n;i++)
typedef unsigned int W;

void xoodoo(void*p){
W e[4],a,b,c,t,r,i,*s=p;
W x[12]={
0x058,0x038,0x3c0,0x0d0,
0x120,0x014,0x060,0x02c,
0x380,0x0f0,0x1a0,0x012};

for(r=0;r<12;r++){
F(4)
e[i]=R(s[i]^s[i+4]^s[i+8],18),
e[i]^=R(e[i],9);
F(12)
s[i]^=e[(i-1)&3];
X(7,4);X(7,5);X(7,6);
s[0]^=x[r];
F(4)
a=s[i],
b=s[i+4],
c=R(s[i+8],21),
s[i+8]=R((b&~a)^c,24),
s[i+4]=R((a&~c)^b,31),
s[i]^=c&~b;
X(8,10);X(9,11);
}
}
```

### x86 Assembly

```%define x0 eax
%define x1 ebx
%define x2 edx

xoodoo:
_xoodoo:
pushad
mov    esi, [esp+32+4]   ; esi = state
xor    ecx, ecx          ; ecx = 0
mul    ecx               ; eax = 0, edx = 0
pushad                   ; allocate 32 bytes of memory
mov    edi, esp          ; edi = e
call   ld_rc
dw     0x58,  0x38, 0x3c0, 0xd0
dw     0x120, 0x14,  0x60, 0x2c
dw     0x380, 0xf0, 0x1a0, 0x12
ld_rc:
pop    ebx                  ; ebx = rc
xoo_main:
pushad                      ; save all
movzx  ebp, word[ebx+eax*2] ; ebp = rc[r]
mov    cl, 4                ; i = 4
pushad                      ; save all
;
; Theta
;
; e[i] = ROTR32(x[i] ^ x[i+4] ^ x[i+8], 18);
; e[i]^= ROTR32(e[i], 9);
xd_l1:
lodsd                    ; eax  = x[i]
xor    eax, [esi+16-4]   ; eax ^= x[i+4]
xor    eax, [esi+32-4]   ; eax ^= x[i+8]
ror    eax, 18
mov    ebx, eax
ror    ebx, 9
xor    eax, ebx
stosd                    ; e[i] = eax
loop   xd_l1
popad                    ; restore all
; x[i]^= e[(i - 1) & 3];
dec    edx               ; edx = -1
mov    cl, 12
xd_l2:
mov    eax, edx          ; eax = edx & 3
and    eax, 3
mov    eax, [edi+eax*4]  ; eax = e[(i - 1) & 3]
inc    edx               ; i++
xor    [esi+edx*4], eax  ; x[i] ^= eax
loop   xd_l2

; XCHG(x[7], x[4]);
mov    eax, [esi+7*4]
xchg   eax, [esi+4*4]

; XCHG(x[7], x[5]);
xchg   eax, [esi+5*4]

; XCHG(x[7], x[6]);
xchg   eax, [esi+6*4]
mov    [esi+7*4], eax

; x[0] ^= rc[r];
xor    [esi], ebp
mov    cl, 4
pushad
xd_l6:
; x0 = x[i+0];
lodsd

; x1 = x[i+4];
mov    x1, [esi+16-4]

; x2 = ROTR32(x[i+8], 21);
mov    x2, [esi+32-4]
ror    x2, 21

; x[i+8] = ROTR32((~x0 & x1) ^ x2, 24);
not    x0
and    x0, x1
xor    x0, x2
ror    x0, 24
mov    [esi+32-4], x0

; x[i+4] = ROTR32((~x2 & x0) ^ x1, 31);
push   x2
not    x2
and    x2, [esi-4]
xor    x2, x1
ror    x2, 31
mov    [esi+16-4], x2
pop    x2

; x[i+0] ^= ~x1 & x2;
not    x1
and    x1, x2
xor    [esi-4], x1
loop   xd_l6
popad

; XCHG(x[8], x[10]);
; XCHG(x[9], x[11]);
mov    eax, [esi+8*4]
mov    ebp, [esi+9*4]

xchg   eax, [esi+10*4]
xchg   ebp, [esi+11*4]

mov    [esi+8*4], eax
mov    [esi+9*4], ebp

popad

; --r
inc    eax
cmp    al, 12
jnz    xoo_main

; release memory
popad
; restore registers
popad
ret
```

### ARM assembly

ARM offers a nice instruction BIC which is the equivilant of (x & ~y) in C.

This instruction is used in the Rho and Chi modules combined.

```.arm
.arch armv6
.text
.align  2

.global xoodoo

state .req r0
x     .req r1

r     .req r2
i     .req r3

x0    .req r4
x1    .req r5
x2    .req r6
x3    .req r7

rc    .req r8
xt    .req r9

e     .req sp

xoodoo:
// save registers
push   {r0-r12, lr}

mov    r, #12              // 12 rounds
sub    sp, #16             // allocate 16 bytes
adr    rc, rc_tab
xoodoo_main:
mov    i, #0               // i = 0
mov    x, state
theta_l0:
ldr    x2, [x, #32]        // x2 = x[i+8]
ldr    x1, [x, #16]        // x1 = x[i+4]
ldr    x0, [x], #4         // x0 = x[i+0], advance x by 4

// e[i] = ROTR32(x[i] ^ x[i+4] ^ x[i+8], 18);
eor    x0, x1
eor    x0, x2
ror    x0, #18

// e[i]^= ROTR32(e[i], 9);
eor    x0, x0, ror #9
str    x0, [sp, i, lsl #2]  // store in e

add    i, #1               // i++
cmp    i, #4               // i<4
bne    theta_l0            //

// x[i]^= e[(i - 1) & 3];
mov    i, #0               // i = 0
mov    x, state            // x = state
theta_l1:
sub    xt, i, #1
and    xt, #3               // xt = i & 3
ldr    xt, [sp, xt, lsl #2] // xt = e[(i - 1) & 3]
ldr    x0, [x, i, lsl #2]   // x0 = x[i]
eor    x0, xt               // x0 ^= xt
str    x0, [x, i, lsl #2]   // x[i] = x0
add    i, #1                // i++
cmp    i, #12               // i<12
bne    theta_l1

// Rho west
// XCHG(x[7], x[4]);
// XCHG(x[7], x[5]);
// XCHG(x[7], x[6]);
add    x, state, #16       // x = &state[4]
ldm    x, {x0, x1, x2, x3}
mov    xt, x0              // xt = x[4]
mov    x0, x3              // x[4] = x[7]
mov    x3, x2              // x[7] = x[6]
mov    x2, x1              // x[6] = x[5]
mov    x1, xt              // x[5] = xt
stm    x, {x0, x1, x2, x3}

mov    x, state

// Iota
// x[0] ^= rc[r];
ldrh   xt, [rc], #2        // load half-word, advance by 2
ldr    x0, [x]             // load word
eor    x0, xt              // xor
str    x0, [x]             // store word

mov    i, #4
chi:
// Chi and Rho east
// x0 = x[i+0];
ldr    x0, [x]

// x1 = x[i+4];
ldr    x1, [x, #16]

// x2 = ROTR32(x[i+8], 21);
ldr    x2, [x, #32]
ror    x2, #21

// x[i+8] = ROTR32((x1 & ~x0) ^ x2, 24);
bic    xt, x1, x0
eor    xt, x2
ror    xt, #24
str    xt, [x, #32]

// x[i+4] = ROTR32((x0 & ~x2) ^ x1, 31);
bic    xt, x0, x2
eor    xt, x1
ror    xt, #31
str    xt, [x, #16]

// x[i+0]^= x2 & ~x1;
bic    xt, x2, x1
eor    xt, x0
str    xt, [x], #4

// i--
subs   i, #1
bne    chi

add    x, state, #32       // x = &state[8]

// XCHG(x[8], x[10]);
ldm    x, {x0, x1, x2, x3}
push   {x0}
mov    x0, x2
pop    {x2}

// XCHG(x[9], x[11]);
push   {x1}
mov    x1, x3
pop    {x3}
stm    x, {x0, x1, x2, x3}

subs   r, #1               // r--
bne    xoodoo_main         // r>0

// release stack
add    sp, #16

// restore registers, and return
pop    {r0-r12, pc}

// round constants
rc_tab:
.hword 0x058, 0x038, 0x3c0, 0x0d0
.hword 0x120, 0x014, 0x060, 0x02c
.hword 0x380, 0x0f0, 0x1a0, 0x012
```

### ARM64 assembly

```// Xoodoo in ARM64 assembly
// 268 bytes

.arch armv8-a
.text

.global xoodoo

xoodoo:
sub    sp, sp, 16          // allocate 16 bytes
adr    x8, rc
mov    w9, 12               // 12 rounds
L0:
mov    w7, 0                // i = 0
mov    x1, x0
L1:
ldr    w4, [x1, 32]         // w4 = s[i+8]
ldr    w3, [x1, 16]         // w3 = s[i+4]
ldr    w2, [x1], 4          // w2 = s[i+0], advance x1 by 4

// e[i] = R(s[i] ^ s[i+4] ^ s[i+8], 18);
eor    w2, w2, w3
eor    w2, w2, w4
ror    w2, w2, 18

// e[i] ^= R(e[i], 9);
eor    w2, w2, w2, ror 9
str    w2, [sp, x7, lsl 2]  // store in e

add    w7, w7, 1            // i++
cmp    w7, 4                // i < 4
bne    L1                   //

// s[i]^= e[(i - 1) & 3];
mov    w7, 0                // i = 0
L2:
sub    w2, w7, 1
and    w2, w2, 3            // w2 = i & 3
ldr    w2, [sp, x2, lsl 2]  // w2 = e[(i - 1) & 3]
ldr    w3, [x0, x7, lsl 2]  // w3 = s[i]
eor    w3, w3, w2           // w3 ^= w2
str    w3, [x0, x7, lsl 2]  // s[i] = w3
add    w7, w7, 1            // i++
cmp    w7, 12               // i < 12
bne    L2

// Rho west
// X(s[7], s[4]);
// X(s[7], s[5]);
// X(s[7], s[6]);
ldp    w2, w3, [x0, 16]
ldp    w4, w5, [x0, 24]
stp    w5, w2, [x0, 16]
stp    w3, w4, [x0, 24]

// Iota
// s[0] ^= *rc++;
ldrh   w2, [x8], 2         // load half-word, advance by 2
ldr    w3, [x0]            // load word
eor    w3, w3, w2          // xor
str    w3, [x0]            // store word

mov    w7, 4
mov    x1, x0
L3:
// Chi and Rho east
// a = s[i+0];
ldr    w2, [x1]

// b = s[i+4];
ldr    w3, [x1, 16]

// c = R(s[i+8], 21);
ldr    w4, [x1, 32]
ror    w4, w4, 21

// s[i+8] = R((b & ~a) ^ c, 24);
bic    w5, w3, w2
eor    w5, w5, w4
ror    w5, w5, 24
str    w5, [x1, 32]

// s[i+4] = R((a & ~c) ^ b, 31);
bic    w5, w2, w4
eor    w5, w5, w3
ror    w5, w5, 31
str    w5, [x1, 16]

// s[i+0]^= c & ~b;
bic    w5, w4, w3
eor    w5, w5, w2
str    w5, [x1], 4

// i--
subs   w7, w7, 1
bne    L3

// X(s[8], s[10]);
// X(s[9], s[11]);
ldp    w2, w3, [x0, 32] // 8, 9
ldp    w4, w5, [x0, 40] // 10, 11
stp    w2, w3, [x0, 40]
stp    w4, w5, [x0, 32]

subs   w9, w9, 1           // r--
bne    L0                  // r != 0

// release stack
add    sp, sp, 16
ret
// round constants
rc:
.hword 0x058, 0x038, 0x3c0, 0x0d0
.hword 0x120, 0x014, 0x060, 0x02c
.hword 0x380, 0x0f0, 0x1a0, 0x012
```

### Summary

Xoodoo with 12 rounds and a 384-bit state is much more compact than Keccak with 22 rounds and an 800-bit state. Presumably Keccak is more secure, but cannot be vectorized and perform as well as Xoodoo or Gimli.

The x86 assembly here is 186 bytes while the C generated assembly is 300.

Sources here.

Advertisements
This entry was posted in arm, assembly, cryptography, programming, x86 and tagged , , , , , , . Bookmark the permalink.