Xoodoo Permutation Function

Introduction

Xoodoo is a new permutation function designed by the Keccak team along with Seth Hoffert and Johan De Meulder.

It was described by Joan Daemen at the 2017 Workshop on Elliptic Curve Cryptography in a talk: Innovations in permutation-based crypto

The design is very similar to the Keccak permutation function, but obviously draws inspiration from the design of Gimli which also uses a 384-bit state, and works efficiently on many platforms.

The C code shown here is derived from a python implementation here

C code

There might be some way to make this more compact, but I’m not sure at the moment how.

When calling this function, the state parameter should point to a 48-byte buffer.

void xoodoo(void *state) {
    uint32_t e[4], x0, x1, x2, t;
    int      r, i;
    uint32_t *x=(uint32_t*)state;

    uint32_t rc[12]=
    { 0x058, 0x038, 0x3c0, 0x0d0,
      0x120, 0x014, 0x060, 0x02c,
      0x380, 0x0f0, 0x1a0, 0x012 };

    // 12 rounds by default
    for (r=0; r<12; r++) {
      // Theta
      for (i=0; i<4; i++) {
        e[i] = ROTR32(x[i] ^ x[i+4] ^ x[i+8], 18);
        e[i]^= ROTR32(e[i], 9);
      }

      for (i=0; i<12; i++) {
        x[i]^= e[(i - 1) & 3];
      }

      // Rho west
      XCHG(x[7], x[4]);
      XCHG(x[7], x[5]);
      XCHG(x[7], x[6]);

      // Iota
      x[0] ^= rc[r];

      // Chi and Rho east
      for (i=0; i<4; i++) {
        x0 = x[i+0];
        x1 = x[i+4];
        x2 = ROTR32(x[i+8], 21);

        x[i+8] = ROTR32((~x0 & x1) ^ x2, 24);
        x[i+4] = ROTR32((~x2 & x0) ^ x1, 31);
        x[i+0]^= ~x1 & x2;
      }
      XCHG(x[8], x[10]);
      XCHG(x[9], x[11]);
    }
}

Theta could be rewritten like this.

// Theta
      for (i=0; i<4; i++) {
        e0 = ROTR32(x[i] ^ x[i+4] ^ x[i+8], 18);
        e0^= ROTR32(e0, 9);
        
        XCHG(e0, e1);
        XCHG(e1, e2);
        XCHG(e2, e3);
      }

      for (i=0; i<12; i+=4) {
        x[i+0] ^= e3; x[i+1] ^= e0;
        x[i+2] ^= e1; x[i+3] ^= e2;        
      }

However, It doesn’t work well in x86 assembly, so it wasn’t used.

x86 Assembly

%define x0 eax
%define x1 ebx
%define x2 edx

xoodoo:
_xoodoo:
    pushad
    mov    esi, [esp+32+4]   ; esi = state
    xor    ecx, ecx          ; ecx = 0
    mul    ecx               ; eax = 0, edx = 0
    pushad                   ; allocate 32 bytes of memory
    mov    edi, esp          ; edi = e
    call   ld_rc
    dw     0x58,  0x38, 0x3c0, 0xd0
    dw     0x120, 0x14,  0x60, 0x2c
    dw     0x380, 0xf0, 0x1a0, 0x12
ld_rc:
    pop    ebx                  ; ebx = rc
xoo_main:
    pushad                      ; save all
    movzx  ebp, word[ebx+eax*2] ; ebp = rc[r]
    mov    cl, 4                ; i = 4
    pushad                      ; save all
    ;
    ; Theta
    ;
    ; e[i] = ROTR32(x[i] ^ x[i+4] ^ x[i+8], 18);
    ; e[i]^= ROTR32(e[i], 9);
xd_l1:
    lodsd                    ; eax  = x[i]
    xor    eax, [esi+16-4]   ; eax ^= x[i+4]
    xor    eax, [esi+32-4]   ; eax ^= x[i+8]
    ror    eax, 18
    mov    ebx, eax
    ror    ebx, 9
    xor    eax, ebx
    stosd                    ; e[i] = eax
    loop   xd_l1
    popad                    ; restore all
    ; x[i]^= e[(i - 1) & 3];
    dec    edx               ; edx = -1
    mov    cl, 12
xd_l2:
    mov    eax, edx          ; eax = edx & 3
    and    eax, 3
    mov    eax, [edi+eax*4]  ; eax = e[(i - 1) & 3]
    inc    edx               ; i++
    xor    [esi+edx*4], eax  ; x[i] ^= eax
    loop   xd_l2

    ; XCHG(x[7], x[4]);
    mov    eax, [esi+7*4]
    xchg   eax, [esi+4*4]

    ; XCHG(x[7], x[5]);
    xchg   eax, [esi+5*4]

    ; XCHG(x[7], x[6]);
    xchg   eax, [esi+6*4]
    mov    [esi+7*4], eax

    ; x[0] ^= rc[r];
    xor    [esi], ebp
    mov    cl, 4
    pushad
xd_l6:
    ; x0 = x[i+0];
    lodsd

    ; x1 = x[i+4];
    mov    x1, [esi+16-4]

    ; x2 = ROTR32(x[i+8], 21);
    mov    x2, [esi+32-4]
    ror    x2, 21

    ; x[i+8] = ROTR32((~x0 & x1) ^ x2, 24);
    not    x0
    and    x0, x1
    xor    x0, x2
    ror    x0, 24
    mov    [esi+32-4], x0

    ; x[i+4] = ROTR32((~x2 & x0) ^ x1, 31);
    push   x2
    not    x2
    and    x2, [esi-4]
    xor    x2, x1
    ror    x2, 31
    mov    [esi+16-4], x2
    pop    x2

    ; x[i+0] ^= ~x1 & x2;
    not    x1
    and    x1, x2
    xor    [esi-4], x1
    loop   xd_l6
    popad

    ; XCHG(x[8], x[10]);
    ; XCHG(x[9], x[11]);
    mov    eax, [esi+8*4]
    mov    ebp, [esi+9*4]

    xchg   eax, [esi+10*4]
    xchg   ebp, [esi+11*4]

    mov    [esi+8*4], eax
    mov    [esi+9*4], ebp

    popad

    ; --r
    inc    eax
    cmp    al, 12
    jnz    xoo_main

    ; release memory
    popad
    ; restore registers
    popad
    ret

ARM assembly

ARM offers a nice instruction BIC which is the equivilant of (x & ~y) in C.

This instruction is used in the Rho and Chi modules combined.

.arm
  .arch armv6
  .text
  .align  2

  .global xoodoo

state .req r0
x     .req r1

r     .req r2
i     .req r3

x0    .req r4
x1    .req r5
x2    .req r6
x3    .req r7

rc    .req r8
xt    .req r9

e     .req sp

xoodoo:
    // save registers
    push   {r0-r12, lr}

    mov    r, #12              // 12 rounds
    sub    sp, #16             // allocate 16 bytes
    adr    rc, rc_tab
xoodoo_main:
    mov    i, #0               // i = 0
    mov    x, state
theta_l0:
    ldr    x2, [x, #32]        // x2 = x[i+8]
    ldr    x1, [x, #16]        // x1 = x[i+4]
    ldr    x0, [x], #4         // x0 = x[i+0], advance x by 4

    // e[i] = ROTR32(x[i] ^ x[i+4] ^ x[i+8], 18);
    eor    x0, x1
    eor    x0, x2
    ror    x0, #18

    // e[i]^= ROTR32(e[i], 9);
    eor    x0, x0, ror #9
    str    x0, [sp, i, lsl #2]  // store in e

    add    i, #1               // i++
    cmp    i, #4               // i<4
    bne    theta_l0            //

    // x[i]^= e[(i - 1) & 3];
    mov    i, #0               // i = 0
    mov    x, state            // x = state
theta_l1:
    sub    xt, i, #1
    and    xt, #3               // xt = i & 3
    ldr    xt, [sp, xt, lsl #2] // xt = e[(i - 1) & 3]
    ldr    x0, [x, i, lsl #2]   // x0 = x[i]
    eor    x0, xt               // x0 ^= xt
    str    x0, [x, i, lsl #2]   // x[i] = x0
    add    i, #1                // i++
    cmp    i, #12               // i<12
    bne    theta_l1

    // Rho west
    // XCHG(x[7], x[4]);
    // XCHG(x[7], x[5]);
    // XCHG(x[7], x[6]);
    add    x, state, #16       // x = &state[4]
    ldm    x, {x0, x1, x2, x3}
    mov    xt, x0              // xt = x[4]
    mov    x0, x3              // x[4] = x[7]
    mov    x3, x2              // x[7] = x[6]
    mov    x2, x1              // x[6] = x[5]
    mov    x1, xt              // x[5] = xt
    stm    x, {x0, x1, x2, x3}

    mov    x, state

    // Iota
    // x[0] ^= rc[r];
    ldrh   xt, [rc], #2        // load half-word, advance by 2
    ldr    x0, [x]             // load word
    eor    x0, xt              // xor
    str    x0, [x]             // store word

    mov    i, #4
chi:
    // Chi and Rho east
    // x0 = x[i+0];
    ldr    x0, [x]

    // x1 = x[i+4];
    ldr    x1, [x, #16]

    // x2 = ROTR32(x[i+8], 21);
    ldr    x2, [x, #32]
    ror    x2, #21

    // x[i+8] = ROTR32((x1 & ~x0) ^ x2, 24);
    bic    xt, x1, x0
    eor    xt, x2
    ror    xt, #24
    str    xt, [x, #32]

    // x[i+4] = ROTR32((x0 & ~x2) ^ x1, 31);
    bic    xt, x0, x2
    eor    xt, x1
    ror    xt, #31
    str    xt, [x, #16]

    // x[i+0]^= x2 & ~x1;
    bic    xt, x2, x1
    eor    xt, x0
    str    xt, [x], #4

    // i--
    subs   i, #1
    bne    chi

    add    x, state, #32       // x = &state[8]

    // XCHG(x[8], x[10]);
    ldm    x, {x0, x1, x2, x3}
    push   {x0}
    mov    x0, x2
    pop    {x2}

    // XCHG(x[9], x[11]);
    push   {x1}
    mov    x1, x3
    pop    {x3}
    stm    x, {x0, x1, x2, x3}

    subs   r, #1               // r--
    bne    xoodoo_main         // r>0

    // release stack
    add    sp, #16

    // restore registers, and return
    pop    {r0-r12, pc}

    // round constants
rc_tab:
    .hword 0x058, 0x038, 0x3c0, 0x0d0
    .hword 0x120, 0x014, 0x060, 0x02c
    .hword 0x380, 0x0f0, 0x1a0, 0x012

Summary

Xoodoo with 12 rounds and a 384-bit state is much more compact than Keccak with 22 rounds and an 800-bit state. Presumably Keccak is more secure, but cannot be vectorized and perform as well as Xoodoo or Gimli.

The x86 assembly here is 186 bytes while the C generated assembly is 300. Both could probably be optimized further, but it’s the best I can do at the moment.

Advertisements
This entry was posted in arm, assembly, cryptography, programming, x86 and tagged , , , , , , . Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s