Xoodoo Permutation Function

Introduction

Xoodoo is a new permutation function designed by the Keccak team together with Seth Hoffert.

It was described by Joan Daemen at the 2017 Workshop on Elliptic Curve Cryptography in a talk entitled “Innovations in permutation-based crypto”.

The design is very similar to the Keccak permutation function, but it clearly draws inspiration from Gimli, which also uses a 384-bit state and works efficiently on many platforms.

The C code shown here is derived from a Python implementation here.

C code

There might be some way to make this more compact, but I’m not sure how at the moment.
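The rotate and swap helper macros used by the C code in this and the following posts (ROTR32, ROTL32, ROTL8, XCHG, MIN and friends) aren’t shown in the original snippets. Here is a minimal sketch of plausible definitions; the names come from the code, the bodies are my assumption:

#include <stdint.h>
#include <string.h>

// rotations; n is assumed to be in the range 1..width-1
#define ROTR32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))
#define ROTL32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
#define ROTR64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))
#define ROTL64(x,n) (((x) << (n)) | ((x) >> (64 - (n))))
#define ROTL8(x,n)  ((uint8_t)(((uint8_t)(x) << (n)) | ((uint8_t)(x) >> (8 - (n)))))

#define MIN(a,b) ((a) < (b) ? (a) : (b))

// swap two values; assumes a variable t of a compatible type is in scope
#define XCHG(a,b) { t = (a); (a) = (b); (b) = t; }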

When calling this function, the state parameter should point to a 48-byte buffer.

void xoodoo(void *state) {
    uint32_t e[4], x0, x1, x2, t;
    int      r, i;
    uint32_t *x=(uint32_t*)state;

    uint32_t rc[12]=
    { 0x058, 0x038, 0x3c0, 0x0d0,
      0x120, 0x014, 0x060, 0x02c,
      0x380, 0x0f0, 0x1a0, 0x012 };

    // 12 rounds by default
    for (r=0; r<12; r++) {
      // Theta
      for (i=0; i<4; i++) {
        e[i] = ROTR32(x[i] ^ x[i+4] ^ x[i+8], 18);
        e[i]^= ROTR32(e[i], 9);
      }

      for (i=0; i<12; i++) {
        x[i]^= e[(i - 1) & 3];
      }

      // Rho west
      XCHG(x[7], x[4]);
      XCHG(x[7], x[5]);
      XCHG(x[7], x[6]);

      // Iota
      x[0] ^= rc[r];

      // Chi and Rho east
      for (i=0; i<4; i++) {
        x0 = x[i+0];
        x1 = x[i+4];
        x2 = ROTR32(x[i+8], 21);

        x[i+8] = ROTR32((~x0 & x1) ^ x2, 24);
        x[i+4] = ROTR32((~x2 & x0) ^ x1, 31);
        x[i+0]^= ~x1 & x2;
      }
      XCHG(x[8], x[10]);
      XCHG(x[9], x[11]);
    }
}

Theta could be rewritten like this.

// Theta
      for (i=0; i<4; i++) {
        e0 = ROTR32(x[i] ^ x[i+4] ^ x[i+8], 18);
        e0^= ROTR32(e0, 9);
        
        XCHG(e0, e1);
        XCHG(e1, e2);
        XCHG(e2, e3);
      }

      for (i=0; i<12; i+=4) {
        x[i+0] ^= e3; x[i+1] ^= e0;
        x[i+2] ^= e1; x[i+3] ^= e2;        
      }

However, it doesn’t translate well to x86 assembly, so it wasn’t used.
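For illustration, a minimal harness showing the calling convention; the all-zero input is just an example, not an official test vector:

#include <stdio.h>
#include <stdint.h>

int main(void) {
    // 48-byte (384-bit) state, zero-initialized for demonstration
    uint8_t state[48] = {0};

    // apply the 12-round permutation in place
    xoodoo(state);

    // dump the permuted state as hex
    for (int i = 0; i < 48; i++) printf("%02x", state[i]);
    putchar('\n');
    return 0;
}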

x86 Assembly

%define x0 eax
%define x1 ebx
%define x2 edx

xoodoo:
_xoodoo:
    pushad
    mov    esi, [esp+32+4]   ; esi = state
    xor    ecx, ecx          ; ecx = 0
    mul    ecx               ; eax = 0, edx = 0
    pushad                   ; allocate 32 bytes of memory
    mov    edi, esp          ; edi = e
    call   ld_rc
    dw     0x58,  0x38, 0x3c0, 0xd0
    dw     0x120, 0x14,  0x60, 0x2c
    dw     0x380, 0xf0, 0x1a0, 0x12
ld_rc:
    pop    ebx                  ; ebx = rc
xoo_main:
    pushad                      ; save all
    movzx  ebp, word[ebx+eax*2] ; ebp = rc[r]
    mov    cl, 4                ; i = 4
    pushad                      ; save all
    ;
    ; Theta
    ;
    ; e[i] = ROTR32(x[i] ^ x[i+4] ^ x[i+8], 18);
    ; e[i]^= ROTR32(e[i], 9);
xd_l1:
    lodsd                    ; eax  = x[i]
    xor    eax, [esi+16-4]   ; eax ^= x[i+4]
    xor    eax, [esi+32-4]   ; eax ^= x[i+8]
    ror    eax, 18
    mov    ebx, eax
    ror    ebx, 9
    xor    eax, ebx
    stosd                    ; e[i] = eax
    loop   xd_l1
    popad                    ; restore all
    ; x[i]^= e[(i - 1) & 3];
    dec    edx               ; edx = -1
    mov    cl, 12
xd_l2:
    mov    eax, edx          ; eax = edx & 3
    and    eax, 3
    mov    eax, [edi+eax*4]  ; eax = e[(i - 1) & 3]
    inc    edx               ; i++
    xor    [esi+edx*4], eax  ; x[i] ^= eax
    loop   xd_l2

    ; XCHG(x[7], x[4]);
    mov    eax, [esi+7*4]
    xchg   eax, [esi+4*4]

    ; XCHG(x[7], x[5]);
    xchg   eax, [esi+5*4]

    ; XCHG(x[7], x[6]);
    xchg   eax, [esi+6*4]
    mov    [esi+7*4], eax

    ; x[0] ^= rc[r];
    xor    [esi], ebp
    mov    cl, 4
    pushad
xd_l6:
    ; x0 = x[i+0];
    lodsd

    ; x1 = x[i+4];
    mov    x1, [esi+16-4]

    ; x2 = ROTR32(x[i+8], 21);
    mov    x2, [esi+32-4]
    ror    x2, 21

    ; x[i+8] = ROTR32((~x0 & x1) ^ x2, 24);
    not    x0
    and    x0, x1
    xor    x0, x2
    ror    x0, 24
    mov    [esi+32-4], x0

    ; x[i+4] = ROTR32((~x2 & x0) ^ x1, 31);
    push   x2
    not    x2
    and    x2, [esi-4]
    xor    x2, x1
    ror    x2, 31
    mov    [esi+16-4], x2
    pop    x2

    ; x[i+0] ^= ~x1 & x2;
    not    x1
    and    x1, x2
    xor    [esi-4], x1
    loop   xd_l6
    popad

    ; XCHG(x[8], x[10]);
    ; XCHG(x[9], x[11]);
    mov    eax, [esi+8*4]
    mov    ebp, [esi+9*4]

    xchg   eax, [esi+10*4]
    xchg   ebp, [esi+11*4]

    mov    [esi+8*4], eax
    mov    [esi+9*4], ebp

    popad

    ; r++
    inc    eax
    cmp    al, 12
    jnz    xoo_main

    ; release memory
    popad
    ; restore registers
    popad
    ret

ARM assembly

ARM offers a nice instruction, BIC, which computes Rd = Rn & ~Rm, the equivalent of (x & ~y) in C.

This instruction is used below where the Chi and Rho east steps are combined.

.arm
  .arch armv6
  .text
  .align  2

  .global xoodoo

state .req r0
x     .req r1

r     .req r2
i     .req r3

x0    .req r4
x1    .req r5
x2    .req r6
x3    .req r7

rc    .req r8
xt    .req r9

e     .req sp

xoodoo:
    // save registers
    push   {r0-r12, lr}

    mov    r, #12              // 12 rounds
    sub    sp, #16             // allocate 16 bytes
    adr    rc, rc_tab
xoodoo_main:
    mov    i, #0               // i = 0
    mov    x, state
theta_l0:
    ldr    x2, [x, #32]        // x2 = x[i+8]
    ldr    x1, [x, #16]        // x1 = x[i+4]
    ldr    x0, [x], #4         // x0 = x[i+0], advance x by 4

    // e[i] = ROTR32(x[i] ^ x[i+4] ^ x[i+8], 18);
    eor    x0, x1
    eor    x0, x2
    ror    x0, #18

    // e[i]^= ROTR32(e[i], 9);
    eor    x0, x0, ror #9
    str    x0, [sp, i, lsl #2]  // store in e

    add    i, #1               // i++
    cmp    i, #4               // i<4
    bne    theta_l0            //

    // x[i]^= e[(i - 1) & 3];
    mov    i, #0               // i = 0
    mov    x, state            // x = state
theta_l1:
    sub    xt, i, #1
    and    xt, #3               // xt = i & 3
    ldr    xt, [sp, xt, lsl #2] // xt = e[(i - 1) & 3]
    ldr    x0, [x, i, lsl #2]   // x0 = x[i]
    eor    x0, xt               // x0 ^= xt
    str    x0, [x, i, lsl #2]   // x[i] = x0
    add    i, #1                // i++
    cmp    i, #12               // i<12
    bne    theta_l1

    // Rho west
    // XCHG(x[7], x[4]);
    // XCHG(x[7], x[5]);
    // XCHG(x[7], x[6]);
    add    x, state, #16       // x = &state[4]
    ldm    x, {x0, x1, x2, x3}
    mov    xt, x0              // xt = x[4]
    mov    x0, x3              // x[4] = x[7]
    mov    x3, x2              // x[7] = x[6]
    mov    x2, x1              // x[6] = x[5]
    mov    x1, xt              // x[5] = xt
    stm    x, {x0, x1, x2, x3}

    mov    x, state

    // Iota
    // x[0] ^= rc[r];
    ldrh   xt, [rc], #2        // load half-word, advance by 2
    ldr    x0, [x]             // load word
    eor    x0, xt              // xor
    str    x0, [x]             // store word

    mov    i, #4
chi:
    // Chi and Rho east
    // x0 = x[i+0];
    ldr    x0, [x]

    // x1 = x[i+4];
    ldr    x1, [x, #16]

    // x2 = ROTR32(x[i+8], 21);
    ldr    x2, [x, #32]
    ror    x2, #21

    // x[i+8] = ROTR32((x1 & ~x0) ^ x2, 24);
    bic    xt, x1, x0
    eor    xt, x2
    ror    xt, #24
    str    xt, [x, #32]

    // x[i+4] = ROTR32((x0 & ~x2) ^ x1, 31);
    bic    xt, x0, x2
    eor    xt, x1
    ror    xt, #31
    str    xt, [x, #16]

    // x[i+0]^= x2 & ~x1;
    bic    xt, x2, x1
    eor    xt, x0
    str    xt, [x], #4

    // i--
    subs   i, #1
    bne    chi

    add    x, state, #32       // x = &state[8]

    // XCHG(x[8], x[10]);
    ldm    x, {x0, x1, x2, x3}
    push   {x0}
    mov    x0, x2
    pop    {x2}

    // XCHG(x[9], x[11]);
    push   {x1}
    mov    x1, x3
    pop    {x3}
    stm    x, {x0, x1, x2, x3}

    subs   r, #1               // r--
    bne    xoodoo_main         // r>0

    // release stack
    add    sp, #16

    // restore registers, and return
    pop    {r0-r12, pc}

    // round constants
rc_tab:
    .hword 0x058, 0x038, 0x3c0, 0x0d0
    .hword 0x120, 0x014, 0x060, 0x02c
    .hword 0x380, 0x0f0, 0x1a0, 0x012

Summary

Xoodoo with 12 rounds and a 384-bit state is much more compact than Keccak with 22 rounds and an 800-bit state. Presumably Keccak is more secure, but it cannot be vectorized to perform as well as Xoodoo or Gimli.

The x86 assembly here is 186 bytes, while the compiler-generated assembly from C is 300 bytes. Both could probably be optimized further, but it’s the best I can do at the moment.


XTEA Block Cipher

Introduction

TEA Extensions (XTEA) is a 64-bit block cipher with support for 128-bit keys. It was published in 1998 as a response to weaknesses found in the Tiny Encryption Algorithm (TEA) which was discussed previously in this post.

Compared to its predecessor, XTEA has a more complex key schedule and a rearrangement of the shifts, XORs, and additions.

Only encryption is presented, which can be used in CTR mode to create a stream cipher.
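As a rough sketch of how CTR mode might look on top of xtea_encrypt below, treating the 64-bit block as nonce||counter; the block layout and function name here are my own assumptions, not part of any XTEA specification:

#include <stdint.h>

// hypothetical CTR wrapper around xtea_encrypt(); encrypts or decrypts
// len bytes in place (the same call performs both directions)
void xtea_ctr(void *key, uint32_t nonce, uint8_t *data, uint32_t len) {
    uint32_t ctr = 0;

    while (len) {
      // 64-bit block = 32-bit nonce || 32-bit counter (an assumption)
      uint32_t blk[2] = { nonce, ctr++ };
      xtea_encrypt(key, blk);

      // XOR the keystream block with the data
      uint8_t *ks = (uint8_t*)blk;
      uint32_t r  = MIN(len, 8);
      for (uint32_t i = 0; i < r; i++) *data++ ^= ks[i];
      len -= r;
    }
}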

Encryption

// uses 32 rounds by default
void xtea_encrypt(void *key, void *data) {
    int      i;
    uint32_t x0, x1, t, sum=0;
    
    uint32_t *k=(uint32_t*)key;
    uint32_t *x=(uint32_t*)data;
    
    // load 64-bit plain text
    x0 = x[0]; x1 = x[1];
    
    for (i=64; i>0; i--) {
      t = sum;
      // add constant every 2nd round
      if (i & 1) {
        sum += 0x9E3779B9;
        t = sum >> 11;          
      }
      x0 += ((((x1 << 4) ^ 
        (x1 >> 5)) + x1) ^ 
        (sum + k[t & 3]));
         
      XCHG(x0, x1);
    }
    // save 64-bit cipher text
    x[0] = x0; x[1] = x1;
}

The assembly code for x86 CPUs.

bits 32
        
    %define x0  eax
    %define x1  ebx    

    %define t0  ebp
    %define t1  esi
    
    %define k   edi
    
    %define sum edx
    
xtea_encryptx:    
_xtea_encryptx:
    pushad
    mov    edi, [esp+32+4]   ; edi = key
    mov    esi, [esp+32+8]   ; esi = data
    push   64
    pop    ecx
    xor    edx, edx          ; sum = 0
    push   esi
    lodsd
    xchg   eax, x1
    lodsd
    xchg   eax, x1
xtea_enc:
    mov    t0, x1            ; t0   = x1 << 4
    shl    t0, 4
    
    mov    t1, x1            ; t1   = x1 >> 5
    shr    t1, 5    
    
    xor    t0, t1            ; t0  ^= t1
    add    t0, x1            ; t0  += x1;
    
    mov    t1, sum           ; t1   = sum
    test   cl, 1
    jz     xtea_l1
    
    add    sum, 0x9E3779B9   ; sum += 0x9E3779B9   
    mov    t1, sum     
    shr    t1, 11            ; t1 = sum >> 11  
xtea_l1:    
    and    t1, 3             ; t1  &= 3
    mov    t1, [k+4*t1]      ; t1 = sum + k[t1]
    add    t1, sum
    
    xor    t0, t1            ; t0 ^= t1
    
    add    x0, t0            ; x0 += t0
    xchg   x0, x1            ; XCHG(x0, x1); 
    loop   xtea_enc    
    
    pop    edi               ; edi = x
    stosd                    ; x[0] = x0
    xchg   eax, x1
    stosd                    ; x[1] = x1
    popad
    ret

Assembly for the ARMv6 architecture.

ARMv6 supports conditional execution of most instructions, which lets us eliminate branches in the main loop.

// key
k   .req r0
x   .req r1

// data
x0  .req r2
x1  .req r3

sum .req r4
t0  .req r5
t1  .req r6
c   .req r7
i   .req r8

  // xtea_encryptx(void *key, void *data);
xtea_encryptx:
  // save registers
  push  {r0-r12,lr}

  // 32 rounds by default
  mov   i, #64               // number of rounds

  // load plaintext
  ldm   x, {x0, x1}          // x0  = x[0], x1 = x[1];
  mov   sum, #0              // sum = 0;
  ldr   c, =#0x9E3779B9      // c   = 0x9E3779B9;
xtea_loop:
  mov   t0, sum              // t0 = sum;
  tst   i, #1                // if (i & 1)

  // the next 2 only execute if (i % 2) is not zero
  addne sum, sum, c          // sum += 0x9E3779B9;
  lsrne t0, sum, #11         // t0 = sum >> 11

  and   t0, t0, #3           // t0 %= 4
  ldr   t0, [k, t0, lsl #2]  // t0 = k[t0];
  add   t1, sum, t0          // t1 = sum + t0
  mov   t0, x1, lsl #4       // t0 = (x1 << 4)
  eor   t0, t0, x1, lsr #5   // t0^= (x1 >> 5)
  add   t0, t0, x1           // t0+= x1
  eor   t0, t0, t1           // t0^= t1
  mov   t1, x1               // backup x1
  add   x1, t0, x0           // x1 = t0 + x0

  // XCHG(x0, x1)
  mov   x0, t1               // x0 = x1
  subs  i, i, #1             // i--
  bne   xtea_loop            // i>0

  // save ciphertext
  stm   x, {x0, x1}

  // restore registers
  pop   {r0-r12,pc}

Because the link register (LR) is saved on entry and restored to the program counter (PC) at the end, the final pop returns execution directly to the caller.

Summary

Some people searching the internet for a “Tiny Encryption Algorithm” tend to come across TEA/XTEA, but for a variety of reasons, it’s probably not the best choice anymore. The Chaskey block cipher is a good alternative with support for 128-bit blocks.


BlaBla Stream Cipher

Introduction

BlaBla is a stream cipher intended for 64-bit CPUs, which was published by the cryptographer Jean-Philippe Aumasson on his GitHub in April 2017.

BlaBla uses the same permutation function as the cryptographic hash algorithm BLAKE2b, which is derived from the permutation function used in the ChaCha stream cipher.

The structure of BlaBla is essentially the same as ChaCha, but obviously intended for 64-bit architectures.

No other information is available at this point. Frank Denis described it as “…yet another evil experiment from Jean-Philippe Aumasson.”

Frank informed me the permutation function used in BlaBla is also used in this project here, which is an Authenticated Encryption with Associated Data (AEAD) algorithm based on the Masked Even-Mansour (MEM) construction.

More information can be found in Improved Masking for Tweakable Blockciphers with Applications to Authenticated Encryption.

Initialization

ChaCha uses an internal state of 512 bits (64 bytes) and works efficiently on 32-bit CPUs. BlaBla uses an internal state of 1024 bits (128 bytes) and works efficiently on 64-bit CPUs.

There are 9 64-bit constants used to initialize the state. The upper 32 bits of the first 4 are the same values used to initialize the 256-bit key variant of ChaCha, derived from the string “expand 32-byte k”: 0x61707865, 0x3320646e, 0x79622d32 and 0x6b206574.

The other values were possibly generated randomly, but there’s currently no explanation for their origin.
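The context type and length constants aren’t shown in the post; a plausible sketch matching how the code below uses them (names from the code, definitions are my assumption):

#include <stdint.h>
#include <string.h>

#define BB20_KEY_LEN   32    // 256-bit key
#define BB20_BLK_LEN   128   // 1024-bit keystream block
#define BB20_STATE_LEN 128   // 1024-bit state

// 1024-bit block addressable as bytes or 64-bit words
typedef union {
    uint8_t  b[128];
    uint64_t q[16];
} w1024_t;

typedef w1024_t bb20_ctx;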

// setup the key
void bb20_setkey(bb20_ctx *c, void *key, void *nonce)
{
    c->q[ 0] = 0x6170786593810fab;
    c->q[ 1] = 0x3320646ec7398aee;
    c->q[ 2] = 0x79622d3217318274;
    c->q[ 3] = 0x6b206574babadada;

    // set 256-bit key
    memcpy (&c->b[32], key, BB20_KEY_LEN);
    
    c->q[ 8] = 0x2ae36e593e46ad5f;
    c->q[ 9] = 0xb68f143029225fc9;
    c->q[10] = 0x8da1e08468303aa6;
    c->q[11] = 0xa48a209acd50a4a7;
    c->q[12] = 0x7fdc12f23f90778c;
    
    // set 64-bit counter
    c->q[13] = 1; 
    
    // set 128-bit nonce
    memcpy(&c->q[14], nonce, 16);
}

Only the Microsoft x64 calling convention (fastcall) is supported by this assembly code.

bb20_setkeyx:
    ; save
    push   rsi
    push   rdi
    
    push   rcx   ; rdi = c
    pop    rdi
    
    ; load values
    call   sk_l0
    dq     0x6170786593810fab
    dq     0x3320646ec7398aee
    dq     0x79622d3217318274
    dq     0x6b206574babadada    
    dq     0x2ae36e593e46ad5f
    dq     0xb68f143029225fc9
    dq     0x8da1e08468303aa6
    dq     0xa48a209acd50a4a7
    dq     0x7fdc12f23f90778c
sk_l0:
    pop    rsi
    push   (4*8)/4
    pop    rcx
    rep    movsd
    
    ; copy key
    xchg   rsi, rdx   ; rsi = key
    mov    cl, 32/4
    rep    movsd
    
    ; copy remaining values
    xchg   rsi, rdx
    mov    cl, (5*8)/4
    rep    movsd
    
    ; set counter
    ; c->q[13] = 1
    push   1
    pop    rax
    stosq
    
    ; set nonce
    push   r8        ; rsi = nonce   
    pop    rsi
    movsq
    movsq    
    
    ; restore
    pop    rdi
    pop    rsi
    ret

Permutation Function

As stated, this is the same function used by the BLAKE2b cryptographic hash function.

This function is optimized to save space, whereas most implementations are optimized for fast encryption and decryption of data streams. Each 16-bit entry of idx16 packs four 4-bit state indices; the first entry, 0xC840, decodes to a=0, b=4, c=8, d=12, the first column.

// permutation function from blake2b
void F(uint64_t s[16])
{
    int         i;
    uint64_t    a, b, c, d, t, idx;
    uint32_t    r;
    
    uint16_t idx16[8]=
    { 0xC840, 0xD951, 0xEA62, 0xFB73,    // column index
      0xFA50, 0xCB61, 0xD872, 0xE943 };  // diagonal index
    
    for (i=0; i<8; i++) {
      idx = idx16[i];
        
      a = (idx         & 0xF);
      b = ((idx >>  4) & 0xF);
      c = ((idx >>  8) & 0xF);
      d = ((idx >> 12) & 0xF);
  
      r = 0x3F101820;
      
      // The quarter-round
      do {
        s[a]+= s[b]; 
        s[d] = ROTR64(s[d] ^ s[a], r & 0xFF);
        XCHG(c, a);
        XCHG(d, b);
        r >>= 8;
      } while (r != 0);
    }    
}

The assembly code could be inlined with bb20_streamx, but feel free to make changes if you want.

; rsi = state
; rdi = x
FX:
    push    rsi   
    push    rdi   
    push    rbx   
    push    rbp   
    push    rcx   
    ; load indexes
    call    bb_f1
    dw      0c840H, 0d951H
    dw      0ea62H, 0fb73H
    dw      0fa50H, 0cb61H
    dw      0d872H, 0e943H
bb_f1:
    pop     rsi  ; pointer to indexes
    mov     cl, 8
bb_f2:
    push    rcx
    xor     eax, eax 
    lodsw
    push    rsi
    ; ========================
    mov     ebx, eax
    mov     edx, eax
    mov     esi, eax

    ; a = (idx         & 0xF);
    and     eax, 15
    ; b = ((idx >>  4) & 0xF);
    shr     ebx, 4
    and     ebx, 15
    ; c = ((idx >>  8) & 0xF);
    shr     edx, 8
    and     edx, 15
    ; d = ((idx >> 12) & 0xF);
    shr     esi, 12         
    ; load ecx with rotate values
    mov     ecx, 0x3F101820
bb_f3:
    ; s[a]+= s[b];
    mov     rbp, [rdi+rbx*8]    
    add     [rdi+rax*8], rbp
    ; s[d] = ROTR64(s[d] ^ s[a], r & 0xFF);
    mov     rbp, [rdi+rsi*8]
    xor     rbp, [rdi+rax*8]
    ror     rbp, cl
    mov     [rdi+rsi*8], rbp  
    xchg    edx, eax
    xchg    esi, ebx    
    shr     ecx, 8
    jnz     bb_f3
    ; ======================
    pop     rsi
    pop     rcx
    loop    bb_f2 
    
    pop     rcx
    pop     rbp
    pop     rbx
    pop     rdi
    pop     rsi
    ret

Stream Generator

Essentially the same as ChaCha, but using double the internal state and 64-bit operations.

// generate stream of bytes
void bb20_stream (bb20_ctx *c, w1024_t *x)
{
    int i;

    // copy state to x
    memcpy(x->b, c->b, BB20_STATE_LEN);
    
    // apply 20 rounds of permutation function
    for (i=0; i<20; i+=2) {
      F(x->q);
    }
    // add state to x
    for (i=0; i<16; i++) {
      x->q[i] += c->q[i];
    }
    // update 64-bit counter
    c->q[13]++;
}

; generate 128-byte stream
; rdi has x
; rsi has state   
bb20_streamx:
    push    rax       ; save rcx
    push    rcx       ; save rcx
    push    rsi       ; save state
    push    rdi       ; save x    
    ; copy state to x
    xchg    eax, ecx  ; zero the upper 56-bits of rcx
    mov     cl, 128   ; 1024-bits
    rep     movsb    
    ; apply 20 rounds of permutation function
    pop     rdi       ; restore x
    mov     cl, 20/2
bb_sx0:
    ; F(x->q);
    call    FX   
    loop    bb_sx0
    pop     rsi       ; restore state    
    ; add state to x    
    mov     cl, 16
bb_sx1:
    ; x->q[i] += c->q[i];
    mov     rax, [rsi+rcx*8-8]
    add     [rdi+rcx*8-8], rax
    loop    bb_sx1
    ; update 64-bit counter
    ; c->q[13]++;   
    inc     qword[rsi+13*8]    
    pop     rcx
    pop     rax
    ret

Encryption/Decryption

Generates a 128-byte stream using the generator, then XORs the stream against the plaintext/ciphertext.

// encrypt or decrypt stream of len-bytes
void bb20_encrypt (uint64_t len, void *in, bb20_ctx *ctx) 
{
    uint64_t r, i;
    w1024_t  s;
    uint8_t  *p=(uint8_t*)in;
    
    while (len) {      
      bb20_stream(ctx, &s);
      
      r = MIN(len, BB20_BLK_LEN);
      
      // XOR input with stream
      for (i=0; i<r; i++) {
        p[i] ^= s.b[i];
      }
    
      len -= r;
      p   += r;
    }
}

; void bb20_encrypt (uint64_t len, void *in, bb20_ctx *state)
bb20_encryptx:
    push    rsi
    push    rdi
    push    rbx
    push    rbp
  
    push    r8               ; rsi = state
    pop     rsi    
    
    push    rdx              ; rbx = in
    pop     rbx

    sub     rsp, 128
    push    rsp
    pop     rdi    
bb_e0:
    xor     eax, eax         ; idx = 0  
    jecxz   bb_e3            ; exit if len==0
    call    bb20_streamx
bb_e1:
    mov     dl, byte[rdi+rax]
    xor     byte[rbx], dl    ; p[idx] ^= stream[idx]
    inc     rbx
    inc     al
    cmp     al, 128
    loopne  bb_e1            ; --len
    jmp     bb_e0
bb_e3:
    add     rsp, 128
    pop     rbp
    pop     rbx
    pop     rdi
    pop     rsi
    ret

334 bytes for the x64 assembly.


HIGHT Block Cipher

Introduction

HIGHT, which stands for HIGh security and light weigHT, is a block cipher first proposed at the 2006 Cryptographic Hardware and Embedded Systems (CHES) conference held in Japan.

HIGHT attracted a lot of attention upon its release because it only uses ARX operations (modular addition, bitwise rotation, and XOR) on 8-bit words.

It was selected in 2010 by the Telecommunications Technology Association (TTA) of Korea to become part of the ISO/IEC 18033-3 portfolio which also consists of the following 64-bit block ciphers: TDEA, MISTY1, CAST-128.

Some points to note.

  • 64-bit input/output data block size
  • 128-bit key length
  • 32 rounds using mod 2**8 addition, rotation, and XOR (ARX)
  • No S-Box (non-linear operations using modular addition)
  • Designed for low-resource devices (data storage, power, etc.)

I normally don’t implement hardware-oriented ciphers, but thought I’d make an exception for this one, as it initially appeared to be compact.

Only encryption is presented, and it’s not optimized for performance.

Constant Generation

“Random” constants are used in the generation of subkeys. These protect the cipher against simple slide attacks.

I’ve included the option of using a lookup table or a function that calculates the constants at runtime.

// use lookup table for constants
#ifdef LUT
uint8_t rc[128]=
{ 0x5a, 0x6d, 0x36, 0x1b, 0x0d, 0x06, 0x03, 0x41,
  0x60, 0x30, 0x18, 0x4c, 0x66, 0x33, 0x59, 0x2c,
  0x56, 0x2b, 0x15, 0x4a, 0x65, 0x72, 0x39, 0x1c,
  0x4e, 0x67, 0x73, 0x79, 0x3c, 0x5e, 0x6f, 0x37,
  0x5b, 0x2d, 0x16, 0x0b, 0x05, 0x42, 0x21, 0x50,
  0x28, 0x54, 0x2a, 0x55, 0x6a, 0x75, 0x7a, 0x7d,
  0x3e, 0x5f, 0x2f, 0x17, 0x4b, 0x25, 0x52, 0x29,
  0x14, 0x0a, 0x45, 0x62, 0x31, 0x58, 0x6c, 0x76,
  0x3b, 0x1d, 0x0e, 0x47, 0x63, 0x71, 0x78, 0x7c,
  0x7e, 0x7f, 0x3f, 0x1f, 0x0f, 0x07, 0x43, 0x61,
  0x70, 0x38, 0x5c, 0x6e, 0x77, 0x7b, 0x3d, 0x1e,
  0x4f, 0x27, 0x53, 0x69, 0x34, 0x1a, 0x4d, 0x26,
  0x13, 0x49, 0x24, 0x12, 0x09, 0x04, 0x02, 0x01,
  0x40, 0x20, 0x10, 0x08, 0x44, 0x22, 0x11, 0x48,
  0x64, 0x32, 0x19, 0x0c, 0x46, 0x23, 0x51, 0x68,
  0x74, 0x3a, 0x5d, 0x2e, 0x57, 0x6b, 0x35, 0x5a };
#else

void gen_const(uint8_t *ci)
{
    int     i, j;
    uint8_t c;
    
    union {
      uint8_t  b[128+8];
      uint32_t w[(128+8)/4];
    } s;

    // zero initialize s
    memset(&s, 0, sizeof(s));

    // set initial bits
    s.w[1] = 65537;
    s.w[0] = s.w[1] << 8;

    // set first constant
    // precalculated from bits of s array
    ci[0] = 0x5A;

    for(i=1; i<128; i++) {
      c = s.b[i + 2] ^
          s.b[i - 1];

      s.b[i + 6] = c;

      for(j=1; j<7; j++) {
        c += c;
        c ^= s.b[i + 6 - j];
      }
      ci[i] = c;
    }
}
#endif

The assembly code is less than 128 bytes and is used instead of the precomputed array.

gen_constx:
    pushad
    mov    esi, edi ; esi = ci
    xor    eax, eax
    mov    cl, 80h + 8 ; allocate 136 bytes
    sub    esp, ecx
    mov    edi, esp
    ; memset(&s, 0, sizeof(s));
    rep    stosb
    ; s.w[1] = 65537;
    mov    dword[esp+4], 65537
    ; s.w[0] = s.w[1] << 8;
    mov    dword[esp], 65537 << 8
    ; ci[0] = 0x5A;
    mov    byte[esi], 0x5A
    ; i = 1;
    inc    eax
gen_l1:
    ; c = s.b[i+2] ^ s.b[i-1];
    mov    cl, [esp+eax+2]
    xor    cl, [esp+eax-1]
    ; s.b[i+6] = c
    mov    [esp+eax+6], cl
    ; j = 1
    cdq
    inc    edx
gen_l2:
    ; c += c
    add    cl, cl
    ; c ^= s.b[(i + 6) - j];
    lea    ebx, [eax+6]
    sub    ebx, edx  
    xor    cl, [esp+ebx]
    ; j++
    inc    edx
    cmp    dl, 7
    jnz    gen_l2
    ; ci[i] = c;
    mov    [esi+eax], cl
    ; i++
    inc    al
    ; i<128
    jns    gen_l1
    lea    esp, [esp+eax+8]   
    popad
    ret

Key Schedule

The master key is expanded from 128 bits to 1024 bits. Each byte of the master key, indexed using an unexplained calculation, is added to one of the generated constants.
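The w128_t and w64_t union types used below aren’t defined in the post; here are plausible definitions matching how the code uses them (my assumption):

#include <stdint.h>

typedef union {
    uint8_t  b[16];
    uint32_t w[4];
} w128_t;

typedef union {
    uint8_t  b[8];
    uint32_t w[2];
    uint64_t q;
} w64_t;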

void hight128_setkey(void *in, void *out)
{
    uint8_t i, j, idx;
    w128_t  *wk, *mk;
    uint8_t *sk;
    
    mk=(w128_t*)in;
    wk=(w128_t*)out;
    sk=(uint8_t*)out;
    
    // apply key whitening
    wk->w[0] = mk->w[3];
    wk->w[1] = mk->w[0];

    #ifdef LUT
      memcpy(&sk[8], rc, sizeof(rc));
    #else  
      // generate constants
      gen_const(&sk[8]);
    #endif
 
    // generate subkeys
    for(i=0; i<8; i++) {
      sk += 8;
      for(j=0; j<8; j++) {
        idx = (j - i + 8) & 7;

        sk[0] += mk->b[idx  ];
        sk[8] += mk->b[idx+8];
        sk++;        
      }
    }
}

There may be a way to generate the subkeys in one loop; the calculation of idx currently requires two.

hight128_setkeyx:    
_hight128_setkeyx:
    pushad
    mov    esi, [esp+32+4] ; esi = in
    mov    edi, [esp+32+8] ; edi = out    
    mov    eax, [esi+3*4]
    stosd
    mov    eax, [esi]
    stosd
    ; i=0
    xor    ecx, ecx    
    call   gen_constx
sk_l1:
    ; j=0
    xor    edx, edx
sk_l2:
    ; idx = ((j + 8) - i) & 7;
    lea    ebx, [edx+8]
    sub    ebx, ecx
    and    ebx, 7
    ; sk[0] += mk->b[idx  ];
    mov    al, [esi+ebx]
    add    [edi+0], al
    ; sk[8] += mk->b[idx+8];
    mov    al, [esi+ebx+8]
    add    [edi+8], al
    ; sk++;
    inc    edi
    ; j++
    inc    edx
    ; j<8
    cmp    dl, 8
    jnz    sk_l2
    ; sk += 8
    add    edi, edx
    ; i++
    inc    ecx
    ; i<8
    cmp    cl, 8
    jnz    sk_l1    
    popad
    ret

Round Functions

Two linear functions provide diffusion. Although they’re shown separately here, a different version inlines them in the main encryption loop.

uint8_t F0(uint8_t x) {
    return ROTL8(x, 1) ^ ROTL8(x, 2) ^ ROTL8(x, 7);
}

uint8_t F1(uint8_t x) {
    return ROTL8(x, 3) ^ ROTL8(x, 4) ^ ROTL8(x, 6);
}
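As a quick sanity check of the linear functions, worked out by hand from the definitions above:

#include <assert.h>

void test_round_functions(void) {
    // F0(0x01) = ROTL8(1,1) ^ ROTL8(1,2) ^ ROTL8(1,7) = 0x02 ^ 0x04 ^ 0x80 = 0x86
    assert(F0(0x01) == 0x86);
    // F1(0x01) = ROTL8(1,3) ^ ROTL8(1,4) ^ ROTL8(1,6) = 0x08 ^ 0x10 ^ 0x40 = 0x58
    assert(F1(0x01) == 0x58);
}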

Encryption

The only nonlinear operations here are the modular additions. Perhaps it would have made more sense to use an S-box layer; a 4 x 4 bit-sliced S-box would work well here (an example is sketched at the end of the Summary below).

void hight128_encrypt(void *data, void *keys)
{
    int      i;
    w64_t    *x;
    uint8_t  *sk, *wk=(uint8_t*)keys;

    x  = (w64_t*)data;
    sk = &wk[8];

    // mix key with 1st 4 bytes
    x->b[0] += wk[0]; x->b[2] ^= wk[1];
    x->b[4] += wk[2]; x->b[6] ^= wk[3];

    for(i=0; i<32; i++) {
      // circular shift left by 8 bits
      x->q = ROTL64(x->q, 8);
      // apply linear/non-linear operations
      x->b[2] += (F1(x->b[1]) ^ *sk++);
      x->b[4] ^= (F0(x->b[3]) + *sk++);
      x->b[6] += (F1(x->b[5]) ^ *sk++);
      x->b[0] ^= (F0(x->b[7]) + *sk++);
    }
    // circular shift right by 8 bits
    x->q = ROTL64(x->q, 56);

    // mix key with 2nd 4 bytes
    x->b[0] += wk[4]; x->b[2] ^= wk[5];
    x->b[4] += wk[6]; x->b[6] ^= wk[7];
}

The assembly code for this is slightly different from the C code, and wasn’t enjoyable to implement 😉

For each round, an inner loop runs twice, each pass performing one addition and one XOR with the linear functions inlined.

The 64-bit rotation uses ADD/ADC, shifting left one bit per iteration with the carry wrapping back into bit 0.

; rotate cipher text 64-bits    
rotl64: 
    pushad  
    mov    eax, [edi]
    mov    ebx, [edi+4]   
rt_l0:     
    add    eax, eax
    adc    ebx, ebx
    adc    al, 0
    loop   rt_l0    
    stosd
    xchg   eax, ebx
    stosd
    popad
    ret   
   
hight128_encryptx:
_hight128_encryptx:
    pushad
    mov    edi, [esp+32+4] ; data
    mov    esi, [esp+32+8] ; wk and sk
    push   2
    pop    ecx
    push   edi
hi_l0:    
    lodsw
    ; x->b[0] += wk[0];     
    add    [edi+0], al
    ; x->b[2] ^= wk[1];
    xor    [edi+2], ah
    scasd
    loop   hi_l0    
    pop    edi    
    lodsd    
    ; save wk[4]
    push   eax
    ; 32 rounds    
    mov    cl, 32
hi_enc:
    push   ecx
    ; x->q = ROTL64(x->q, 8);
    mov    cl, 8
    call   rotl64        
    mov    cl, 2
    movzx  edx, cl
hi_l1:
    ; c = x->b[j-1];
    mov    al, [edi+edx-1]
    ; c = ROTL8(c, 3) ^ ROTL8(c, 4) ^ ROTL8(c, 6);
    mov    ah, al
    rol    ah, 3
    rol    al, 4
    xor    ah, al
    rol    al, 6-4
    xor    ah, al
    ; x->b[j] += (c ^ *sk++);
    lodsb
    xor    al, ah
    add    [edi+edx], al    
    ; c = x->b[j+1];
    mov    al, [edi+edx+1]
    ; c = ROTL8(c, 1) ^ ROTL8(c, 2) ^ ROTL8(c, 7);
    mov    ah, al
    rol    ah, 1
    rol    al, 2
    xor    ah, al
    rol    al, 7-2
    xor    ah, al
    ; x->b[(j+2) & 7] ^= (c + *sk++);
    lodsb
    add    al, ah
    lea    ebx, [edx+2]
    and    bl, 7
    xor    [edi+ebx], al
    add    dl, 4    
    loop   hi_l1    
    pop    ecx
    loop   hi_enc    
    ; x->q = ROTL64(x->q, 56);   
    mov    cl, 56
    call   rotl64    
    ; restore wk[4]
    pop    eax
    ; x->b[0] += wk[4];
    add    [edi+0], al
    ; x->b[2] ^= wk[5];
    xor    [edi+2], ah
    bswap  eax                ; instead of shr eax, 16
    ; x->b[4] += wk[6]; 
    add    [edi+4], ah    
    ; x->b[6] ^= wk[7];
    xor    [edi+6], al    
    popad
    ret

I’ve no doubt the above code can be optimized further, but it would require removing the dependency on EDI in the main loop.

The load/store operations using EDI are not necessary except at the beginning and end of the code.

If the plaintext was loaded into 2 32-bit registers, there’s a possibility of removing the 64-bit rotation in the main loop by rotating registers and swapping bytes.

Summary

HIGHT is a hardware-oriented cipher clearly intended for 8-bit CPUs. Because of the linear functions, it’s difficult to see how one could optimize it for a 32-bit or 64-bit CPU.

I’ve speculated that if the bytes of plaintext were rearranged, one could optimize it further for a 16-bit CPU, but it would still need 8-bit operations for the linear functions.

The assembly code generated by MSVC is approx. 440 bytes, while the handwritten assembly is currently 283 bytes.

There are some parts of this cipher I feel could have been better designed, which would have made it easier to implement in software.

Take, for example, the key schedule routine; the calculation of the index into the master key seems overcomplicated and inefficient. For the encryption, there’s an additional rotation of the ciphertext which just seems redundant.

After the encryption has been applied, rearranging the ciphertext isn’t likely to affect the security of the cipher layers, so why do it? Presumably, it’s a permutation intended to strengthen the ciphertext.

As said already, the only nonlinear operations are modular additions, and there could be security problems with the ARX structure yet to be revealed.

A bit-slice S-box layer for each round of encryption might have mitigated against any future problems found in ARX.
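As an illustration of what such a layer could look like, here is the 4 x 4 bitsliced S-box used by RoadRunneR (covered in the next post), computing 8 S-box lookups in parallel across four bytes using only AND/OR/XOR/MOV-style operations. This is purely illustrative and not part of HIGHT:

#include <stdint.h>

// bitsliced 4 x 4 S-box: x[0..3] hold bits 0..3 of eight parallel inputs
void sbox4x4(uint8_t *x) {
    uint8_t t = x[3];

    x[3] &= x[2];
    x[3] ^= x[1];
    x[1] |= x[2];
    x[1] ^= x[0];
    x[0] &= x[3];
    x[0] ^= t;
    t    &= x[1];
    x[2] ^= t;
}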


RoadRunneR Block Cipher

Introduction

RoadRunneR is a compact, fast bitslice block cipher designed specifically for low-cost 8-bit CPUs. Details of the cipher were published here in 2015, and it was proposed by Adnan Baysal and Suhap Sahin at Lightweight Cryptography for Security and Privacy (LightSec 2015), held in Germany.

RoadRunneR is a Feistel-type block cipher, with 64-bit block size and 80-bit or 128-bit key. The 80-bit variant uses 10 rounds while the 128-bit uses 12. In this post, I’ll show x86 assembly and C code for the 128-bit version.

The authors of the cipher state the main objectives of RoadRunneR in their proposal:

  • Implementation efficiency in 8-bit CPUs
  • No table and SRAM usage
  • Low decryption overhead
  • Provable security like in wide trail design strategy

RoadRunneR is a perfect choice for devices with very restricted memory resources and for applications requiring reasonable throughput expectations.

Although the cipher is specifically designed for 8-bit CPUs, the code shown here has a mixture of 8-bit and 32-bit code.

S Layer

Using bit-slice techniques for an S-box layer has become popular lately as more research into lightweight cryptography for resource constrained environments progresses.

Block ciphers such as NOEKEON, PRIDE, RECTANGLE and Fantomas/Robin all use a bit-sliced S-box layer.

Some advantages of using a bit-slice S-box layer in both hardware and software implementations include protection against side-channel analysis, and elimination of S-Box array that would require ROM/RAM.

The authors say the S-box for RoadRunneR was found through a brute-force search of assembly code combinations. The search space was restricted to 4 input words, a single temporary word, and the following instructions: AND, OR, XOR, NOT, and MOV.

Finding Optimal Bitsliced Implementations of 4×4-bit S-boxes describes this process in more detail.

The following code implements the 4 x 4 S-Layer.

// S-Layer
void sbox(uint8_t *x)
{
    uint8_t t;
    
    t = x[3];

    x[3] &= x[2];
    x[3] ^= x[1];
    x[1] |= x[2];
    x[1] ^= x[0];
    x[0] &= x[3];
    x[0] ^=  t; 
       t &= x[1];
    x[2] ^=  t;
}

See also: Cryptographic Analysis of All 4 x 4-Bit S-Boxes.

L Layer

A linear function is applied to each byte of the state to provide diffusion. Such a function is normally implemented using an XOR of rotated or shifted values, and is commonly seen in block ciphers and hash algorithms like Serpent, SM3 and SM4.
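Extracted from the SLK function shown below, RoadRunneR’s L function on a single byte looks like this, using the ROTL8 macro defined earlier (a sketch; the cipher inlines it rather than calling it separately):

#include <stdint.h>

// L-Layer on one byte; equivalent to x ^ ROTL8(x,1) ^ ROTL8(x,2)
uint8_t L(uint8_t x) {
    uint8_t t = ROTL8(x, 1) ^ x;
    return x ^ ROTL8(t, 1);
}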

K Layer

This is not referenced in the original paper, but is essentially key whitening, a.k.a. key addition, which was first used by Ron Rivest in DES-X.

The theory is that XORing key bits against the plaintext during the encryption process helps strengthen the final ciphertext against exhaustive brute-force attacks.

It’s a technique used in many SPN block ciphers. For example, it’s known as AddRoundKey in the Advanced Encryption Standard (AES).

SLK Function

The following function combines the S, L and K layers.

// SLK function 
void SLK(w32_t *x, uint8_t *sk)
{
    int     i;
    uint8_t t;
    uint8_t *p=x->b;
    
    // apply S-Layer
    sbox(p);
    
    for (i=3; i>=0; i--) {      
      // apply L-Layer
      t   = ROTL8(*p, 1) ^ *p;       
      *p ^= ROTL8(t,  1); 
      
      // apply K-Layer
      *p++ ^= *sk++;
    }
}
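As in the HIGHT post, the w32_t and w64_t types aren’t defined here; plausible definitions matching their use in the code (my assumption):

#include <stdint.h>

typedef union {
    uint8_t  b[4];
    uint32_t w;
} w32_t;

typedef union {
    uint8_t  b[8];
    uint32_t w[2];
    uint64_t q;
} w64_t;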

The F-Function

The main function consists of a 4-round Substitution–permutation network (SPN) structure. The first three rounds have the same function called SLK, which is the application of S, L and K layers already described. The final round only applies the S layer.

In addition to the aforementioned layers, a round constant is added after the second call to SLK. This is simply the round number XORed against the least significant byte of the state, which for the 128-bit key variant counts down from 12 to 1.

It is, as the authors describe, intended to protect against simple Slide Attacks.

Biryukov and Wagner state the following in Advanced Slide Attacks.

The most obvious type of slide attack is usually easy to prevent by destroying self-similarity in iterative ciphers, for example by adding iteration counters or random constants.

// F round
void F(w64_t *blk, void *key, 
    uint8_t *key_idx, uint8_t ci)
{
    int      i;
    uint32_t t;
    uint8_t  *rk=(uint8_t*)key;
    w32_t    *x=(w32_t*)blk;
    
    // save 32-bits
    t = x->w;
    
    for (i=3; i>0; i--) {
      // add round constant
      if (i==1) x->b[3] ^= ci;
      // apply S,L,K layers
      SLK (x, rk + *key_idx);      
      // advance master key index
      *key_idx = (*key_idx + 4) & 15;
    }
    
    // apply S-Layer
    sbox(x->b);
    
    // add upper 32-bits
    blk->w[0]^= blk->w[1];
    blk->w[1] = t;
}

Encryption

The variable key_idx, which acts as a master key index, is initialized to 4, skipping the first 4 bytes, which are XORed against the plaintext as part of the K-Layer before the 12 rounds of F.

Note how rnd is initialized to 12 and passed to F. This is the round constant used to protect against simple slide attacks.

// encrypt 64-bits of data using 128-bit key
#define RR_ROUNDS 12

void road64_encrypt(void *data, void *key)
{
    int      rnd;
    uint8_t  key_idx;
    uint32_t t;
    w64_t    *x=(w64_t*)data;
    uint32_t *rk=(uint32_t*)key;

    // initialize master key index
    key_idx = 4;
    
    // apply K-Layer
    x->w[0] ^= rk[0];
    
    // apply rounds
    for (rnd=RR_ROUNDS; rnd>0; rnd--) {
      F(x, rk, &key_idx, rnd);
    }
    // P-Layer?
    XCHG(x->w[0], x->w[1]);
    // apply K-Layer
    x->w[0] ^= rk[1];
}

Above you see “P-Layer?”. It’s not mentioned in the paper, but this swap is a permutation and likely helps strengthen the final ciphertext.

Assembly code

Because it’s such a compact algorithm with no key scheduling, the encryption can be implemented all in one call.

bits 32
    
    %ifndef BIN
      global road64_encryptx
      global _road64_encryptx
    %endif
    
struc pushad_t
  _edi resd 1
  _esi resd 1
  _ebp resd 1
  _esp resd 1
  _ebx resd 1
  _edx resd 1
  _ecx resd 1
  _eax resd 1
  .size:
endstruc

%define key_idx ebx    
%define rnd     ecx 
%define t       edx   
%define x       esi
%define p       esi
%define rk      edi
%define sk      edi
    
road64_encryptx:
_road64_encryptx:
    pushad
    mov    x,  [esp+32+4] ; esi : x  = data
    mov    rk, [esp+32+8] ; edi : rk = key
    ; key_idx = 4;
    push   4
    pop    key_idx        
    ; apply K-Layer
    ; x->w[0] ^= rk[0];
    mov    eax, [rk]
    xor    [x], eax       
    ; apply rounds
    ; rnd = RR_ROUNDS
    push   12             
    pop    rnd
rr_encrypt:    
    ; ------------------------- F round
    pushad
    ; t = x->w;
    mov    t, [x]
    ; i = 3;
    mov    cl, 3
f_round:
    ; add round constant
    ; if (i==1)
    cmp    cl, 1
    jne    SLKX
    ; x->b[3] ^= ci;
    mov    eax, [esp+_ecx]  ; ecx has round index   
    xor    [x+3], al  
SLKX:    
    ; -------------------------------
    ; SLK (x, rk + *key_idx);
    pushad  
    ; apply S-Layer
    call   sboxx 
    add    sk, key_idx    
    mov    cl, 4      ; 4 bytes per SLK
    ; apply L-Layer
slk_round:
    ; t   = ROTL8(*p, 1) ^ *p; 
    movzx  eax, byte[p]
    rol    al, 1
    xor    al, [p]
    ; *p ^= ROTL8(t,  1);
    rol    al, 1 
    xor    [p], al
    ; apply K-Layer
    ; *p++ ^= *sk++;
    mov    al, byte[sk]
    inc    sk
    xor    [p], al
    inc    p
    loop   slk_round 
    popad
    ; -------------------------------- 
    ; advance master key index
    ; *key_idx = (*key_idx + 4) & 15;
    add    key_idx, 4
    and    key_idx, 15
    loop   f_round
    ; apply S-Layer
    ; sbox(x);    
    call   sboxx
    ; add upper 32-bits
    ; blk->w[0]^= blk->w[1];
    mov    eax, [x+4]
    xor    [x], eax
    ; blk->w[1] = t;
    mov    [x+4], t   
    mov    [esp+_ebx], key_idx    
    popad
    ; -------------------------
    loop   rr_encrypt 
    ; XCHG(x->w[0], x->w[1]);        
    mov    eax, [x]
    xchg   eax, [x+4]
    ; x->w[0] ^= rk[1];    
    xor    eax, [rk+4]
    mov    [x], eax  
    popad
    ret    

; S-Layer    
sboxx:
    pushad
    mov    ebx, x        ; ebx = esi
    lodsd
    mov    edx, eax      ; t.w = ROTR32(x->w, 16); 
    ror    edx, 16
    and    [ebx+3], dl   ; x->b[3] &=  t.b[0];
    xor    [ebx+3], ah   ; x->b[3] ^= x->b[1];
    or     [ebx+1], dl   ; x->b[1] |=  t.b[0];    
    xor    [ebx+1], al   ; x->b[1] ^= x->b[0];
    mov    dl, [ebx+3]
    and    [ebx+0], dl   ; x->b[0] &= x->b[3];
    xor    [ebx+0], dh   ; x->b[0] ^=  t.b[1];
    and    dh, [ebx+1]   ;  t.b[1] &= x->b[1];
    xor    [ebx+2], dh   ; x->b[2] ^=  t.b[1]; 
    popad
    ret

This could definitely be optimized in some parts, but I’ll update later.

Summary

A really nice and simple cipher with interesting features, entirely suitable for resource-constrained environments. Assembly output from MSVC is currently 242 bytes; the handwritten x86 assembly is 142 bytes.


CHAM Block Cipher

Introduction

CHAM: A Family of Lightweight Block Ciphers for Resource-Constrained Devices was published in December 2017 at the 20th Annual International Conference on Information Security and Cryptology held in South Korea.

CHAM consists of three ciphers, CHAM-64/128, CHAM-128/128, and CHAM-128/256, which are based on a Generalized Feistel Network (GFN) structure.

This post will focus on CHAM-128/128, the variant with a 128-bit block and 128-bit key, operating on 4 branches of 32 bits each. The only operations used are 32-bit modular addition, rotation, and XOR (ARX).

The design of CHAM draws inspiration from the Speck and Simon block ciphers by the NSA. A cursory glance at the algorithm suggests CHAM is not ideal for hardware due to its use of modular addition, which is inefficient to implement in hardware.

Take this quote from the designers of the permutation function Keccak:

…the designer of an adder must choose between complexity (area, consumption) or gate delay (latency): it is either compact or fast, but not at the same time

Here are the parameters for all 3 variants, where N is the block length, K is the key length, R is the number of rounds, W is the width of a word, and K/W is the number of words per key (values from the CHAM paper):

  Cipher         N    K    R   W   K/W
  CHAM-64/128    64   128  80  16  8
  CHAM-128/128   128  128  80  32  4
  CHAM-128/256   128  256  96  32  8

Key schedule

For the 128-bit version, there are 8 round keys of 32 bits each, or 32 bytes in total.

#define R  80       // number of rounds
#define W  32       // word length
#define KW (128/W)  // number of words per key

void cham128_setkey(void *in, void *out)
{
    int i;
    uint32_t *k=(uint32_t*)in;
    uint32_t *rk=(uint32_t*)out;

    for (i=0; i<KW; i++) {
      rk[i] = k[i] ^ ROTL32(k[i], 1) ^ ROTL32(k[i], 8);
      rk[(i + KW) ^ 1] = k[i] ^ ROTL32(k[i], 1) ^ ROTL32(k[i], 11);
    }
}

The assembly may not be completely optimized here, but it’s the best I can do at the moment.

%define K 128   ; key length
%define N 128   ; block length
%define R 80    ; number of rounds
%define W 32    ; word length
%define KW K/W  ; number of words per key
     
cham128_setkeyx:
_cham128_setkeyx:
    pushad
    mov    esi, [esp+32+4]  ; k  = in
    mov    edi, [esp+32+8]  ; rk = out    
    xor    eax, eax         ; i  = 0
sk_l0:
    mov    ebx, [esi+eax*4] ; ebx = k[i]
    mov    ecx, ebx         ; ecx = k[i]
    mov    edx, ebx         ; edx = k[i]    
    rol    ebx, 1           ; ROTL32(k[i], 1)
    rol    ecx, 8           ; ROTL32(k[i], 8)    
    xor    edx, ebx         ; k[i] ^ ROTL32(k[i], 1) 
    xor    edx, ecx    
    mov    [edi+eax*4], edx ; rk[i] = edx
    xor    edx, ecx         ; reset edx
    rol    ecx, 3           ; k[i] ^ ROTL32(k[i], 11)
    xor    edx, ecx    
    lea    ebx, [eax+KW]
    xor    ebx, 1
    mov    [edi+ebx*4], edx ; rk[(i + KW) ^ 1] = edx
    inc    al
    cmp    al, KW
    jnz    sk_l0    
    popad
    ret

Encryption

There are 80 rounds in total, which is significantly more than the 32 used for Speck.

void cham128_encrypt(void *keys, void *data)
{
    int i;
    uint32_t x0, x1, x2, x3;
    uint32_t t;
    uint32_t *rk=(uint32_t*)keys;
    uint32_t *x=(uint32_t*)data;
    
    x0 = x[0]; x1 = x[1];
    x2 = x[2]; x3 = x[3];

    for (i=0; i<R; i++)
    {
      if ((i & 1) == 0) {
        x0 = ROTL32((x0 ^ i) + (ROTL32(x1, 1) ^ rk[i & 7]), 8);
      } else {
        x0 = ROTL32((x0 ^ i) + (ROTL32(x1, 8) ^ rk[i & 7]), 1);
      }
      XCHG(x0, x1);
      XCHG(x1, x2);
      XCHG(x2, x3);
    }
    x[0] = x0; x[1] = x1;
    x[2] = x2; x[3] = x3;
}

Only encryption is shown here, since if you implement CTR mode, decryption isn’t necessary.

%define x0 ebp
%define x1 ebx
%define x2 edx
%define x3 esi
%define rk edi

cham128_encryptx:
_cham128_encryptx:
    pushad
    mov    esi, [esp+32+8] ; x = data
    push   esi
    lodsd
    xchg   eax, x0
    lodsd
    xchg   eax, x1
    lodsd
    xchg   eax, x2
    lodsd
    xchg   eax, x3
    xor    eax, eax ; i = 0
    ; reset round key pointer every 8 rounds
enc_l0: 
    mov    edi, [esp+32+8] ; k = keys (arg1; offsets shifted by the push esi above)
    jmp    enc_lx    
enc_l1:
    test   al, 7    ; i & 7
    jz     enc_l0
enc_lx:    
    push   eax      ; save i
    mov    cx, 0x0108
    test   al, 1    ; if ((i & 1)==0)
    jnz    enc_l2
    
    xchg   cl, ch
enc_l2:
    xor    x0, eax          ; x0 ^= i
    mov    eax, x1
    rol    eax, cl          ; 
    xor    eax, [edi]       ; ROTL32(x1, r0) ^ *rk++
    scasd
    add    x0, eax
    xchg   cl, ch
    rol    x0, cl
    
    xchg   x0, x1          ; XCHG(x0, x1);
    xchg   x1, x2          ; XCHG(x1, x2);
    xchg   x2, x3          ; XCHG(x2, x3);
    
    pop    eax      ; restore i
    inc    al       ; i++
    cmp    al, R    ; i<R
    jnz    enc_l1
    
    pop    edi
    xchg   eax, x0
    stosd           ; x[0] = x0;
    xchg   eax, x1
    stosd           ; x[1] = x1;
    xchg   eax, x2
    stosd           ; x[2] = x2;
    xchg   eax, x3
    stosd           ; x[3] = x3;
    popad
    ret

The total size of the assembly at the moment is 128 bytes, although I will try to optimize it more later.

Summary

CHAM is a relatively new block cipher clearly influenced by Speck and Simon. Although the strong variants of Speck and Simon remain unbroken, that hasn’t prevented some in the cryptographic community from being cynical about the intentions of the NSA.

Orr Dunkelman, a computer science professor at the University of Haifa, commented: “I don’t trust the designers”, and “There are quite a lot of people in NSA who think their job is to subvert standards. My job is to secure standards.”

Daniel Bernstein also commented: “NSA says: We need a ‘generalist’ cipher that works well everywhere, so here are 20 incompatible Simon+Speck variants.” https://eprint.iacr.org/2015/585

As most of the attacks appear ad hominem, is it fair to dismiss Speck and Simon even when the rationale behind the designs has been published?

When asked if it could beat Simon and Speck encryption, NSA officials are reported to have said: “We firmly believe they are secure,” which neither confirms nor denies that the NSA can break these types of ciphers.

Then again, if the NSA is capable of breaking Speck and Simon, might they also know how to break other ARX designs like ChaCha, LEA and CHAM?

A final vote on whether Speck/Simon are accepted as part of an ISO standard takes place in February 2018.


Magic “Nothing Up My Sleeve” Numbers

Introduction

Just to be clear, this post does not endorse any cryptographic algorithms. It will only illustrate using source code how some magic numbers are created in such algorithms.

Nothing Up My Sleeve Numbers (NUMSN) are numeric constants in cryptographic algorithms that come with a reasonable explanation from the designer of how they were created.

The purpose of explaining in detail how the numbers were created is to diminish any fears among users that the numbers have hidden properties which might purposely introduce a weakness only the designer knows about.

The motivation behind NUMSN is probably related to the Data Encryption Standard (DES) which as some of you know was designed by IBM with assistance from the wonderful NSA 🙂

The design criteria for DES were never published, and it was revealed that the NSA changed the s-boxes without any explanation before it was officially made a standard.

Naturally, this change to the s-boxes, in addition to reducing the key length from 64 bits to 56 bits, provoked accusations of tampering and of deliberately introducing weaknesses into the cipher. It was suggested by some in the cryptographic community that DES was weakened sufficiently to be compromised by the NSA if they ever needed to.

That the NSA introduced a weakness beyond reducing the key length from 64 bits to 56 bits was neither proved nor disproved.

It was revealed later that the changes to the s-boxes improved resistance to differential cryptanalysis, an attack known to the IBM designers, but which remained secret until being independently discovered and published by Eli Biham and Adi Shamir around 1990.

As shown in Backdoors up my Sleeve by JP Aumasson, using NUMSN doesn’t automatically guarantee the security of a cryptographic algorithm, but any algorithm whose numbers lack a rational explanation or a verifiable formula is generally going to be viewed with much more suspicion.

For example, there’s been suspicion surrounding the algorithms that are part of the Chinese Trusted Platform Module (TPM).

They are as follows:

  • SM2 (Elliptic Curve Algorithm to rival ECDSA-P256)
  • SM3 (Cryptographic Hash to rival SHA-2)
  • SM4 (Block Cipher to rival AES-128)
  • ZUC (Stream Cipher)

All of the above algorithms contain magic numbers without any explanation of how they were created, and even though these algorithms remain unbroken by anyone in the public domain, that doesn’t mean they don’t have weaknesses built in.

The NUMSN I’ll cover in this post include: MD2, MD4, MD5, SHA-1, SHA-2, BLAKE2, Blowfish, AES, SEED, SAFER, XTEA, Gimli, RC5, RC6, Kuznyechik and ARIA.

If you just want to look at the source code for the above, see the repo here.

MD2 cryptographic hash

RFC 1319, published in 1992 by Ron Rivest, provides no explanation of how the PI table was generated beyond the following:

This step uses a 256-byte “random” permutation constructed from the digits of pi. Let S[i] denote the i-th element of this table. The table is given in the appendix.

Eventually someone asked the question on Crypto Stack Exchange: How is the MD2 S-Table constructed from Pi?

Señor Crypto (Mike Azo) requested an explanation from Rivest via email, and we finally discovered the algorithm used: a Fisher–Yates style shuffle driven by the digits of pi.

// The first 722 digits of PI
uint32_t PI[722]=
{0x03, 0x01, 0x04, 0x01, 0x05, 0x09, 0x02, 0x06,
 0x05, 0x03, 0x05, 0x08, 0x09, 0x07, 0x09, 0x03,
 0x02, 0x03, 0x08, 0x04, 0x06, 0x02, 0x06, 0x04,
 0x03, 0x03, 0x08, 0x03, 0x02, 0x07, 0x09, 0x05,
 0x00, 0x02, 0x08, 0x08, 0x04, 0x01, 0x09, 0x07,
 0x01, 0x06, 0x09, 0x03, 0x09, 0x09, 0x03, 0x07,
 0x05, 0x01, 0x00, 0x05, 0x08, 0x02, 0x00, 0x09,
 0x07, 0x04, 0x09, 0x04, 0x04, 0x05, 0x09, 0x02,
 0x03, 0x00, 0x07, 0x08, 0x01, 0x06, 0x04, 0x00,
 0x06, 0x02, 0x08, 0x06, 0x02, 0x00, 0x08, 0x09,
 0x09, 0x08, 0x06, 0x02, 0x08, 0x00, 0x03, 0x04,
 0x08, 0x02, 0x05, 0x03, 0x04, 0x02, 0x01, 0x01,
 0x07, 0x00, 0x06, 0x07, 0x09, 0x08, 0x02, 0x01,
 0x04, 0x08, 0x00, 0x08, 0x06, 0x05, 0x01, 0x03,
 0x02, 0x08, 0x02, 0x03, 0x00, 0x06, 0x06, 0x04,
 0x07, 0x00, 0x09, 0x03, 0x08, 0x04, 0x04, 0x06,
 0x00, 0x09, 0x05, 0x05, 0x00, 0x05, 0x08, 0x02,
 0x02, 0x03, 0x01, 0x07, 0x02, 0x05, 0x03, 0x05,
 0x09, 0x04, 0x00, 0x08, 0x01, 0x02, 0x08, 0x04,
 0x08, 0x01, 0x01, 0x01, 0x07, 0x04, 0x05, 0x00,
 0x02, 0x08, 0x04, 0x01, 0x00, 0x02, 0x07, 0x00,
 0x01, 0x09, 0x03, 0x08, 0x05, 0x02, 0x01, 0x01,
 0x00, 0x05, 0x05, 0x05, 0x09, 0x06, 0x04, 0x04,
 0x06, 0x02, 0x02, 0x09, 0x04, 0x08, 0x09, 0x05,
 0x04, 0x09, 0x03, 0x00, 0x03, 0x08, 0x01, 0x09,
 0x06, 0x04, 0x04, 0x02, 0x08, 0x08, 0x01, 0x00,
 0x09, 0x07, 0x05, 0x06, 0x06, 0x05, 0x09, 0x03,
 0x03, 0x04, 0x04, 0x06, 0x01, 0x02, 0x08, 0x04,
 0x07, 0x05, 0x06, 0x04, 0x08, 0x02, 0x03, 0x03,
 0x07, 0x08, 0x06, 0x07, 0x08, 0x03, 0x01, 0x06,
 0x05, 0x02, 0x07, 0x01, 0x02, 0x00, 0x01, 0x09,
 0x00, 0x09, 0x01, 0x04, 0x05, 0x06, 0x04, 0x08,
 0x05, 0x06, 0x06, 0x09, 0x02, 0x03, 0x04, 0x06,
 0x00, 0x03, 0x04, 0x08, 0x06, 0x01, 0x00, 0x04,
 0x05, 0x04, 0x03, 0x02, 0x06, 0x06, 0x04, 0x08,
 0x02, 0x01, 0x03, 0x03, 0x09, 0x03, 0x06, 0x00,
 0x07, 0x02, 0x06, 0x00, 0x02, 0x04, 0x09, 0x01,
 0x04, 0x01, 0x02, 0x07, 0x03, 0x07, 0x02, 0x04,
 0x05, 0x08, 0x07, 0x00, 0x00, 0x06, 0x06, 0x00,
 0x06, 0x03, 0x01, 0x05, 0x05, 0x08, 0x08, 0x01,
 0x07, 0x04, 0x08, 0x08, 0x01, 0x05, 0x02, 0x00,
 0x09, 0x02, 0x00, 0x09, 0x06, 0x02, 0x08, 0x02,
 0x09, 0x02, 0x05, 0x04, 0x00, 0x09, 0x01, 0x07,
 0x01, 0x05, 0x03, 0x06, 0x04, 0x03, 0x06, 0x07,
 0x08, 0x09, 0x02, 0x05, 0x09, 0x00, 0x03, 0x06,
 0x00, 0x00, 0x01, 0x01, 0x03, 0x03, 0x00, 0x05,
 0x03, 0x00, 0x05, 0x04, 0x08, 0x08, 0x02, 0x00,
 0x04, 0x06, 0x06, 0x05, 0x02, 0x01, 0x03, 0x08,
 0x04, 0x01, 0x04, 0x06, 0x09, 0x05, 0x01, 0x09,
 0x04, 0x01, 0x05, 0x01, 0x01, 0x06, 0x00, 0x09,
 0x04, 0x03, 0x03, 0x00, 0x05, 0x07, 0x02, 0x07,
 0x00, 0x03, 0x06, 0x05, 0x07, 0x05, 0x09, 0x05,
 0x09, 0x01, 0x09, 0x05, 0x03, 0x00, 0x09, 0x02,
 0x01, 0x08, 0x06, 0x01, 0x01, 0x07, 0x03, 0x08,
 0x01, 0x09, 0x03, 0x02, 0x06, 0x01, 0x01, 0x07,
 0x09, 0x03, 0x01, 0x00, 0x05, 0x01, 0x01, 0x08,
 0x05, 0x04, 0x08, 0x00, 0x07, 0x04, 0x04, 0x06,
 0x02, 0x03, 0x07, 0x09, 0x09, 0x06, 0x02, 0x07,
 0x04, 0x09, 0x05, 0x06, 0x07, 0x03, 0x05, 0x01,
 0x08, 0x08, 0x05, 0x07, 0x05, 0x02, 0x07, 0x02,
 0x04, 0x08, 0x09, 0x01, 0x02, 0x02, 0x07, 0x09,
 0x03, 0x08, 0x01, 0x08, 0x03, 0x00, 0x01, 0x01,
 0x09, 0x04, 0x09, 0x01, 0x02, 0x09, 0x08, 0x03,
 0x03, 0x06, 0x07, 0x03, 0x03, 0x06, 0x02, 0x04,
 0x04, 0x00, 0x06, 0x05, 0x06, 0x06, 0x04, 0x03,
 0x00, 0x08, 0x06, 0x00, 0x02, 0x01, 0x03, 0x09,
 0x04, 0x09, 0x04, 0x06, 0x03, 0x09, 0x05, 0x02,
 0x02, 0x04, 0x07, 0x03, 0x07, 0x01, 0x09, 0x00,
 0x07, 0x00, 0x02, 0x01, 0x07, 0x09, 0x08, 0x06,
 0x00, 0x09, 0x04, 0x03, 0x07, 0x00, 0x02, 0x07,
 0x07, 0x00, 0x05, 0x03, 0x09, 0x02, 0x01, 0x07,
 0x01, 0x07, 0x06, 0x02, 0x09, 0x03, 0x01, 0x07,
 0x06, 0x07, 0x05, 0x02, 0x03, 0x08, 0x04, 0x06,
 0x07, 0x04, 0x08, 0x01, 0x08, 0x04, 0x06, 0x07,
 0x06, 0x06, 0x09, 0x04, 0x00, 0x05, 0x01, 0x03,
 0x02, 0x00, 0x00, 0x00, 0x05, 0x06, 0x08, 0x01,
 0x02, 0x07, 0x01, 0x04, 0x05, 0x02, 0x06, 0x03,
 0x05, 0x06, 0x00, 0x08, 0x02, 0x07, 0x07, 0x08,
 0x05, 0x07, 0x07, 0x01, 0x03, 0x04, 0x02, 0x07,
 0x05, 0x07, 0x07, 0x08, 0x09, 0x06, 0x00, 0x09,
 0x01, 0x07, 0x03, 0x06, 0x03, 0x07, 0x01, 0x07,
 0x08, 0x07, 0x02, 0x01, 0x04, 0x06, 0x08, 0x04,
 0x04, 0x00, 0x09, 0x00, 0x01, 0x02, 0x02, 0x04,
 0x09, 0x05, 0x03, 0x04, 0x03, 0x00, 0x01, 0x04,
 0x06, 0x05, 0x04, 0x09, 0x05, 0x08, 0x05, 0x03,
 0x07, 0x01, 0x00, 0x05, 0x00, 0x07, 0x09, 0x02,
 0x02, 0x07, 0x09, 0x06, 0x08, 0x09, 0x02, 0x05,
 0x08, 0x09, 0x02, 0x03, 0x05, 0x04, 0x02, 0x00,
 0x01, 0x09, 0x09, 0x05, 0x06, 0x01, 0x01, 0x02,
 0x01, 0x02, 0x09, 0x00, 0x02, 0x01, 0x09, 0x06,
 0x00, 0x08 };

uint32_t md2_rand(uint32_t n) {
    uint32_t        x, y;
    static uint32_t pos = 0;
    
    x = PI[pos]; pos++;
    y = 10;    
    
    if (n > 10) {
      x = (x * 10) + PI[pos]; pos++;      
      y = 100;
    }
    if (n > 100) {
      x = (x * 10) + PI[pos]; pos++;
      y = 1000;
    }
    if (x < (n * (y / n))) {
      return (x % n);
    }
    return md2_rand(n);
}

int main(void)
{
    uint32_t i, j, t, S[256];
    
    // Initialize s-box
    for (i=0; i<256; i++) {
      S[i] = i;
    }
    // Shuffle the S-box
    for (i=2; i<=256; i++) {
        j = md2_rand(i);
        t = S[j];
        S[j] = S[i-1];
        S[i-1] = t;
    }
    // Output the S-box
    bin2hex("MD2 PI sbox", S, 256); 
    return 0;
}

MD4 cryptographic hash

RFC 1320, published in 1992 by Ron Rivest, uses 2 unique constants in its compression function.

The value 5A..99 is a hexadecimal 32-bit constant, written with the high-order digit first. This constant represents the square root of 2. The octal value of this constant is 013240474631.

The value 6E..A1 is a hexadecimal 32-bit constant, written with the high-order digit first. This constant represents the square root of 3. The octal value of this constant is 015666365641.

See Knuth, The Art of Programming, Volume 2 (Seminumerical Algorithms), Second Edition (1981), Addison-Wesley. Table 2, page 660.

0x5a827999 is derived from the square root of 2 multiplied by 2**30 (sqrt(2) * 2**30 = 1518500249 = 0x5a827999)
0x6ed9eba1 is derived from the square root of 3 multiplied by 2**30 (sqrt(3) * 2**30 = 1859775393 = 0x6ed9eba1)

#include <math.h>

#pragma intrinsic(fabs,pow,sqrt)

uint32_t p[2]={2,3};

uint32_t sqrt2int (uint32_t x) {
    uint32_t r;
    r = (uint32_t)(fabs(sqrt((double)p[x]))*pow(2,30));
    return r;
}

int main(void)
{
    int      i;
    uint32_t k[2];
        
    // create K constants
    for (i=0; i<2; i++) {
      k[i] = sqrt2int(i);
    }
    bin2hex("MD4 K constants", k, 2);  
    return 0;
}

MD5 cryptographic hash

RFC 1321, published in 1992, uses 64 unique constants in the compression function. These are calculated by passing 1-64 (in radians) to the sine function; the absolute value of each result is multiplied by 2**32 before conversion to an integer. For example, abs(sin(1)) * 2**32 = 3614090360 = 0xd76aa478, the first T constant.

#pragma intrinsic(fabs,pow,sin)

uint32_t sin2int (uint32_t i)
{
    uint32_t r;
    r = (uint32_t)(fabs(sin(i)*pow(2,32)));
    return r;
}

int main(void)
{
    int      i;
    uint32_t t[64];

    // create T constants
    for (i=0; i<64; i++) {
      t[i] = sin2int(i+1);
    }
    bin2hex("MD5 T constants", t, 64);  
    return 0;
}

SHA-1 cryptographic hash

Designed by the NSA and published in 1995, this algorithm was definitely inspired by MD4.

The K constants used in SHA-1 are generated exactly the same way as those in MD4. There are 2 additional values, derived from 5 and 10 respectively.

#pragma intrinsic(fabs,pow,sqrt)

uint32_t p[4]={2,3,5,10};

uint32_t sqrt2int (uint32_t x) {
    uint32_t r;
    r = (uint32_t)(fabs(sqrt((double)p[x]))*pow(2,30));
    return r;
}

int main(void)
{
    int      i;
    uint32_t k[4];
        
    // create K constants
    for (i=0; i<4; i++) {
      k[i] = sqrt2int(i);
    }
    bin2hex("SHA-1 K constants", k, 4);    
    return 0;
}

SHA-2 cryptographic hash

SHA-2 was published in 2001. The K constants are generated similarly to the T constants in MD5, except the first 64 prime numbers are used instead.

The H values are derived from the square roots of the first 8 primes.
The K values are derived from the cube roots of the first 64 primes.

#pragma intrinsic(fabs,pow,sqrt)

uint32_t p[64] =
{  2,   3,   5,   7,  11,  13,  17,  19, 
  23,  29,  31,  37,  41,  43,  47,  53,
  59,  61,  67,  71,  73,  79,  83,  89, 
  97, 101, 103, 107, 109, 113, 127, 131,
 137, 139, 149, 151, 157, 163, 167, 173, 
 179, 181, 191, 193, 197, 199, 211, 223,
 227, 229, 233, 239, 241, 251, 257, 263, 
 269, 271, 277, 281, 283, 293, 307, 311 };

// square root of integer, return fractional part as integer
uint32_t sqrt2int (uint32_t x) {
    double r = sqrt((double)p[x]);
    // subtract the integer part before scaling, so the
    // result always fits in 32 bits
    return (uint32_t)((r - floor(r)) * pow(2,32));
}

// cube root of integer, return fractional part as integer
uint32_t cbr2int (uint32_t x) {
    double r = pow((double)p[x], 1.0/3.0);
    return (uint32_t)((r - floor(r)) * pow(2,32));
}

int main(void)
{
    int      i;
    uint32_t h[8], k[64];
    
    // create H constants
    for (i=0; i<8; i++) {
      h[i] = sqrt2int(i);
    }
    bin2hex("SHA-256 H constants", h, 8);
    
    // create K constants
    for (i=0; i<64; i++) {
      k[i] = cbr2int(i);
    }
    bin2hex("SHA-256 K constants", k, 64);  
    return 0;
}

Blowfish Block Cipher

Designed by Bruce Schneier in 1993 and published in 1994, Blowfish became very popular among people who did not trust the security of DES.

The P and S boxes are derived from 8336 hexadecimal digits of Pi, computed here using the Bailey–Borwein–Plouffe (BBP) formula.

I’ve used code by David Bailey which you can find here.

static double expm (double p, double ak)

/*  expm = 16^p mod ak.  
  This routine uses the left-to-right binary 
  exponentiation scheme. */

{
  int i, j;
  double p1, pt, r;
#define ntp 25
  static double tp[ntp];
  static int tp1 = 0;

/*  If this is the first call to expm, 
  fill the power of two table tp. */

  if (tp1 == 0) {
    tp1 = 1;
    tp[0] = 1.;

    for (i = 1; i < ntp; i++) tp[i] = 2. * tp[i-1];
  }

  if (ak == 1.) return 0.;

/*  Find the greatest power of two less than or equal to p. */

  for (i = 0; i < ntp; i++) if (tp[i] > p) break;

  pt = tp[i-1];
  p1 = p;
  r = 1.;

/*  Perform binary exponentiation algorithm modulo ak. */

  for (j = 1; j <= i; j++){
    if (p1 >= pt){
      r = 16. * r;
      r = r - (int) (r / ak) * ak;
      p1 = p1 - pt;
    }
    pt = 0.5 * pt;
    if (pt >= 1.){
      r = r * r;
      r = r - (int) (r / ak) * ak;
    }
  }

  return r;
}

static double series (int m, int id)

/*  This routine evaluates the series  sum_k 16^(id-k)/(8*k+m) 
    using the modular exponentiation technique. */

{
  int k;
  double ak, p, s, t;
#define eps 1e-17

  s = 0.;

/*  Sum the series up to id. */

  for (k = 0; k < id; k++){
    ak = 8 * k + m;
    p = id - k;
    t = expm (p, ak);
    s = s + t / ak;
    s = s - (int) s;
  }

/*  Compute a few terms where k >= id. */

  for (k = id; k <= id + 100; k++){
    ak = 8 * k + m;
    t = pow (16., (double) (id - k)) / ak;
    if (t < eps) break;
    s = s + t;
    s = s - (int) s;
  }
  return s;
}

unsigned char get_byte(int id)
{
  double y;
  unsigned char first, second;
  double s1 = series (1, id);
  double s2 = series (4, id);
  double s3 = series (5, id);
  double s4 = series (6, id);
  double pid = 4. * s1 - 2. * s2 - s3 - s4;
  
  pid = pid - (int) pid + 1.;

  y = fabs(pid);
  y = 16. * (y - floor (y));
  first = y;
  //y = 16. * (y - floor (y));
  //second = y;
  return first; //(first << 4) | second;
}

// generates digits for blowfish... very slow 😦   
// apparently Fabrice Bellard's formula is faster
unsigned int boxes[1024+18];
    
int main(void)
{
    int i, j;
    unsigned int x, *p;
    
    p = (unsigned int*)boxes;
    
    printf ("\nGenerating digits of Pi"
            "\nThis might take a while...\n");
    
    for (i=0; i<(1024+18)*8; i+=8) {    
      x = 0;
      for (j=0; j<8; j++) {
        x <<= 4;      
        x |= get_byte(i+j);
      }
      *p++ = x;    
    }
    printf (" // p-box\n");
    // p-box
    for (i=0; i<18; i++) {
      if ((i & 3)==0) putchar('\n');
      printf (" 0x%08x,", boxes[i]);
    }
    
    // s-box
    for (i=0; i<1024; i++) {
      if ((i & 3)==0) putchar('\n');
      if ((i & 255)==0) { 
        printf(" // sbox-%i\n", (i/256)+1); 
      }
      printf (" 0x%08x,", boxes[i+18]);    
    }  
    return 0;
}

Rijndael Block Cipher

The Advanced Encryption Standard, published in 2001, was designed by Joan Daemen and Vincent Rijmen.

// Multiplication
uint8_t gf_mul(uint8_t x, uint8_t y, uint8_t p)
{
    uint8_t z = 0;

    while (y) {
      if (y & 1) {
        z ^= x;
      }
      x = (x << 1) ^ (x & 0x80 ? p : 0x00);
      y >>= 1;
    }
    return z;
}

int main (void)
{
    uint8_t gf_log[256], s[256], s_inv[256];
    int     i, x;

    for (i=0, x=1; i<256; i++) {
      gf_log[i] = x;
      x ^= gf_mul(2, x, 0x1b);
    }

    s[0] = 0x63;
    
    for (i=0; i<255; i++) {
      x = gf_log[255 - i];
      x |= x << 8;
      x ^= (x >> 4) ^ (x >> 5) ^ (x >> 6) ^ (x >> 7);
      s[gf_log[i]] = (x ^ 0x63);
    }
    
    for (i=0; i<256; i++) {
      s_inv[s[i]] = i;
    }
  
    bin2hex("AES sbox", s, 256);
    bin2hex("AES inverse sbox", s_inv, 256);    
    return 0;
}

SAFER Block Cipher

Secure And Fast Encryption Routine (SAFER) is a family of block ciphers designed by James Lee Massey; variants of it were submitted to both the AES and NESSIE competitions.

// Modular exponentiation
int powmod (int b, int e, int m)
{
    int r = 1;
    
    while (e > 0) {
      if (e & 1) {
        r = r * b % m;
      }
      b = b * b % m;
      e >>= 1;
    }
    return r;
}

int main(void)
{  
    int      x, i;
    uint8_t  s0[256], s1[256];

    // create sbox
    for (x=0; x<256; x++) {
      s0[x] = powmod(45, x, 257) % 256;            
    }
    // create inverse sbox
    for (i=0; i<256; i++) {
      s1[s0[i]] = i;
    }
    bin2hex("SAFER sbox", s0, 256);
    bin2hex("SAFER inverse sbox", s1, 256);   
    return 0;
}

SEED Block Cipher

SEED was published in 1998 by the Korea Information Security Agency, directly in response to concerns about export restrictions on cryptography imposed by the US government, which at the time only allowed 40-bit RC4 encryption 🙂

// matrix used for s-box 1 generation in seed
uint8_t A1[8][8] = {
    {1,0,0,0,1,0,1,0},
    {1,1,1,1,1,1,1,0},
    {1,0,0,0,0,1,0,1},
    {0,1,0,0,0,0,1,0},
    {0,1,0,0,0,1,0,1},
    {0,0,1,0,0,0,0,1},
    {1,0,0,0,1,0,0,0}, 
    {0,0,0,1,0,1,0,0}, 
};

// matrix used for s-box 2 generation in seed
uint8_t A2[8][8] = {
    {0,1,0,0,0,1,0,1},
    {1,0,0,0,0,1,0,1},
    {1,1,1,1,1,1,1,0},
    {0,0,1,0,0,0,0,1},
    {1,0,0,0,1,0,1,0},
    {1,0,0,0,1,0,0,0},
    {0,1,0,0,0,0,1,0},
    {0,0,0,1,0,1,0,0},
};

// Multiplication
uint8_t gf_mul(uint8_t x, uint8_t y, uint8_t p)
{
    uint8_t z = 0;

    while (y) {
      if (y & 1) {
        z ^= x;
      }
      x = (x << 1) ^ (x & 0x80 ? p : 0x00);
      y >>= 1;
    }
    return z;
}

// Exponentiation
uint8_t gf_exp(uint8_t b, uint8_t e, uint8_t p)
{
    uint8_t r = 1;
    uint8_t t = b;
    
    while (e > 0) {
      if (e & 1) {
        r = gf_mul(r, t, p);
      }
      t = gf_mul(t, t, p);
      e >>= 1;
    }
    return r;
}

// Matrix Multiplication
uint8_t matmul(uint8_t mat[8][8], uint8_t a) {
    uint8_t res = 0;
    int     x, y;
    
    for (x = 0; x < 8; x++) {
      if (a & (1 << (7 - x))) {
        for (y = 0; y < 8; y++) {
          res ^= mat[y][x] << (7 - y);
        }
      }
    }
    return res;
}

int main(void)
{  
    int      x, i;
    uint8_t  s0[256], s1[256];
    uint32_t g = 0x9e3779b9;
       
    for (x=0; x<256; x++) {
      s0[x] = matmul(A1, gf_exp(x, 247, 99)) ^ 169;
      s1[x] = matmul(A2, gf_exp(x, 251, 99)) ^ 56;      
    }

    bin2hex("SEED sbox", s0, 256);
    bin2hex("SEED inverse sbox", s1, 256);

    printf ("\n // SEED constants");
    
    for (i=0; i<16; i++) {
      if ((i & 1)==0) putchar('\n');
      printf (" 0x%08x, ", g);
      g = (g << 1) | (g >> (32 - 1)); // rotate left by 1
    }
    putchar('\n');    
    return 0;
}

ARIA Block Cipher

Another algorithm designed by the Korea Information Security Agency published in 2003.

This uses a matrix multiplication function from code by Markku-Juhani O. Saarinen, which performs the same operation as the SEED version above.

// matrix used for s-box 1 generation in aria
uint8_t A[8][8] = {
    {1,0,0,0,1,1,1,1}, 
    {1,1,0,0,0,1,1,1},
    {1,1,1,0,0,0,1,1},
    {1,1,1,1,0,0,0,1},
    {1,1,1,1,1,0,0,0},
    {0,1,1,1,1,1,0,0},
    {0,0,1,1,1,1,1,0}, 
    {0,0,0,1,1,1,1,1}, 
};

// matrix used for s-box 2 generation in aria
uint8_t B[8][8] = {
    {0,1,0,1,1,1,1,0}, 
    {0,0,1,1,1,1,0,1},
    {1,1,0,1,0,1,1,1},
    {1,0,0,1,1,1,0,1},
    {0,0,1,0,1,1,0,0},
    {1,0,0,0,0,0,0,1},
    {0,1,0,1,1,1,0,1}, 
    {1,1,0,1,0,0,1,1}, 
};

// Multiplication
uint8_t gf_mul(uint8_t x, uint8_t y, uint8_t p)
{
    uint8_t z = 0;

    while (y) {
      if (y & 1) {
        z ^= x;
      }
      x = (x << 1) ^ (x & 0x80 ? p : 0x00);
      y >>= 1;
    }
    return z;
}

// Exponentiation
uint8_t gf_exp(uint8_t b, uint8_t e, uint8_t p)
{
    uint8_t r = 1;
    uint8_t t = b;
    
    while (e > 0) {
      if (e & 1) {
        r = gf_mul(r, t, p);
      }
      t = gf_mul(t, t, p);
      e >>= 1;
    }
    return r;
}

uint8_t matmul8(uint8_t m[8][8], uint8_t x)
{
    int i, j;
    uint8_t y;
    
    y = 0;
    for (i = 0; i < 8; i++) {	
      if (x & (1 << i)) {
        for (j = 0; j < 8; j++)
          y ^= m[j][i] << j;
      }
    }	
    return y;
}

// modular inverse: computes a^254 == a^-1 in GF(2^8) using a short
// addition chain; gf_inv(0) conveniently returns 0, as AES requires
uint8_t gf_inv(uint8_t a) {
    uint8_t j, b = a;
    for (j = 14; --j;)
        b = gf_mul(b, j & 1 ? b : a, 0x1b); 
    return b;
}

int main(void) 
{
    int     i, p, x;
    uint8_t s1[256], s1_inv[256], s2[256], s2_inv[256];

    for (x=0; x<256; x++) {
      // generate s-box 1 
      s1[x] = matmul8(A, gf_inv(x)) ^ 0x63; 
      // generate s-box 2
      s2[x] = matmul8(B, gf_exp(x, 247, 0x1b)) ^ 0xE2; 
    }
    for (i=0; i<256; i++) {
      s1_inv[s1[i]] = i;
      s2_inv[s2[i]] = i;
    }    
    bin2hex("ARIA s1", s1, 256);    
    bin2hex("ARIA inverse s1", s1_inv, 256);    
    
    bin2hex("ARIA s2", s2, 256);    
    bin2hex("ARIA inverse s2", s2_inv, 256);    
    return 0;
}

Kuznyechik “Grasshopper” Block Cipher

GOST R 34.12-2015 is an algorithm designed by the Russian government to replace their legacy standard, GOST 28147-89.

Although no design criteria for the s-boxes were published, some cryptographers managed to reverse engineer the structure; see
Reverse-Engineering the S-Box of Streebog, Kuznyechik and STRIBOBr1

The following code was provided by the authors.

uint8_t inv[16]   = 
{0x0,0x1,0xc,0x8,0x6,0xf,0x4,0xe,0x3,0xd,0xb,0xa,0x2,0x9,0x7,0x5};

uint8_t nu_0[16]  = 
{0x2,0x5,0x3,0xb,0x6,0x9,0xe,0xa,0x0,0x4,0xf,0x1,0x8,0xd,0xc,0x7};

uint8_t nu_1[16]  = 
{0x7,0x6,0xc,0x9,0x0,0xf,0x8,0x1,0x4,0x5,0xb,0xe,0xd,0x2,0x3,0xa};

uint8_t sigma[16] = 
{0xc,0xd,0x0,0x4,0x8,0xb,0xa,0xe,0x3,0x9,0x5,0x2,0xf,0x1,0x6,0x7};

uint8_t phi[16]   = 
{0xb,0x2,0xb,0x8,0xc,0x4,0x1,0xc,0x6,0x3,0x5,0x8,0xe,0x3,0x6,0xb};

uint8_t alpha[8][8] = {
    {0,0,0,0,1,0,0,0},
    {0,1,0,0,0,0,0,1},
    {0,1,0,0,0,0,1,1},
    {1,1,1,0,1,1,1,1},
    {1,0,0,0,1,0,1,0},
    {0,1,0,0,0,1,0,0},
    {0,0,0,1,1,0,1,0},
    {0,0,1,0,0,0,0,0},
};

uint8_t omega[8][8] = {
    {0,0,0,0,1,0,1,0},
    {0,0,0,0,0,1,0,0},
    {0,0,1,0,0,0,0,0},
    {1,0,0,1,1,0,1,0},
    {0,0,0,0,1,0,0,0},
    {0,1,0,0,0,1,0,0},
    {1,0,0,0,0,0,1,0},
    {0,0,0,0,0,0,0,1},
};

// multiplication by matrix over GF(2)
uint8_t matmul(uint8_t a, uint8_t mat[8][8]) {
  uint8_t res = 0;
  int x, y;
  
  for (x = 0; x < 8; x++)
    if (a & (1 << (7 - x)))
      for (y = 0; y < 8; y++)
        res ^= mat[y][x] << (7 - y);
  return res;
}

uint8_t gf_mul(uint8_t a, uint8_t b) {
  uint8_t p = 0;
  
  while (b) {
    if (b & 1) 
      p ^= a;
    if (a & 0x8)
      a = (a << 1) ^ 0x19;  // x^4 + x^3 + 1
    else
      a <<= 1;
    b >>= 1;
  }
  return p;
}

int main(int argc, char *argv[]) {
    int i, x, y, l, r;
    uint8_t pi[256], pi_inv[256];
 
    // generate sbox 
    for(x = 0; x < 256; x++) {
        y = matmul(x, alpha);
        l = (y >> 4) & 0xf;
        r = y & 0xf;
        l = (r == 0) ? nu_0[l] : nu_1[gf_mul(l, inv[r])];
        r = sigma[gf_mul(r, phi[l])];
        y = matmul((l << 4) | r, omega);
        pi[x] = y;
    }
    // generate inverse sbox
    for (i=0; i<256; i++) {
      pi_inv[pi[i]]=i;
    }
    
    bin2hex("Kuznyechik sbox", pi, 256);
    bin2hex("Kuznyechik inverse sbox", pi_inv, 256);
    return 0;
}

Salsa / ChaCha Stream ciphers

Salsa20, published in 2005, and ChaCha, published in 2008, were both designed by Daniel J. Bernstein.

The 128-bit key version uses the string “expand 16-byte k” to initialize the state.
The 256-bit key version uses the string “expand 32-byte k” to initialize the state.
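
A minimal sketch of how those strings are used (my illustration, not the reference code): the 16 bytes are simply copied into four 32-bit state words, little-endian.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    const char *sigma = "expand 32-byte k";  // 256-bit key constant
    uint32_t   c[4];
    int        i;

    for (i=0; i<4; i++) {
      // assumes a little-endian host
      memcpy(&c[i], sigma + i*4, 4);
      // prints 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
      printf ("0x%08x\n", c[i]);
    }
    return 0;
}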

SipHash PRF

SipHash is a Pseudo Random Function (PRF) designed by Jean-Philippe Aumasson and Daniel J. Bernstein.

It initializes the state with the string “somepseudorandomlygeneratedbytes”, big-endian encoded.
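
The following sketch (mine, not the SipHash reference code) shows how the 32-byte string splits into the four 64-bit initialization constants; the key words k0 and k1 are then XORed against them.

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint8_t s[32] = "somepseudorandomlygeneratedbytes";
    uint64_t      v;
    int           i, j;

    for (i=0; i<4; i++) {
      v = 0;
      for (j=0; j<8; j++) {
        v = (v << 8) | s[i*8 + j];  // big-endian packing
      }
      // prints 0x736f6d6570736575, 0x646f72616e646f6d,
      //        0x6c7967656e657261, 0x7465646279746573
      printf ("0x%016llx\n", (unsigned long long)v);
    }
    return 0;
}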

Various Algorithms

The block ciphers RC5, RC6, XTEA, Serpent, and more recently the permutation function Gimli, all use the Golden Ratio in hexadecimal format: 0x9E3779B9

RC5 and RC6 also use Euler's number minus 2: 0xB7E15163

uint32_t golden_ratio(void) {
    uint32_t r;
    r = (uint32_t)(fabs((sqrt((double)5)-1)/2)*pow(2,32));
    return r;
}

uint32_t euler_number(void) {
    uint32_t r;
    // Odd((e - 2) * 2**32): the +1 turns the truncated (even)
    // result into the odd value specified by the RC5 paper
    r = (uint32_t)(fabs(exp((double)1)-2)*pow(2,32)+1);
    return r;
}

int main(void)
{
  printf ("\nEuler Number = %08X\n", euler_number());  
  printf ("\nGolden Ratio = %08X\n", golden_ratio());
  return 0;
}

Summary

Wasn’t that fun? 😛 Now it all makes sense, right?

Although most of what I posted here is just source code, I wanted to compile various algorithms used to generate NUMSN values in one place.

There are still many algorithms out there whose designers refuse to explain the source of inspiration for their magic numbers, and this inevitably means people will distrust those algorithms.

For source code to all of the algorithms mentioned above, see here


Ascon Permutation Function

Introduction

Ascon is an Authenticated Encryption with Associated Data (AEAD) algorithm submitted to The Competition for Authenticated Encryption: Security, Applicability, and Robustness (CAESAR)

It was designed by Christoph Dobraunig, Maria Eichlseder, Florian Mendel and Martin Schläffer

Some of the authors mentioned are also behind designs such as the Gimli permutation function and the Grøstl cryptographic hash algorithm, so even if Ascon doesn’t make the final portfolio for CAESAR, the permutation function may still be useful in other applications running on 64-bit architectures.

C code

This is taken directly from the implementation in SUPERCOP.

void ascon_permute(void *state, int rnds) {
    int      i;
    uint64_t x0, x1, x2, x3, x4;
    uint64_t t0, t1, t2, t3, t4;
    
    uint64_t *x=(uint64_t*)state;
    
    // load 320-bit state
    x0 = x[0]; x1 = x[1];
    x2 = x[2]; x3 = x[3];
    x4 = x[4];

    for (i=0; i<rnds; i++) {
      // addition of round constant
      // (0xf0, 0xe1, 0xd2, ... for the full 12 rounds)
      x2 ^= ((0xfull - i) << 4) | i;

      // substitution layer
      x0 ^= x4;  x4 ^= x3;  x2 ^= x1;
      t0  = x0;  t1  = x1;  t2  = x2;  t3  =  x3; t4  = x4;
      t0  = ~t0; t1  = ~t1; t2  = ~t2; t3  = ~t3; t4  = ~t4;
      t0 &= x1;  t1 &= x2;  t2 &= x3;  t3 &=  x4; t4 &= x0;
      x0 ^= t1;  x1 ^= t2;  x2 ^= t3;  x3 ^=  t4; x4 ^= t0;
      x1 ^= x0;  x0 ^= x4;  x3 ^= x2;  x2  = ~x2;

      // linear diffusion layer
      x0 ^= ROTR(x0, 19) ^ ROTR(x0, 28);
      x1 ^= ROTR(x1, 61) ^ ROTR(x1, 39);
      x2 ^= ROTR(x2,  1) ^ ROTR(x2,  6);
      x3 ^= ROTR(x3, 10) ^ ROTR(x3, 17);
      x4 ^= ROTR(x4,  7) ^ ROTR(x4, 41);

    }
    // save 320-bit state
    x[0] = x0; x[1] = x1;
    x[2] = x2; x[3] = x3;
    x[4] = x4;
}
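
For reference, a minimal usage sketch (my example, not SUPERCOP code): the state parameter should point to a 40-byte buffer holding the five 64-bit lanes, and Ascon-128 applies the full 12 rounds during initialization and finalization.

#include <stdint.h>
#include <string.h>

void ascon_permute(void *state, int rnds);  // defined above

int main(void) {
    uint8_t state[40];           // 320-bit state

    memset(state, 0, sizeof(state));
    ascon_permute(state, 12);    // p^12 as used by Ascon-128
    return 0;
}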

x86-64 assembly

%define x0 rbx
%define x1 rdx
%define x2 rdi
%define x3 rsi
%define x4 rbp

%define t0 r8
%define t1 r9
%define t2 r10
%define t3 r11
%define t4 r12

%define x rcx
%define r rdx
%define i rax
    
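; void ascon_permutex(void *state, int rnds);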
ascon_permutex:
    push   rsi
    push   rbx
    push   rdi
    push   rbp
    push   r12

    push   r
        
    ; load
    mov    x0, [x+0*8]
    mov    x1, [x+1*8]
    mov    x2, [x+2*8]
    mov    x3, [x+3*8]
    mov    x4, [x+4*8]
    
    xor    i, i
permute_loop:
    push   i
    ; **************************
    ; addition of round constant
    ; **************************    
    ; x2 ^= ((0xfull - i) << 4) | i;
    push  15
    pop   rax
    sub   rax, [rsp]
    shl   rax, 4
    or    rax, [rsp]
    xor   x2, rax    
    ; **********************
    ; substitution layer
    ; **********************
    ; x0 ^= x4;    x4 ^= x3;    x2 ^= x1;
    xor    x0, x4
    xor    x4, x3
    xor    x2, x1
    ; t0  = x0;    t1  = x1;    t2  = x2;    t3  =  x3;    t4  = x4;
    mov    t0, x0
    mov    t1, x1
    mov    t2, x2
    mov    t3, x3
    mov    t4, x4
    ; t0  = ~t0;   t1  = ~t1;   t2  = ~t2;   t3  = ~t3;    t4  = ~t4;
    not    t0
    not    t1
    not    t2
    not    t3
    not    t4
    ; t0 &= x1;    t1 &= x2;    t2 &= x3;    t3 &=  x4;    t4 &= x0;
    and    t0, x1
    and    t1, x2
    and    t2, x3
    and    t3, x4
    and    t4, x0
    ; x0 ^= t1;    x1 ^= t2;    x2 ^= t3;    x3 ^=  t4;    x4 ^= t0;
    xor    x0, t1
    xor    x1, t2
    xor    x2, t3
    xor    x3, t4
    xor    x4, t0
    ; x1 ^= x0;    x0 ^= x4;    x3 ^= x2;    x2  = ~x2;
    xor    x1, x0  
    xor    x0, x4  
    xor    x3, x2  
    not    x2    
    ; **********************
    ; linear diffusion layer
    ; **********************
    ; x0 ^= ROTR(x0, 19) ^ ROTR(x0, 28);
    mov    t0, x0
    ror    t0, 19
    xor    x0, t0
    ror    t0, 28-19
    xor    x0, t0
    
    ; x1 ^= ROTR(x1, 61) ^ ROTR(x1, 39);
    mov    t0, x1
    ror    t0, 39
    xor    x1, t0
    ror    t0, 61-39
    xor    x1, t0

    ; x2 ^= ROTR(x2,  1) ^ ROTR(x2,  6);
    mov    t0, x2
    ror    t0, 1
    xor    x2, t0
    ror    t0, 6-1
    xor    x2, t0
    
    ; x3 ^= ROTR(x3, 10) ^ ROTR(x3, 17);
    mov    t0, x3
    ror    t0, 10
    xor    x3, t0
    ror    t0, 17-10
    xor    x3, t0
    
    ; x4 ^= ROTR(x4,  7) ^ ROTR(x4, 41);
    mov    t0, x4
    ror    t0, 7
    xor    x4, t0
    ror    t0, 41-7
    xor    x4, t0
    
    pop    i
    inc    i
    cmp    i, [rsp]
    jnz    permute_loop  
   
    ; save
    mov    [x+0*8], x0    
    mov    [x+1*8], x1  
    mov    [x+2*8], x2  
    mov    [x+3*8], x3  
    mov    [x+4*8], x4  

    pop    r
    
    pop    r12
    pop    rbp
    pop    rdi
    pop    rbx
    pop    rsi
    ret

Keccak Permutation Function

Introduction

Permutation functions are incredibly useful, and Keccak, which is used to implement the SHA-3 standard, is like a Swiss Army knife of cryptographic primitives.

Designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche, Keccak can be used to construct a stream cipher, a cryptographic hash function, a pseudorandom number generator (PRNG), a message authentication code (MAC), or an authenticated encryption (AE) scheme.

Although SHA-3 (FIPS 202) was covered in a previous post, the underlying permutation function (Keccak-p[1600, 24]) was for the most part overlooked.

A lightweight implementation of Keccak-p[800, 22] for 32-bit architectures will be shown here, specifically the x86 CPU.

Keccak Parameters

F     Target Architecture  Width      Rounds
p200  8-bit                200 bits   18
p400  16-bit               400 bits   20
p800  32-bit               800 bits   22
p1600 64-bit               1600 bits  24

Keccak Modules

Module Description
Theta Treats the internal state as a 5-by-5 array of n-bit lanes. Computes the parity of each column and combines the parities with an exclusive-or (XOR) operator. Then it XORs into every lane of each column the parity of the column to its left and the parity of the column to its right rotated left by one bit.
Rho Rotates each element by a triangular number.
Pi Permutes the elements.
Chi Adds a non-linear aspect to the permutation round. Combines the row elements using only three bitwise operators: AND, NOT, and XOR.
Iota Breaks up any symmetry caused by the other modules. This is done by XORing one of the array elements with a round constant. Without iota, the round mapping would be symmetric and every round would be the same.

Keccak-p800 in C

// round constant function
// Primitive polynomial over GF(2): x^8+x^6+x^5+x^4+1
uint32_t rc (uint8_t *LFSR) {
    uint32_t c; 
    int8_t   t;
    uint8_t  i;

    c = 0;
    t = *LFSR;
    
    for (i=1; i<128; i += i) 
    {
      if (t & 1) {
        // if shift value is < 32
        if ((i-1) < 32) {
          c ^= 1UL << (i - 1);
        }
      }
      t = (t & 0x80) ? (t << 1) ^ 0x71 : t << 1;
    }
    *LFSR = (uint8_t)t;
    return c;
}

void k800_permute (void *state) {
    uint32_t i, j, r, t, u, bc[5];
    uint8_t  lfsr=1;
    uint32_t *st=(uint32_t*)state;
    uint8_t  *p, *m;
    int      rnd;
    
    uint32_t piln[6]=
    { 0x110b070a, 0x10050312, 0x04181508, 
      0x0d13170f, 0x0e14020c, 0x01060916 };

    uint32_t m5[3]=
    { 0x03020100, 0x02010004, 0x00000403 };
    
    p = (uint8_t*)piln;
    m = (uint8_t*)m5;
    
    for (rnd=0; rnd<22; rnd++) {
      // Theta
      for (i=0; i<5; i++) {     
        t  = st[i]; 
        t ^= st[i +  5]; 
        t ^= st[i + 10]; 
        t ^= st[i + 15]; 
        t ^= st[i + 20];
        bc[i] = t;
      }
      for (i=0; i<5; i++) {
        t  = bc[m[(i + 4)]]; 
        t ^= ROTL32(bc[m[(i + 1)]], 1);
        for (j=i; j<25; j+=5) {
          st[j] ^= t;
        }
      }
      // Rho + Pi
      u = st[1];      
      for (i=0, r=0; i<24; i++) {
        r = r + i + 1;       
        u  = ROTL32(u, r);
        XCHG(st[p[i]], u);
        bc[0] = u;
      }
      // Chi
      for (i=0; i<25; i+=5) {
        memcpy(&bc, &st[i], 5*4);      
        for (j=0; j<5; j++) {
          t  = ~bc[m[(j + 1)]];
          t &=  bc[m[(j + 2)]];        
          st[j + i] ^= t;
        }
      }
      // Iota
      st[0] ^= rc(&lfsr);
    }
}
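
As a quick sanity check (a standalone sketch, not from the original code), the round-constant generator can be driven on its own, and the permutation applied to a zeroed 100-byte state.

#include <stdint.h>
#include <stdio.h>

uint32_t rc (uint8_t *LFSR);      // defined above
void k800_permute (void *state);  // defined above

int main(void) {
    uint8_t state[100] = {0};     // 25 lanes x 32 bits = 800 bits
    uint8_t lfsr = 1;
    int     i;

    // the first two constants should be 0x00000001 and 0x00008082
    for (i=0; i<22; i++) {
      printf ("0x%08x\n", rc(&lfsr));
    }
    k800_permute(state);          // permute the all-zero state in place
    return 0;
}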

32-bit x86 Assembly

The following code includes optimizations from Peter Ferrie.

bits   32
  
struc kws_t
  bc1  resd 1 ; edi
  bc2  resd 1 ; esi
  bc3  resd 1 ; ebp
  bc4  resd 1 ; esp
  bc5  resd 1 ; ebx
  lfsr resd 1 ; edx
  ; ecx
  ; eax
  .size:
endstruc
  
    %ifndef BIN
      global k800_permutex
      global _k800_permutex
    %endif
    
; void k800_permutex(void *state);    
k800_permutex:
_k800_permutex:
    pushad
    mov    esi, [esp+32+4]      ; esi = st
    call   k800_l0
    ; modulo 5    
    dd     003020100h, 002010004h, 000000403h
    ; rho pi
    dd     0110b070ah, 010050312h, 004181508h 
    dd     00d13170fh, 00e14020ch, 001060916h
k800_l0:
    pop    ebx                  ; m + p
    push   22
    pop    eax
    cdq
    inc    edx                  ; lfsr = 1    
    pushad                      ; create local space
    mov    edi, esp             ; edi = bc   
k800_l1:    
    push   eax    
    push   5 
    pop    ecx    
    pushad
theta_l0:
    ; Theta
    lodsd                       ; t  = st[i     ];  
    xor    eax, [esi+ 5*4-4]    ; t ^= st[i +  5];
    xor    eax, [esi+10*4-4]    ; t ^= st[i + 10];
    xor    eax, [esi+15*4-4]    ; t ^= st[i + 15];
    xor    eax, [esi+20*4-4]    ; t ^= st[i + 20];
    stosd                       ; bc[i] = t;
    loop   theta_l0        
    popad    
    xor    eax, eax    
theta_l1:
    movzx  ebp, byte[ebx+eax+4] ; ebp = m[(i + 4)];
    mov    ebp, [edi+ebp*4]     ; t   = bc[m[(i + 4)]];    
    movzx  edx, byte[ebx+eax+1] ; edx = m[(i + 1)];
    mov    edx, [edi+edx*4]     ; edx = bc[m[(i + 1)]];
    rol    edx, 1               ; t  ^= ROTL32(edx, 1);
    xor    ebp, edx
theta_l2:
    xor    [esi+eax*4], ebp     ; st[j] ^= t;
    add    al, 5                ; j+=5 
    cmp    al, 25               ; j<25
    jb     theta_l2    
    sub    al, 24               ; i=i+1
    loop   theta_l1             ; i<5 
    ; *************************************
    ; Rho Pi
    ; *************************************
    mov    ebp, [esi+1*4]       ; t = st[1];
rho_l0:
    lea    ecx, [ecx+eax-4]     ; r = r + i + 1;
    rol    ebp, cl              ; t = ROTL32(t, r); 
    movzx  edx, byte[ebx+eax+7] ; edx = p[i];
    xchg   [esi+edx*4], ebp     ; XCHG(st[p[i]], t);
    inc    eax                  ; i++
    cmp    al, 24+5             ; i<24
    jnz    rho_l0               ; 
    ; *************************************
    ; Chi
    ; *************************************
    xor    ecx, ecx             ; i = 0   
chi_l0:    
    pushad
    ; memcpy(&bc, &st[i], 5*4);
    lea    esi, [esi+ecx*4]     ; esi = &st[i];
    mov    cl, 5
    rep    movsd
    popad
    xor    eax, eax
chi_l1:
    movzx  ebp, byte[ebx+eax+1]
    movzx  edx, byte[ebx+eax+2]
    mov    ebp, [edi+ebp*4]     ; t = ~bc[m[(i + 1)]] 
    not    ebp            
    and    ebp, [edi+edx*4]
    lea    edx, [eax+ecx]       ; edx = j + i    
    xor    [esi+edx*4], ebp     ; st[j + i] ^= t;  
    inc    eax                  ; j++
    cmp    al, 5                ; j<5
    jnz    chi_l1        
    add    cl, al               ; i+=5;
    cmp    cl, 25               ; i<25
    jb     chi_l0
    ; Iota
    mov    al, [esp+kws_t+lfsr+4]; al = t = *LFSR
    mov    dl, 1                ; i = 1
    xor    ebp, ebp
iota_l0:    
    test   al, 1                ; t & 1
    je     iota_l1    
    lea    ecx, [edx-1]         ; ecx = (i - 1)
    cmp    cl, 32               ; skip if (ecx >= 32)
    jae    iota_l1    
    btc    ebp, ecx             ; c ^= 1UL << (i - 1)
iota_l1:    
    add    al, al               ; t << 1
    sbb    ah, ah               ; ah = (t < 0) ? 0x00 : 0xFF
    and    ah, 0x71             ; ah = (ah == 0xFF) ? 0x71 : 0x00  
    xor    al, ah  
    add    dl, dl               ; i += i
    jns    iota_l0              ; while (i != 128)
    mov    [esp+kws_t+lfsr+4], al; save t
    xor    [esi], ebp           ; st[0] ^= rc(&lfsr);      
    pop    eax
    dec    eax
    jnz    k800_l1              ; rnds<22    
    popad                       ; release bc
    popad                       ; restore registers 
    ret

Summary

The x86 assembly generated by MSVC is approximately 342 bytes, while the hand-written x86 assembly is currently 236 bytes.


Gimli: a cross-platform permutation function

Introduction

Gimli, named after the Lord Of The Rings character, is a 384-bit permutation function, designed specifically to be lightweight, high performance, and secure across multiple architectures.

It was designed by Daniel J. Bernstein, Stefan Kölbl, Stefan Lucks, Pedro Maat Costa Massolino, Florian Mendel, Kashif Nawaz, Tobias Schneider, Peter Schwabe, François-Xavier Standaert, Yosuke Todo, and Benoît Viguier.

Gimli can easily be used to build high-security block ciphers, tweakable block ciphers, stream ciphers, message-authentication codes, authenticated ciphers and hash functions.

What follows here is a compact implementation in C and x86 assembly. For performance enhanced implementations, please look at reference code written by Frank Denis here.

Frank was so happy with this function that the LibHydrogen library is built atop Gimli.

However, as Mike Hamburg points out in Cryptanalysis of 22 1/2 rounds of Gimli, there are weaknesses in how the linear layer is applied, which may lead to an update of this function in the future.

The Gimli team responded to Hamburg in Statement regarding “Cryptanalysis of 22 1/2 rounds of Gimli”

Mike responded further here.

C code

The only thing to mention here is that the cyclic left shift by 24 bits has been replaced with a cyclic right shift by 8 bits. Same outcome, different direction.
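
A quick way to convince yourself of that equivalence (a standalone sketch; the rotate macros below are the usual C idioms assumed throughout this post):

#include <stdint.h>
#include <assert.h>

#define ROTL32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
#define ROTR32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))

int main(void) {
    uint32_t x = 0x12345678;

    // rotating left by 24 bits and right by 8 bits move every
    // bit to the same place, since 24 + 8 == 32
    assert(ROTL32(x, 24) == ROTR32(x, 8));
    return 0;
}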

void gimli(void *state)
{
  int      r, j;
  uint32_t t, x, y, z;
  uint32_t *s=(uint32_t*)state;
  
  for (r=0x9e377918; r!=0x9e377900; r--) {
    for (j=0; j<4; j++) {
      x = ROTR32(s[    j], 8);
      y = ROTL32(s[4 + j], 9);
      z =        s[8 + j];

      s[8 + j] = x ^ (z << 1) ^ ((y & z) << 2);
      s[4 + j] = y ^ x        ^ ((x | z) << 1);
      s[j]     = z ^ y        ^ ((x & y) << 3);
    }

    t = r & 3;
    
    // small swap
    if (t == 0) {
      XCHG(s[0], s[1]);
      XCHG(s[2], s[3]);
      
      // add constant      
      s[0] ^= r;
    }   
    // big swap
    if (t == 2) {
      XCHG(s[0], s[2]);
      XCHG(s[1], s[3]);
    }
  }
}

x86 Assembly

The following code includes optimizations from Peter Ferrie. It’s currently 112 bytes.

%define j  ebx
%define x  eax
%define y  ebp
%define z  edx

%define s  esi

%define t0 ebp 
%define t1 edi

%define r  ecx

%define s0 eax
%define s1 ebx
%define s2 ebp
%define s3 esi

gimlix:
_gimlix:
    pushad  
    mov    r, 0x9e377900 + 24
g_l0:
    mov    s, [esp+32+4]        ; esi = s 
    push   s
    push   4
    pop    j
g_l1:
    ; x = ROTR32(s[    j], 8);
    lodsd  ;mov    x, [s]  
    ror    x, 8  
    
    ; y = ROTL32(s[4 + j], 9);
    mov    y, [s + (4*3)]   
    rol    y, 9
    
    ; z = s[8 + j];
    mov    z, [s + (4*7)]
    
    ; s[8 + j] = x ^ (z << 1) ^ ((y & z) << 2);
    push   x
    push   y
    lea    t1, [z + z]
    and    y, z
    shl    y, 2
    xor    t1, y
    xor    x, t1    
    mov    [s + (7*4)], x
    pop    y
    pop    x
    
    ; s[4 + j] = y ^ x        ^ ((x | z) << 1);
    push   x
    push   y
    xor    y, x
    or     x, z
    shl    x, 1
    xor    y, x
    mov    [s + (3*4)], y
    pop    y
    pop    x
    
    ; s[j]     = z ^ y        ^ ((x & y) << 3);    
    xor    z, y
    and    x, y
    shl    x, 3
    xor    z, x
    push   z
    
    dec    j
    jnz    g_l1

    pop    s3
    pop    s2
    pop    s1
    pop    s0

    pop    t1

    mov    dl, cl
    and    dl, 3
    jnz    g_l2
    
    ; XCHG (s[0], s[1]);
    xchg   s0, s1
    ; XCHG (s[2], s[3]);
    xchg   s2, s3
    ; s[0] ^= r;
    xor    s0, r    
g_l2:
    cmp    dl, 2
    jnz    g_l3  
    ; XCHG (s[0], s[2]);
    xchg   s0, s2
    ; XCHG (s[1], s[3]);
    xchg   s1, s3
g_l3:
    stosd
    xchg   eax, s1
    stosd
    xchg   eax, s2
    stosd
    xchg   eax, s3
    stosd    
    dec    cl   
    jnz    g_l0    
    popad
    ret

See sources here
