Chaskey-LTS Block Cipher

Introduction

Chaskey is a 128-bit block cipher with support for 128-bit keys. It was designed by Nicky Mouha, Bart Mennink, Anthony Van Herrewege, Dai Watanabe, Bart Preneel and Ingrid Verbauwhede. The main permutation is derived from SipHash, a fast short-input Pseudo-Random-Function (PRF) designed and published in 2012 by Daniel Bernstein and Jean-Philippe Aumasson. Chaskey uses an Even-Mansour construction: Shimon Even and Yishay Mansour published a paper in 1997 titled "A Construction of a Cipher From a Single Pseudorandom Permutation" that suggested an incredibly simple but provably secure design for a cryptographic algorithm. Because only add-rotate-xor (ARX) instructions are used, it’s very suitable for many architectures.

even_mansour

The key is mixed with plaintext before encryption and after the application of permutation function F.

F function

The permutation uses 16 rounds of ADD/ROL/XOR (ARX) instructions for encryption. Decryption of ciphertext is simply reversing the process with SUB/ROR/XOR.

perm

Full function

This will perform encryption or decryption depending on the enc parameter.

/*
 * Encrypt or decrypt one 128-bit block in place with Chaskey-LTS.
 *
 *   enc : CHASKEY_ENCRYPT runs the 16 forward (ADD/ROL/XOR) rounds;
 *         any other value runs the 16 inverse (SUB/ROR/XOR) rounds.
 *   key : 128-bit key, accessed as four 32-bit words.
 *   buf : 128-bit block, accessed as four 32-bit words and
 *         overwritten with the result.
 *
 * The key is XORed into the block both before and after the
 * permutation (Even-Mansour construction).
 */
void chaskey(int enc, void *key, void *buf)
{
   uint32_t *blk = (uint32_t*)buf;
   uint32_t *kw  = (uint32_t*)key;
   uint32_t a, b, c, d;
   int      round;

   // pre-whitening: mix the key into the block
   blk[0] ^= kw[0];
   blk[1] ^= kw[1];
   blk[2] ^= kw[2];
   blk[3] ^= kw[3];

   // work on local copies of the four state words
   a = blk[0];
   b = blk[1];
   c = blk[2];
   d = blk[3];

   if (enc == CHASKEY_ENCRYPT) {
     // forward permutation
     for (round = 16; round > 0; round--) {
       a += b;
       b  = ROTL32(b, 5);
       b ^= a;
       a  = ROTL32(a, 16);
       c += d;
       d  = ROTL32(d, 8);
       d ^= c;
       a += d;
       d  = ROTL32(d, 13);
       d ^= a;
       c += b;
       b  = ROTL32(b, 7);
       b ^= c;
       c  = ROTL32(c, 16);
     }
   } else {
     // inverse permutation: the forward steps undone in reverse order
     for (round = 16; round > 0; round--) {
       c  = ROTR32(c, 16);
       b ^= c;
       b  = ROTR32(b, 7);
       c -= b;
       d ^= a;
       d  = ROTR32(d, 13);
       a -= d;
       d ^= c;
       d  = ROTR32(d, 8);
       c -= d;
       a  = ROTR32(a, 16);
       b ^= a;
       b  = ROTR32(b, 5);
       a -= b;
     }
   }

   // write the permuted state back before the key is read again
   blk[0] = a;
   blk[1] = b;
   blk[2] = c;
   blk[3] = d;

   // post-whitening: mix the key in once more
   blk[0] ^= kw[0];
   blk[1] ^= kw[1];
   blk[2] ^= kw[2];
   blk[3] ^= kw[3];
}

x86 assembly

The assembly is straightforward. We load buffer into ESI, key into EDI and enc into ECX. Four 32-bit registers are loaded with the 128-bit data, and pre-whitening is applied with the 128-bit key. We test ECX for zero, then save the flag status with PUSHFD. This frees ECX for use as a loop counter, which is set to 16 (for LTS). After each round of the permutation, the flag status is restored with POPFD, and we keep looping until ECX is zero. Finally, we apply post-whitening using the 128-bit key, save the result and return.

; -----------------------------------------------
; Chaskey-LTS block cipher, 32-bit x86 (NASM syntax),
; encryption and decryption.
;
; Arguments are read from the stack in this order: enc, key, buf.
;   enc == 0 selects the decrypt rounds, otherwise the encrypt
;   rounds are used. The four dwords at buf are transformed in
;   place using the four dwords at key.
;
; All general-purpose registers are saved and restored with
; pushad/popad.
;
; NOTE(review): the round counter is set with "mov cl, 16", which
; only writes the low byte of ecx. This looks like it assumes the
; upper 24 bits of enc are zero (e.g. enc is 0 or 1) - confirm
; against callers.
; -----------------------------------------------

; the four 32-bit state words
%define v0 eax
%define v1 ebx
%define v2 edx
%define v3 ebp

chaskey:
_chaskey:
    pushad
    ; pushad stored 32 bytes, so [esp+32] is the return address
    ; and the first argument is at [esp+32+4]; walk the arguments
    ; with lodsd
    lea     esi, [esp+32+4]
    lodsd
    xchg    ecx, eax          ; ecx = enc
    lodsd
    xchg    edi, eax          ; edi = key
    lodsd
    xchg    eax, esi          ; esi = buf
    push    esi               ; keep buf for the final store
    ; load buf
    ; eax (v0) is also the lodsd destination, so the first word is
    ; parked in v3 and swapped back into eax by the last xchg
    lodsd
    xchg    eax, v3
    lodsd
    xchg    eax, v1
    lodsd
    xchg    eax, v2
    lodsd
    xchg    eax, v3
    ; pre-whiten
    xor     v0, [edi   ]
    xor     v1, [edi+ 4]
    xor     v2, [edi+ 8]
    xor     v3, [edi+12]
    ; ZF = (enc == 0); mov leaves the flags untouched, so ZF is
    ; still valid at ck_l0 after ecx is reused as the round counter
    test    ecx, ecx
    mov     cl, 16
ck_l0:
    ; the round instructions modify the flags, so the enc test
    ; result is saved here and restored at ck_l2 for the next pass
    pushfd
    jz      ck_l1
    ; encrypt
    add     v0, v1
    rol     v1, 5
    xor     v1, v0
    rol     v0, 16
    add     v2, v3
    rol     v3, 8
    xor     v3, v2
    add     v0, v3
    rol     v3, 13
    xor     v3, v0
    add     v2, v1
    rol     v1, 7
    xor     v1, v2
    rol     v2, 16
    jmp     ck_l2
ck_l1:
    ; decrypt: the encrypt steps undone in reverse order
    ror     v2, 16
    xor     v1, v2
    ror     v1, 7
    sub     v2, v1
    xor     v3, v0
    ror     v3, 13
    sub     v0, v3
    xor     v3, v2
    ror     v3, 8
    sub     v2, v3
    ror     v0, 16
    xor     v1, v0
    ror     v1, 5
    sub     v0, v1
ck_l2:
    popfd                     ; restore ZF from the enc test
    loop    ck_l0             ; ecx--, repeat while ecx != 0
ck_l3:
    ; post-whiten
    xor     v0, [edi   ]
    xor     v1, [edi+ 4]
    xor     v2, [edi+ 8]
    xor     v3, [edi+12]
    pop     edi               ; edi = buf (pushed before the load)
    ; save buf
    ; stosd stores eax and advances edi; each xchg brings the next
    ; state word into eax
    stosd
    xchg    eax, v1
    stosd
    xchg    eax, v2
    stosd
    xchg    eax, v3
    stosd
    popad
    ret

Compact code

/* R: rotate the 32-bit value v right by n bits.
   F: repeat the following statement n times using the local counter i. */
#define R(v,n)(((v)>>(n))|((v)<<(32-(n))))
#define F(n)for(i=0;i<n;i++)

/*
 * Chaskey-LTS encryption of one 128-bit block, in place.
 *
 *   mk : 128-bit key, accessed as four 32-bit words
 *   p  : 128-bit block, accessed as four 32-bit words and
 *        overwritten with the ciphertext
 */
void chaskey(void*mk,void*p){
    unsigned int i, *blk=p, *key=mk;
    unsigned int a, b, c, d;

    /* pre-whitening */
    F(4) blk[i] ^= key[i];

    a = blk[0];
    b = blk[1];
    c = blk[2];
    d = blk[3];

    /* 16 rounds; R() rotates right, so 27/24/19/25 are left
       rotations by 5/8/13/7 */
    for (i = 16; i != 0; i--) {
        a += b;
        b  = R(b, 27) ^ a;
        a  = R(a, 16);
        c += d;
        d  = R(d, 24) ^ c;
        a += d;
        d  = R(d, 19) ^ a;
        c += b;
        b  = R(b, 25) ^ c;
        c  = R(c, 16);
    }

    /* store the state before the key is read again */
    blk[0] = a;
    blk[1] = b;
    blk[2] = c;
    blk[3] = d;

    /* post-whitening */
    F(4) blk[i] ^= key[i];
}

x86 assembly

; -----------------------------------------------
; Chaskey-LTS block cipher in x86 assembly (encryption only)
;
; void chaskey(void *mk, void *p);
;
;   mk : 128-bit key   (four dwords), only read
;   p  : 128-bit block (four dwords), encrypted in place
;
; size: 89 bytes
;
; global calls use cdecl convention
; all general-purpose registers are preserved (pushad/popad)
;
; -----------------------------------------------

    bits 32

%ifndef BIN
  global chaskey
  global _chaskey
%endif

; the four 32-bit state words
%define v0 eax
%define v1 ebx
%define v2 edx
%define v3 ebp

chaskey:
_chaskey:
    pushad
    ; pushad stored 32 bytes, so [esp+32] is the return address,
    ; [esp+32+4] is the first argument (mk) and [esp+32+8] the
    ; second (p)
    mov     edi, [esp+32+4]       ; edi = mk (key)
    mov     esi, [esp+32+8]       ; esi = p  (block)
    push    esi                   ; keep p for the final store
    ; load buf
    ; eax (v0) is also the lodsd destination, so the first word is
    ; parked in v3 and swapped back into eax by the last xchg
    lodsd
    xchg    eax, v3
    lodsd
    xchg    eax, v1
    lodsd
    xchg    eax, v2
    lodsd
    xchg    eax, v3
    ; pre-whiten
    xor     v0, [edi   ]
    xor     v1, [edi+ 4]
    xor     v2, [edi+ 8]
    xor     v3, [edi+12]
    ; 16 rounds (push/pop is shorter than mov ecx, 16)
    push    16
    pop     ecx
ck_l0:
    ; apply permutation
    add     v0, v1
    rol     v1, 5
    xor     v1, v0
    rol     v0, 16
    add     v2, v3
    rol     v3, 8
    xor     v3, v2
    add     v0, v3
    rol     v3, 13
    xor     v3, v0
    add     v2, v1
    rol     v1, 7
    xor     v1, v2
    rol     v2, 16
    loop    ck_l0                 ; ecx--, repeat while ecx != 0
    ; post-whiten
    xor     v0, [edi   ]
    xor     v1, [edi+ 4]
    xor     v2, [edi+ 8]
    xor     v3, [edi+12]
    pop     edi                   ; edi = p (pushed before the load)
    ; save buf
    ; stosd stores eax and advances edi; each xchg brings the next
    ; state word into eax
    stosd
    xchg    eax, v1
    stosd
    xchg    eax, v2
    stosd
    xchg    eax, v3
    stosd
    popad
    ret

ARM32 / AArch32 assembly

key  .req r0                  // pointer to the 128-bit key
blk  .req r1                  // pointer to the 128-bit block

kw0  .req r2                  // key words
kw1  .req r3
kw2  .req r4
kw3  .req r5

st0  .req r6                  // state words
st1  .req r7
st2  .req r8
st3  .req r9

cnt  .req r10                 // remaining rounds

  // chaskey(void *key, void *data);
  // Encrypts the four 32-bit words at data in place using the
  // four 32-bit words at key.
chaskey:

  // save registers
  push   {r0-r12,lr}

  // fetch the key words and the block words
  ldm    key, {kw0, kw1, kw2, kw3}
  ldm    blk, {st0, st1, st2, st3}

  // pre-whitening: state ^= key
  eor    st0, st0, kw0
  eor    st1, st1, kw1
  eor    st2, st2, kw2
  eor    st3, st3, kw3

  // 16 rounds; a right rotation by 32-n stands in for ROTL32(.., n)
  mov    cnt, #16
chaskey_round:
  add    st0, st0, st1           // st0 += st1
  eor    st1, st0, st1, ror #27  // st1  = ROTL32(st1,  5) ^ st0
  add    st2, st2, st3           // st2 += st3
  eor    st3, st2, st3, ror #24  // st3  = ROTL32(st3,  8) ^ st2
  add    st2, st2, st1           // st2 += st1
  add    st0, st3, st0, ror #16  // st0  = ROTL32(st0, 16) + st3
  eor    st3, st0, st3, ror #19  // st3  = ROTL32(st3, 13) ^ st0
  eor    st1, st2, st1, ror #25  // st1  = ROTL32(st1,  7) ^ st2
  mov    st2, st2, ror #16       // st2  = ROTL32(st2, 16)
  subs   cnt, cnt, #1            // one round done
  bne    chaskey_round           // loop until cnt reaches zero

  // post-whitening: state ^= key
  eor    st0, st0, kw0
  eor    st1, st1, kw1
  eor    st2, st2, kw2
  eor    st3, st3, kw3

  // write the result back over the input block
  stm    blk, {st0, st1, st2, st3}

  // restore registers and return (saved lr is popped into pc)
  pop    {r0-r12,pc}

ARM64 / AArch64 assembly

// CHASKEY in ARM64 assembly
// 112 bytes

  .arch armv8-a
  .text

  .global chaskey

  // register roles
  kptr .req x0                 // pointer to the 128-bit key
  dptr .req x1                 // pointer to the 128-bit block
  kw0  .req w2                 // key words
  kw1  .req w3
  kw2  .req w4
  kw3  .req w5
  st0  .req w6                 // state words
  st1  .req w7
  st2  .req w8
  st3  .req w9
  cnt  .req w10                // remaining rounds

  // chaskey(void*mk, void*data);
  // Encrypts the four 32-bit words at data in place using the
  // four 32-bit words at mk.
chaskey:
    // fetch the key words
    ldp    kw0, kw1, [kptr]
    ldp    kw2, kw3, [kptr, #8]

    // fetch the block words
    ldp    st0, st1, [dptr]
    ldp    st2, st3, [dptr, #8]

    // pre-whitening: state ^= key
    eor    st0, st0, kw0
    eor    st1, st1, kw1
    eor    st2, st2, kw2
    eor    st3, st3, kw3

    // 16 rounds; R(v,n) below means rotate right by n
    mov    cnt, #16
.Lround:
    add    st0, st0, st1           // st0 += st1
    eor    st1, st0, st1, ror #27  // st1  = R(st1,27) ^ st0
    add    st2, st2, st3           // st2 += st3
    eor    st3, st2, st3, ror #24  // st3  = R(st3,24) ^ st2
    add    st2, st2, st1           // st2 += st1
    ror    st0, st0, #16           // rotate first, then add
    add    st0, st3, st0           // st0  = R(st0,16) + st3
    eor    st3, st0, st3, ror #19  // st3  = R(st3,19) ^ st0
    eor    st1, st2, st1, ror #25  // st1  = R(st1,25) ^ st2
    ror    st2, st2, #16           // st2  = R(st2,16)
    subs   cnt, cnt, #1            // one round done
    bne    .Lround                 // loop until cnt reaches zero

    // post-whitening: state ^= key
    eor    st0, st0, kw0
    eor    st1, st1, kw1
    eor    st2, st2, kw2
    eor    st3, st3, kw3

    // write the result back over the input block
    stp    st0, st1, [dptr]
    stp    st2, st3, [dptr, #8]
    ret

Sources here.

Advertisements
This entry was posted in assembly, cryptography, encryption, programming, security and tagged , , . Bookmark the permalink.

2 Responses to Chaskey-LTS Block Cipher

  1. Pingback: XTEA Block Cipher | x86 crypto

  2. Pingback: Shellcode: Encryption Algorithms in ARM Assembly | modexp

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s