### Introduction

Chaskey is a 128-bit block cipher with support for 128-bit keys. It was designed by Nicky Mouha, Bart Mennink, Anthony Van Herrewege, Dai Watanabe, Bart Preneel and Ingrid Verbauwhede. The main permutation is derived from SipHash, a fast short-input Pseudo-Random Function (PRF) designed and published in 2012 by Daniel Bernstein and Jean-Philippe Aumasson. Chaskey uses an Even-Mansour construction: in 1997, Shimon Even and Yishay Mansour published a paper titled “A Construction of a Cipher from a Single Pseudorandom Permutation” that suggested an incredibly simple but provably secure design for a cryptographic algorithm. Because only add-rotate-xor (ARX) instructions are used, Chaskey is well suited to many architectures.

The key is XORed with the plaintext before the permutation function F is applied (pre-whitening), and XORed with the result again afterwards (post-whitening).

### F function

The permutation uses 16 rounds of ADD/ROL/XOR (ARX) instructions for encryption. Decryption of ciphertext is simply reversing the process with SUB/ROR/XOR.

### Full function

This function performs encryption or decryption, depending on the enc parameter.

```
// Encrypt or decrypt one 128-bit block in place with Chaskey-LTS.
//
// enc : non-zero selects encryption, zero selects decryption
// key : 128-bit key, accessed as four 32-bit words
// buf : 128-bit block, accessed and overwritten as four 32-bit words
//
// The key is XORed into the block before and after the 16-round
// permutation. ROTL32/ROTR32 are 32-bit rotate macros defined elsewhere.
void chaskey(int enc, void *key, void *buf)
{
    int      i;
    uint32_t *v=(uint32_t*)buf;
    uint32_t *k=(uint32_t*)key;

    // pre-whiten
    for (i=0; i<4; i++) {
        v[i] ^= k[i];
    }

    // apply permutation function
    for (i=0; i<16; i++) {
        if (enc) {
            // forward round: ADD / ROL / XOR
            v[0] += v[1];
            v[1]=ROTL32(v[1], 5);
            v[1] ^= v[0];
            v[0]=ROTL32(v[0],16);
            v[2] += v[3];
            v[3]=ROTL32(v[3], 8);
            v[3] ^= v[2];
            v[0] += v[3];
            v[3]=ROTL32(v[3],13);
            v[3] ^= v[0];
            v[2] += v[1];
            v[1]=ROTL32(v[1], 7);
            v[1] ^= v[2];
            v[2]=ROTL32(v[2],16);
        } else {
            // inverse round: the forward steps undone in reverse order
            // with SUB / ROR / XOR
            v[2]=ROTR32(v[2],16);
            v[1] ^= v[2];
            v[1]=ROTR32(v[1], 7);
            v[2] -= v[1];
            v[3] ^= v[0];
            v[3]=ROTR32(v[3],13);
            v[0] -= v[3];
            v[3] ^= v[2];
            v[3]=ROTR32(v[3], 8);
            v[2] -= v[3];
            v[0]=ROTR32(v[0],16);
            v[1] ^= v[0];
            v[1]=ROTR32(v[1], 5);
            v[0] -= v[1];
        }
    }
    // post-whiten
    for (i=0; i<4; i++) {
        v[i] ^= k[i];
    }
}
```

### x86 assembly

The assembly is straightforward. We load the buffer pointer into ESI, the key pointer into EDI and enc into ECX. Four 32-bit registers are loaded with the 128-bit block, and pre-whitening is applied with the 128-bit key. We then test ECX for zero and save the flag status with PUSHFD. This frees ECX for use as a loop counter, which is set to 16 (the number of rounds for LTS). After each round of the permutation, the flag status is restored with POPFD, and we keep looping until ECX is zero. Finally, post-whitening is applied using the 128-bit key, and the result is saved before returning.

```
; -----------------------------------------------
; Chaskey-LTS block cipher, x86 (32-bit), NASM syntax
;
; Stack arguments on entry:
;   [esp+ 4] enc : non-zero = encrypt, zero = decrypt
;   [esp+ 8] key : pointer to 128-bit key
;   [esp+12] buf : pointer to 128-bit block, processed in place
;
; All general purpose registers are preserved (PUSHAD/POPAD).
; -----------------------------------------------

    bits 32

%define v0 eax
%define v1 ebx
%define v2 edx
%define v3 ebp

    global  chaskey

chaskey:
    pushad                    ; 32 bytes of saved registers, so
                              ; arguments now start at esp+32+4
    lea     esi, [esp+32+4]
    lodsd
    xchg    ecx, eax          ; ecx = enc
    lodsd
    xchg    edi, eax          ; edi = key
    lodsd
    xchg    eax, esi          ; esi = buf
    push    esi               ; keep buf for the final store
    ; load the block; the first word is parked in ebp while
    ; eax is busy as the LODSD destination
    lodsd
    xchg    eax, v3
    lodsd
    xchg    eax, v1
    lodsd
    xchg    eax, v2
    lodsd
    xchg    eax, v3           ; v0 = first word, v3 = fourth word
    ; pre-whiten
    xor     v0, [edi   ]
    xor     v1, [edi+ 4]
    xor     v2, [edi+ 8]
    xor     v3, [edi+12]
    test    ecx, ecx          ; ZF = 1 selects decryption
    ; ecx = 16 rounds. PUSH/POP leaves the flags from TEST intact and
    ; replaces all of ecx, so any non-zero enc value works.
    push    16
    pop     ecx
ck_l0:
    pushfd                    ; the round clobbers the flags; keep ZF
    jz      ck_l1
    ; encrypt
    add     v0, v1
    rol     v1, 5
    xor     v1, v0
    rol     v0, 16
    add     v2, v3
    rol     v3, 8
    xor     v3, v2
    add     v0, v3
    rol     v3, 13
    xor     v3, v0
    add     v2, v1
    rol     v1, 7
    xor     v1, v2
    rol     v2, 16
    jmp     ck_l2
ck_l1:
    ; decrypt
    ror     v2, 16
    xor     v1, v2
    ror     v1, 7
    sub     v2, v1
    xor     v3, v0
    ror     v3, 13
    sub     v0, v3
    xor     v3, v2
    ror     v3, 8
    sub     v2, v3
    ror     v0, 16
    xor     v1, v0
    ror     v1, 5
    sub     v0, v1
ck_l2:
    popfd                     ; restore ZF; LOOP does not modify flags
    loop    ck_l0
    ; post-whiten
    xor     v0, [edi   ]
    xor     v1, [edi+ 4]
    xor     v2, [edi+ 8]
    xor     v3, [edi+12]
    pop     edi               ; edi = buf
    ; save buf
    stosd
    xchg    eax, v1
    stosd
    xchg    eax, v2
    stosd
    xchg    eax, v3
    stosd
    popad
    ret
```

### Compact code

```
// R rotates a 32-bit value right by n bits, so R(v,32-n) is a left rotate by n.
#define R(v,n)(((v)>>(n))|((v)<<(32-(n))))
#define F(n)for(i=0;i<n;i++)

// Chaskey-LTS encryption of one 128-bit block, in place.
//   mk : 128-bit key, read as four 32-bit words
//   p  : 128-bit block, overwritten with the ciphertext
// NOTE: relies on unsigned int being exactly 32 bits wide.
void chaskey(void*mk,void*p){
  unsigned int i,*x=p,*k=mk;

  // pre-whiten
  F(4)x[i]^=k[i];
  // 16 rounds of the ARX permutation
  F(16)
    *x+=x[1],
    x[1]=R(x[1],27)^*x,
    x[2]+=x[3],
    x[3]=R(x[3],24)^x[2],
    x[2]+=x[1],
    *x=R(*x,16)+x[3],
    x[3]=R(x[3],19)^*x,
    x[1]=R(x[1],25)^x[2],
    x[2]=R(x[2],16);
  // post-whiten
  F(4)x[i]^=k[i];
}
```

### x86 assembly

```
; -----------------------------------------------
; Chaskey-LTS block cipher in x86 assembly (encryption only)
;
; size: 89 bytes
;
; global calls use cdecl convention
;
; Stack arguments on entry:
;   [esp+ 4] not read by this version
;   [esp+ 8] key : pointer to 128-bit key
;   [esp+12] buf : pointer to 128-bit block, encrypted in place
;
; All general purpose registers are preserved (PUSHAD/POPAD).
; -----------------------------------------------

    bits 32

%ifndef BIN
    global  chaskey
    global  _chaskey
%endif

%define v0 eax
%define v1 ebx
%define v2 edx
%define v3 ebp

chaskey:
_chaskey:
    pushad                    ; 32 bytes of saved registers, so
                              ; arguments now start at esp+32+4
    mov     edi, [esp+32+ 8]  ; edi = key
    mov     esi, [esp+32+12]  ; esi = buf
    push    esi               ; keep buf for the final store
    ; load the block; the first word is parked in ebp while
    ; eax is busy as the LODSD destination
    lodsd
    xchg    eax, v3
    lodsd
    xchg    eax, v1
    lodsd
    xchg    eax, v2
    lodsd
    xchg    eax, v3           ; v0 = first word, v3 = fourth word
    ; pre-whiten
    xor     v0, [edi   ]
    xor     v1, [edi+ 4]
    xor     v2, [edi+ 8]
    xor     v3, [edi+12]
    ; 16 rounds
    push    16
    pop     ecx
ck_l0:
    ; apply permutation
    add     v0, v1
    rol     v1, 5
    xor     v1, v0
    rol     v0, 16
    add     v2, v3
    rol     v3, 8
    xor     v3, v2
    add     v0, v3
    rol     v3, 13
    xor     v3, v0
    add     v2, v1
    rol     v1, 7
    xor     v1, v2
    rol     v2, 16
    loop    ck_l0
    ; post-whiten
    xor     v0, [edi   ]
    xor     v1, [edi+ 4]
    xor     v2, [edi+ 8]
    xor     v3, [edi+12]
    pop     edi               ; edi = buf
    ; save buf
    stosd
    xchg    eax, v1
    stosd
    xchg    eax, v2
    stosd
    xchg    eax, v3
    stosd
    popad
    ret
```

### ARM32 / AArch32 assembly

```
// Chaskey-LTS encryption of one 128-bit block, ARM32 (GNU as syntax)
//
// In:  r0 = pointer to 128-bit key
//      r1 = pointer to 128-bit block, encrypted in place
// All registers are saved on entry and restored on return.

k  .req r0
x  .req r1

k0 .req r2
k1 .req r3
k2 .req r4
k3 .req r5

x0 .req r6
x1 .req r7
x2 .req r8
x3 .req r9

i  .req r10

    .text
    .global chaskey

chaskey:
    // save registers
    push   {r0-r12,lr}

    // load 128-bit key
    ldm    k, {k0, k1, k2, k3}

    // load 128-bit plaintext
    ldm    x, {x0, x1, x2, x3}

    // xor plaintext with key
    eor    x0, x0, k0          // x[0] ^= k[0];
    eor    x1, x1, k1          // x[1] ^= k[1];
    eor    x2, x2, k2          // x[2] ^= k[2];
    eor    x3, x3, k3          // x[3] ^= k[3];
    mov    i, #16              // i = 16
L0:
    // one round; a left rotate by n is encoded as ror #(32-n)
    add    x0, x0, x1          // x[0] += x[1];
    eor    x1, x0, x1, ror #27 // x[1]=ROTL32(x[1],  5) ^ x[0];
    add    x2, x2, x3          // x[2] += x[3];
    eor    x3, x2, x3, ror #24 // x[3]=ROTL32(x[3],  8) ^ x[2];
    add    x2, x2, x1          // x[2] += x[1];
    add    x0, x3, x0, ror #16 // x[0]=ROTL32(x[0], 16) + x[3];
    eor    x3, x0, x3, ror #19 // x[3]=ROTL32(x[3], 13) ^ x[0];
    eor    x1, x2, x1, ror #25 // x[1]=ROTL32(x[1],  7) ^ x[2];
    mov    x2, x2, ror #16     // x[2]=ROTL32(x[2], 16);
    subs   i, i, #1            // i--
    bne    L0                  // i > 0

    // xor ciphertext with key
    eor    x0, x0, k0          // x[0] ^= k[0];
    eor    x1, x1, k1          // x[1] ^= k[1];
    eor    x2, x2, k2          // x[2] ^= k[2];
    eor    x3, x3, k3          // x[3] ^= k[3];

    // save ciphertext
    stm    x, {x0, x1, x2, x3}

    // restore registers and return (saved lr is popped into pc)
    pop    {r0-r12,pc}
```

### ARM64 / AArch64 assembly

```
// CHASKEY in ARM64 assembly
// 112 bytes
//
// Chaskey-LTS encryption of one 128-bit block, in place.
// In:  x0 = pointer to 128-bit key
//      x1 = pointer to 128-bit block
// Uses w2-w10 as scratch; nothing is saved on the stack.

    .arch armv8-a
    .text

    .global chaskey

chaskey:
    // load 128-bit key
    ldp    w2, w3, [x0]
    ldp    w4, w5, [x0, 8]

    // load 128-bit plaintext
    ldp    w6, w7, [x1]
    ldp    w8, w9, [x1, 8]

    // xor plaintext with key
    eor    w6, w6, w2          // x[0] ^= k[0];
    eor    w7, w7, w3          // x[1] ^= k[1];
    eor    w8, w8, w4          // x[2] ^= k[2];
    eor    w9, w9, w5          // x[3] ^= k[3];
    mov    w10, 16             // i = 16
L0:
    // one round; R(v,n) is a right rotate, so 27/24/19/25
    // correspond to left rotates by 5/8/13/7
    add    w6, w6, w7          // x[0] += x[1];
    eor    w7, w6, w7, ror 27  // x[1]=R(x[1],27) ^ x[0];
    add    w8, w8, w9          // x[2] += x[3];
    eor    w9, w8, w9, ror 24  // x[3]=R(x[3],24) ^ x[2];
    add    w8, w8, w7          // x[2] += x[1];
    ror    w6, w6, 16
    add    w6, w9, w6          // x[0]=R(x[0],16) + x[3];
    eor    w9, w6, w9, ror 19  // x[3]=R(x[3],19) ^ x[0];
    eor    w7, w8, w7, ror 25  // x[1]=R(x[1],25) ^ x[2];
    ror    w8, w8, 16          // x[2]=R(x[2],16);
    subs   w10, w10, 1         // i--
    bne    L0                  // i > 0

    // xor cipher text with key
    eor    w6, w6, w2          // x[0] ^= k[0];
    eor    w7, w7, w3          // x[1] ^= k[1];
    eor    w8, w8, w4          // x[2] ^= k[2];
    eor    w9, w9, w5          // x[3] ^= k[3];

    // save 128-bit cipher text
    stp    w6, w7, [x1]
    stp    w8, w9, [x1, 8]
    ret
```

Sources here.

This entry was posted in assembly, cryptography, encryption, programming, security and tagged , , . Bookmark the permalink.

### 2 Responses to Chaskey-LTS Block Cipher

1. Pingback: XTEA Block Cipher | x86 crypto