Introduction
SPECK is a family of lightweight block ciphers designed by the National Security Agency (NSA) and published in June 2013. It uses an ARX (Add-Rotate-Xor) structure optimized for performance in software implementations and has been suggested for use on resource-constrained devices and the Internet of Things (IoT). SPECK supports a variety of block and key sizes. A block is always two words, but the words may be 16, 24, 32, 48 or 64 bits in size. The corresponding key is 2, 3 or 4 words. The round function consists of two rotations, adding the right word to the left word, XORing the round key into the left word, and XORing the left word into the right word. The number of rounds depends on the parameters selected. Two variants are implemented here, in C and in three different assembly languages: SPECK-64/128 is written in x86, ARM32 and ARM64 assembly, and SPECK-128/256 in AMD64 and ARM64 assembly. SPECK-64/128 uses 27 rounds of encryption and fits well in both legacy (x86) and long mode (x64) on x86 CPUs. However, SPECK-128/256 is only suitable for 64-bit architectures.
Key schedule
/* Fallback 32-bit rotations, used only when the project has not already
 * defined them. n must be in 1..31. */
#ifndef ROTR32
#define ROTR32(v, n) (((v) >> (n)) | ((v) << (32 - (n))))
#endif
#ifndef ROTL32
#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))
#endif

/**
 * Expand a 128-bit SPECK-64/128 key into its 27 round keys.
 *
 * @param in   Four 32-bit key words. Read through a uint32_t pointer, so the
 *             buffer must be suitably aligned for uint32_t.
 * @param out  Receives 27 32-bit round keys (27 * 4 = 108 bytes), written
 *             through a uint32_t pointer.
 */
void speck64_setkey(const void *in, void *out)
{
    const uint32_t *k = in;   /* no cast: keeps the const qualifier */
    uint32_t *ks = out;
    uint32_t i, t, k0, k1, k2, k3;

    /* copy 128-bit key to local space */
    k0 = k[0];
    k1 = k[1];
    k2 = k[2];
    k3 = k[3];

    /* expand 128-bit key into round keys */
    for (i = 0; i < 27; i++) {
        /* the current k0 is the round key for round i */
        ks[i] = k0;

        /* one SPECK round applied to (k1, k0), using the counter as key */
        k1 = (ROTR32(k1, 8) + k0) ^ i;
        k0 = ROTL32(k0, 3) ^ k1;

        /* rotate the three upper key words left by one word:
         * (k1, k2, k3) <- (k2, k3, k1) */
        t  = k1;
        k1 = k2;
        k2 = k3;
        k3 = t;
    }
}
x86 assembly
; SPECK-64/128 key schedule (x86, NASM syntax).
; Reads two stack arguments: a pointer to the 4-word key (in) and a pointer
; to the round-key output buffer (ks). Writes SPECK_RNDS dwords to ks.
; All general registers are preserved via pushad/popad.
%define SPECK_RNDS 27

; register assignment for the four key words
%define k0 eax
%define k1 ebx
%define k2 ebp
%define k3 edx

speck_setkeyx:
_speck_setkeyx:
    pushad                     ; saves 8 registers = 32 bytes, hence the +32 below
    mov    esi, [esp+32+4]     ; esi = in
    mov    edi, [esp+32+8]     ; edi = ks
    ; Load the key. lodsd always loads into eax, so key[0] is parked in k3
    ; first and swapped back into eax (k0) by the final xchg.
    lodsd
    xchg   eax, k3             ; k3 = key[0] (temporary)
    lodsd
    xchg   eax, k1             ; k1 = key[1]
    lodsd
    xchg   eax, k2             ; k2 = key[2]
    lodsd
    xchg   eax, k3             ; k0 (eax) = key[0], k3 = key[3]
    xor    ecx, ecx            ; i = 0
spk_sk:
    ; ((uint32_t*)ks)[i] = k0;
    stosd                      ; stores eax (k0) at [edi] and advances edi
    ; k1 = (ROTR32(k1, 8) + k0) ^ i;
    ror    k1, 8
    add    k1, k0
    xor    k1, ecx
    ; k0 = ROTL32(k0, 3) ^ k1;
    rol    k0, 3
    xor    k0, k1
    ; rotate (k1, k2, k3) left by one 32-bit word: (k1,k2,k3) <- (k2,k3,k1)
    xchg   k3, k2
    xchg   k3, k1
    ; i++
    inc    ecx
    cmp    cl, SPECK_RNDS      ; the count fits in 8 bits, so compare the low byte only
    jnz    spk_sk
    popad
    ret
Encryption/Decryption
void speck64_encrypt(const void *keys, int enc, void *data) { uint32_t i, x0, x1; uint32_t *ks=(uint32_t*)keys; uint32_t *x=(uint32_t*)data; // copy input to local space x0=x[0]; x1=x[1]; for (i=0; i<27; i++) { if (enc==SPECK_DECRYPT) { x1 = ROTR32(x1 ^ x0, 3); x0 = ROTL32((x0 ^ ks[27-1-i]) - x1, 8); } else { x0 = (ROTR32(x0, 8) + x1) ^ ks[i]; x1 = ROTL32(x1, 3) ^ x0; } } // save result x[0] = x0; x[1] = x1; }
x86 assembly
; SPECK-64/128 block encryption/decryption (x86, NASM syntax).
; Reads three stack arguments in order: round-key pointer (ks), direction
; flag (enc; zero takes the encryption path, non-zero the decryption path)
; and a pointer to the 2-word block, which is overwritten with the result.
; Uses SPECK_RNDS, which this block does not define.
; All general registers are preserved via pushad/popad.
%define x0 eax
%define x1 ebx

speck_encryptx:
_speck_encryptx:
    pushad                     ; saves 8 registers = 32 bytes
    lea    esi, [esp+32+4]     ; esi -> first stack argument; lodsd then walks the arguments
    lodsd
    xchg   edi, eax            ; edi = ks
    lodsd
    xchg   eax, ecx            ; ecx = enc
    lodsd
    xchg   eax, esi            ; esi = in
    push   esi                 ; keep the block pointer for the final store
    lodsd
    xchg   eax, x1             ; x1 = in[0] (temporary)
    lodsd
    xchg   eax, x1             ; x0 (eax) = in[0], x1 = in[1]
    test   ecx, ecx            ; ZF = (enc == 0)
    ; mov does not alter flags, so the jz below still sees the test result.
    ; NOTE(review): only cl is written; the upper 24 bits of ecx keep those of
    ; enc, so the loop count is SPECK_RNDS only when enc < 256.
    mov    cl, SPECK_RNDS
    jz     spk_e0
spk_d0:
    ; x1 = ROTR32(x1 ^ x0, 3);
    xor    x1, x0
    ror    x1, 3
    ; x0 = ROTL32((x0 ^ ks[SPECK_RNDS-1-i]) - x1, 8);
    xor    x0, [edi+4*ecx-4]   ; ecx counts SPECK_RNDS..1, so this indexes ks[ecx-1]
    sub    x0, x1
    rol    x0, 8
    loop   spk_d0
    jmp    spk_end
spk_e0:
    ; x0 = (ROTR32(x0, 8) + x1) ^ ks[i];
    ror    x0, 8
    add    x0, x1
    xor    x0, [edi]
    scasd                      ; used only to advance edi by 4 (assumes DF = 0); its flags are ignored
    ; x1 = ROTL32(x1, 3) ^ x0;
    rol    x1, 3
    xor    x1, x0
    loop   spk_e0
spk_end:
    pop    edi                 ; edi = block pointer saved above
    ; ((uint32_t*)in)[0] = x0;
    stosd
    xchg   eax, x1
    ; ((uint32_t*)in)[1] = x1;
    stosd
    popad
    ret
SPECK-64/128 in C
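Many block ciphers are used in Counter (CTR) mode, which turns a block cipher into a stream cipher. CTR mode only ever uses the cipher in the encryption direction, so decryption and a stored key schedule are unnecessary. Here’s the function with key scheduling and encryption combined.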
/* Rotate a 32-bit word right by n bits; n must be in 1..31.
 * Assumes unsigned int is exactly 32 bits wide. */
static inline unsigned int speck64_rotr(unsigned int v, unsigned int n)
{
    return (v >> n) | (v << (32 - n));
}

/**
 * Encrypt one 64-bit block in place with SPECK-64/128, deriving each round
 * key on the fly instead of storing a key schedule.
 *
 * @param mk  128-bit master key: four unsigned int words. Only read; the
 *            words are copied to a local array first.
 * @param p   64-bit block: two unsigned int words, overwritten with the
 *            ciphertext.
 *
 * Both buffers are accessed through unsigned int pointers and must be
 * suitably aligned for that type.
 */
void speck64(void *mk, void *p)
{
    const unsigned int *key = mk;
    unsigned int *x = p;
    unsigned int k[4];
    unsigned int i, t;

    /* work on a private copy of the key so the caller's buffer is untouched */
    for (i = 0; i < 4; i++) {
        k[i] = key[i];
    }

    for (i = 0; i < 27; i++) {
        /* round function: k[0] is the round key for round i */
        x[0] = (speck64_rotr(x[0], 8) + x[1]) ^ k[0];
        x[1] = speck64_rotr(x[1], 29) ^ x[0];   /* rotr 29 == rotl 3 */

        /* create the next subkey: the same round applied to (k[1], k[0]),
         * keyed with the round counter */
        k[1] = (speck64_rotr(k[1], 8) + k[0]) ^ i;
        k[0] = speck64_rotr(k[0], 29) ^ k[1];

        /* permute key: (k[1], k[2], k[3]) <- (k[2], k[3], k[1]) */
        t    = k[1];
        k[1] = k[2];
        k[2] = k[3];
        k[3] = t;
    }
}
SPECK-64/128 in x86 assembly
; -----------------------------------------------
; SPECK-64/128 Block Cipher in x86 assembly (Encryption only)
;
; size: 64 bytes
;
; global calls use cdecl convention
;
; Reads two stack arguments: the 4-word key pointer (first) and the 2-word
; block pointer (second). The block is encrypted in place; round keys are
; derived on the fly. All general registers are preserved via pushad/popad.
;
; -----------------------------------------------

    bits 32

%define SPECK_RNDS 27

; NOTE: these four definitions are overridden by the %define lines further
; down before any instruction uses them.
%define k0 eax
%define k1 ebx
%define k2 ebp
%define k3 edx

;
; speck64/128 encryption in 64 bytes
;
%ifndef BIN
    global speck
    global _speck
%endif

; effective register assignment: key words k0..k3, block words w0/w1
%define k0 edi
%define k1 ebp
%define k2 ecx
%define k3 esi

%define w0 ebx
%define w1 edx

speck:
_speck:
    pushad                     ; saves 8 registers = 32 bytes
    mov    esi, [esp+32+8]     ; esi = in (second argument)
    push   esi                 ; save the block pointer for the final store
    lodsd
    xchg   eax, w0             ; w0 = in[0]
    lodsd
    xchg   eax, w1             ; w1 = in[1]
    ; The push above moved esp down by 4, so the same displacement now
    ; addresses the first argument.
    mov    esi, [esp+32+8]     ; esi = key
    lodsd
    xchg   eax, k0             ; k0 = key[0]
    lodsd
    xchg   eax, k1             ; k1 = key[1]
    lodsd
    xchg   eax, k2             ; k2 = key[2]
    lodsd
    xchg   eax, k3             ; k3 = key[3] (overwrites esi; it is the last load)
    xor    eax, eax            ; i = 0
spk_el:
    ; w0 = (ROTR32(w0, 8) + w1) ^ k0;
    ror    w0, 8
    add    w0, w1
    xor    w0, k0
    ; w1 = ROTR32(w1, 29) ^ w0;
    ror    w1, 29
    xor    w1, w0
    ; k1 = (ROTR32(k1, 8) + k0) ^ i;
    ror    k1, 8
    add    k1, k0
    xor    k1, eax
    ; k0 = ROTR32(k0, 29) ^ k1;
    ror    k0, 29
    xor    k0, k1
    ; rotate key words: (k1, k2, k3) <- (k2, k3, k1)
    xchg   k3, k2
    xchg   k3, k1
    ; i++
    inc    eax
    cmp    al, SPECK_RNDS      ; the count fits in 8 bits, so compare the low byte only
    jnz    spk_el
    pop    edi                 ; edi = saved block pointer (k0 is no longer needed)
    xchg   eax, w0
    stosd                      ; in[0] = w0
    xchg   eax, w1
    stosd                      ; in[1] = w1
    popad
    ret
SPECK-64/128 in ARM / AArch32 assembly
// SPECK-64/128 encryption of one 64-bit block in place (ARM / AArch32).
// Round keys are derived on the fly from the 128-bit key.
  .arm
  .arch armv7
  .text

  .global speck

  // key
  k0 .req r2
  k1 .req r3
  k2 .req r4
  k3 .req r5

  // plaintext
  x0 .req r6
  x1 .req r7

  // parameters
  k  .req r0
  x  .req r1

  // i shares r0 with k: the key pointer is not needed after the key is loaded
  i  .req r0
  t  .req r8

  // speck(void *key, void *data);
speck:
  // save registers
  push   {r0-r12, lr}

  // load 128-bit key
  // k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
  ldm    k, {k0, k1, k2, k3}
  // load 64-bit plain text
  ldm    x, {x0, x1}          // x0 = x[0]; x1 = x[1];
  mov    i, #0                // i=0
speck_loop:
  add    x0, x1, x0, ror #8   // x0 = (ROTR32(x0, 8) + x1) ^ k0;
  eor    x0, k0               //
  eor    x1, x0, x1, ror #29  // x1 = ROTL32(x1, 3) ^ x0;  (ror 29 == rol 3)
  mov    t, k3                // backup k3
  add    k3, k0, k1, ror #8   // k3 = (ROTR32(k1, 8) + k0) ^ i;
  eor    k3, i                //
  eor    k0, k3, k0, ror #29  // k0 = ROTL32(k0, 3) ^ k3;
  mov    k1, k2               // k1 = k2;
  mov    k2, t                // k2 = t;   (the old k3)
  add    i, #1                // i++;
  cmp    i, #27               // i<27;
  bne    speck_loop

  // save result
  stm    x, {x0, x1}          // x[0] = x0; x[1] = x1;

  // restore registers; loading pc with the saved lr returns to the caller
  pop    {r0-r12, pc}
SPECK-128/256 in C
/* Rotate a 64-bit word right by n bits; n must be in 1..63.
 * Assumes unsigned long long is exactly 64 bits wide. */
static inline unsigned long long speck128_rotr(unsigned long long v,
                                               unsigned int n)
{
    return (v >> n) | (v << (64 - n));
}

/**
 * Encrypt one 128-bit block in place with SPECK-128/256, deriving each round
 * key on the fly instead of storing a key schedule.
 *
 * @param mk  256-bit master key: four unsigned long long words. Only read;
 *            the words are copied to a local array first.
 * @param p   128-bit block: two unsigned long long words, overwritten with
 *            the ciphertext. x[1] is the word that is rotated by 8 and added
 *            to; x[0] is the word that is rotated by 61 (i.e. left by 3).
 *
 * Both buffers are accessed through unsigned long long pointers and must be
 * suitably aligned for that type.
 */
void speck128(void *mk, void *p)
{
    const unsigned long long *key = mk;
    unsigned long long *x = p;
    unsigned long long k[4];
    unsigned long long i, t;

    /* load 256-bit key into a private copy */
    for (i = 0; i < 4; i++) {
        k[i] = key[i];
    }

    /* encrypt 128-bit plaintext */
    for (i = 0; i < 34; i++) {
        /* round function: k[0] is the round key for round i */
        x[1] = (speck128_rotr(x[1], 8) + x[0]) ^ k[0];
        x[0] = speck128_rotr(x[0], 61) ^ x[1];   /* rotr 61 == rotl 3 */

        /* create the next subkey: the same round applied to (k[1], k[0]),
         * keyed with the round counter */
        k[1] = (speck128_rotr(k[1], 8) + k[0]) ^ i;
        k[0] = speck128_rotr(k[0], 61) ^ k[1];

        /* permute key: (k[1], k[2], k[3]) <- (k[2], k[3], k[1]) */
        t    = k[1];
        k[1] = k[2];
        k[2] = k[3];
        k[3] = t;
    }
}
SPECK-128/256 in AMD64 assembly
; -----------------------------------------------
; SPECK-128/256 block cipher in AMD64 assembly
;
; size: 83 bytes
;
; NOTE(review): this header originally said "global calls use microsoft
; fastcall convention", but the code below reads the key pointer from rdi
; and the data pointer from rsi, which is where the System V AMD64
; convention passes the first two integer arguments (Microsoft x64 uses
; rcx and rdx). Confirm which ABI the callers use.
;
; -----------------------------------------------

    %ifndef BIN
      global speck128
    %endif

; key words
%define k0 rbx
%define k1 rcx
%define k2 rdx
%define k3 rdi

; block words
%define x0 rbp
%define x1 rsi

speck128:
    push   rbp
    push   rbx
    push   rsi                 ; keep the data pointer; popped into rax for the final store

    mov    x0, [rsi  ]         ; x0 = data[0]
    mov    x1, [rsi+8]         ; x1 = data[1] (overwrites rsi; pointer was saved above)

    ; F(4)k[i]=((W*)mk)[i];
    mov    k0, [rdi   ]        ; k0 = mk[0]
    mov    k1, [rdi+ 8]        ; k1 = mk[1]
    mov    k2, [rdi+16]        ; k2 = mk[2]
    mov    k3, [rdi+24]        ; k3 = mk[3] (overwrites rdi; it is the last load)

    xor    eax, eax            ; i = 0
spk_L0:
    ; x[1] = (R(x[1], 8) + x[0]) ^ k[0];
    ror    x1, 8
    add    x1, x0
    xor    x1, k0
    ; x[0] = R(x[0], 61) ^ x[1];
    ror    x0, 61
    xor    x0, x1
    ; k[1] = (R(k[1], 8) + k[0]) ^ i;
    ror    k1, 8
    add    k1, k0
    ; i < 34 fits in one byte, so XORing the low byte of k1 (cl) is enough
    xor    cl, al              ; k1 ^= i
    ; k[0] = R(k[0], 61) ^ k[1];
    ror    k0, 61
    xor    k0, k1
    ; t = k[1], k[1] = k[2], k[2] = k[3], k[3] = t;
    xchg   k1, k2
    xchg   k2, k3
    ; i++
    inc    al
    cmp    al, 34
    jnz    spk_L0

    pop    rax                 ; rax = data pointer pushed at entry
    ; save 128-bit result
    mov    [rax  ], x0
    mov    [rax+8], x1
    pop    rbx
    pop    rbp
    ret
SPECK-128/256 in ARM64 / AArch64 assembly
// SPECK128/256 in ARM64 assembly
// 80 bytes
//
// Encrypts one 128-bit block in place, deriving round keys on the fly.
// Register use: x0 = key pointer, x1 = data pointer,
//               x2/x4 = block words data[0]/data[1],
//               x5..x8 = key words k0..k3, x3 = round counter, x9 = temporary.

    .arch armv8-a
    .text
    .global speck128

    // speck128(void*mk, void*data);
speck128:
    // load 256-bit key
    // k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
    ldp    x5, x6, [x0]
    ldp    x7, x8, [x0, 16]

    // load 128-bit plain text
    ldp    x2, x4, [x1]         // x0 = x[0]; x1 = x[1];
    mov    x3, xzr              // i=0
L0:
    ror    x4, x4, 8
    add    x4, x4, x2           // x1 = (R(x1, 8) + x0) ^ k0;
    eor    x4, x4, x5           //
    eor    x2, x4, x2, ror 61   // x0 = R(x0, 61) ^ x1;

    mov    x9, x8               // backup k3
    ror    x6, x6, 8
    add    x8, x5, x6           // k3 = (R(k1, 8) + k0) ^ i;
    eor    x8, x8, x3           //
    eor    x5, x8, x5, ror 61   // k0 = R(k0, 61) ^ k3;
    mov    x6, x7               // k1 = k2;
    mov    x7, x9               // k2 = t;   (the old k3)

    add    x3, x3, 1            // i++;
    cmp    x3, 34               // i < 34;
    bne    L0

    // save result
    stp    x2, x4, [x1]         // x[0] = x0; x[1] = x1;
    ret
Pingback: Shellcode: Encryption Algorithms in ARM Assembly | modexp