## Keccak Permutation Function

### Introduction

Permutation functions are incredibly useful, and Keccak — which is used to implement the SHA-3 standard — is like a Swiss Army knife of cryptographic primitives.

Designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche, Keccak can be used to construct a stream cipher, a cryptographic hash function, a pseudo random number generator (PRNG), a message authentication code (MAC) or authenticated encryption (AE).

Although SHA-3 (FIPS 202) was covered in a previous post, the underlying permutation function (Keccak-p[1600, 24]) was for the most part overlooked.

A lightweight implementation of Keccak-p[800, 22] for 32-bit architectures will be shown here, specifically the x86 CPU.

### Keccak Parameters

F Target Architecture Capacity Rounds
p200 8-bit 200-bits 18
p400 16-bit 400-bits 20
p800 32-bit 800-bits 22
p1600 64-bit 1600-bits 24

### Keccak Modules

Module Description
Theta Renders the internal state into a 5-by-5 array of n-bit elements. Computes the parity of each column and combines them with an exclusive-or (XOR) operator. Then it XORs the resulting parity to each state bit as follows.
Rho Rotates each element by a triangular number.
Pi Permutes the elements.
Chi Adds a non-linear aspect to the permutation round. Combines the row elements using only three bitwise operators: AND, NOT, and XOR.
Iota Breaks up any symmetry caused by the other modules. This is done by XORing one of the array elements to a round constant. Without iota, the round mapping would be symmetric. Without iota, all rounds would be the same.

### Keccak-p800 in C

// round constant function
// Primitive polynomial over GF(2): x^8+x^6+x^5+x^4+1
// Keccak round-constant generator.
// Returns the next 32-bit round constant and advances the 8-bit LFSR
// state seven steps (one step per candidate bit position 2^j - 1).
// LFSR feedback: primitive polynomial over GF(2): x^8+x^6+x^5+x^4+1
// (0x71 once the x^8 term is reduced out).
uint32_t rc (uint8_t *LFSR) {
    uint32_t c   = 0;       // accumulated round constant
    uint32_t s   = *LFSR;   // working copy of the shift-register state
    uint32_t pos;           // candidate position selector: 1, 2, 4, ..., 64

    for (pos = 1; pos < 128; pos <<= 1) {
        // the LFSR output bit decides whether bit (pos - 1) is set
        if (s & 1) {
            // only positions below 32 fit in a 32-bit (p800) lane
            if (pos <= 32) {
                c ^= 1UL << (pos - 1);
            }
        }
        // one Galois LFSR step, kept byte-wide by masking
        s = (s & 0x80) ? ((s << 1) ^ 0x71) & 0xFF : (s << 1) & 0xFF;
    }
    *LFSR = (uint8_t)s;
    return c;
}

// Keccak-p[800, 22] permutation over a 25-lane state of 32-bit words.
// Applies 22 rounds of Theta, Rho, Pi, Chi and Iota in place.
// Depends on file-level ROTL32()/XCHG() macros, the sibling rc() round
// constant generator, and memcpy() from <string.h>.
void k800_permute (void *state) {
uint32_t i, j, r, t, u, bc[5];
uint8_t  lfsr=1;            // LFSR state driving the Iota round constants
uint32_t *st=(uint32_t*)state;
uint8_t  *p, *m;
int      rnd;

// Rho/Pi lane order: 24 byte-sized indices packed into six 32-bit words
uint32_t piln[6]=
{ 0x110b070a, 0x10050312, 0x04181508,
0x0d13170f, 0x0e14020c, 0x01060916 };

// lookup table of (i mod 5) for i = 0..9, packed as bytes (avoids division)
uint32_t m5[3]=
{ 0x03020100, 0x02010004, 0x00000403 };

p = (uint8_t*)piln;
m = (uint8_t*)m5;

for (rnd=0; rnd<22; rnd++) {
// Theta
// first pass: bc[i] = parity (XOR) of the five lanes in column i
for (i=0; i<5; i++) {
t  = st[i];
t ^= st[i +  5];
t ^= st[i + 10];
t ^= st[i + 15];
t ^= st[i + 20];
bc[i] = t;
}
// second pass: D[i] = C[(i+4) mod 5] ^ ROTL(C[(i+1) mod 5], 1),
// XORed into every lane of column i
for (i=0; i<5; i++) {
t  = bc[m[(i + 4)]];            // bc[(i + 4) % 5] via the mod-5 byte table
t ^= ROTL32(bc[m[(i + 1)]], 1); // bc[(i + 1) % 5] rotated left one bit
for (j=i; j<25; j+=5) {
st[j] ^= t;
}
}
// Rho + Pi: rotate each lane by a triangular number ((i+1)(i+2)/2,
// reduced mod 32 by the rotate itself) while moving lanes along the
// pi cycle that starts at st[1]
u = st[1];
for (i=0, r=0; i<24; i++) {
r = r + i + 1;
u  = ROTL32(u, r);
XCHG(st[p[i]], u);              // place rotated lane, pick up the displaced one
bc[0] = u;                      // NOTE(review): appears redundant (u already holds
                                // the value); retained from the reference code
}
// Chi: st[i+j] ^= ~st[i+(j+1)%5] & st[i+(j+2)%5], one row at a time
for (i=0; i<25; i+=5) {
memcpy(&bc, &st[i], 5*4);       // snapshot the row before overwriting it
for (j=0; j<5; j++) {
t  = ~bc[m[(j + 1)]];
t &=  bc[m[(j + 2)]];
st[j + i] ^= t;
}
}
// Iota: XOR the next round constant into lane 0 to break symmetry
st[0] ^= rc(&lfsr);
}
}


### 32-bit x86 Assembly

The following code includes optimizations from Peter Ferrie.

bits   32

struc kws_t
bc1  resd 1 ; edi
bc2  resd 1 ; esi
bc3  resd 1 ; ebp
bc4  resd 1 ; esp
bc5  resd 1 ; ebx
lfsr resd 1 ; edx
; ecx
; eax
.size:
endstruc

%ifndef BIN
global k800_permutex
global _k800_permutex
%endif

; void k800_permutex(void *state);
;-----------------------------------------------------------------------
; void k800_permutex(void *state)
; Keccak-p[800, 22] permutation, hand-written 32-bit x86 (NASM).
; The mod-5 and rho/pi byte tables are embedded between the CALL and its
; target so that POP EBX yields a pointer to both (p = m + 12).
;
; NOTE(review): this listing appears to have lost lines in transcription:
;  - the [esp+32+4] argument offset and the kws_t register map imply a
;    PUSHAD prologue (and matching POPAD before RET) that is not shown;
;  - theta_l2 never advances EAX (the C code steps j += 5, and the later
;    "sub al, 24" only makes sense if AL ended at i+25);
;  - ESI/EDI are not rewound after the LODSD/STOSD/MOVSD sequences, and
;    the chi loop counter (CL) is dead after REP MOVSD.
; Compare against the author's original sources before assembling.
;-----------------------------------------------------------------------
k800_permutex:
_k800_permutex:
mov    esi, [esp+32+4]      ; esi = st (offset assumes a PUSHAD frame, see note)
call   k800_l0              ; pushes the address of the tables below
; modulo 5
dd     003020100h, 002010004h, 000000403h
; rho pi
dd     0110b070ah, 010050312h, 004181508h
dd     00d13170fh, 00e14020ch, 001060916h
k800_l0:
pop    ebx                  ; m + p (ebx = mod-5 table; pi order at ebx+12)
push   22
pop    eax                  ; eax = 22 rounds
cdq                         ; edx = 0
inc    edx                  ; lfsr = 1
mov    edi, esp             ; edi = bc (five dwords of stack scratch; the kws_t
                            ; struc maps them onto the PUSHAD save slots)
k800_l1:
push   eax                  ; save the remaining-round count
push   5
pop    ecx                  ; ecx = 5 columns
theta_l0:
; Theta, pass 1: bc[i] = XOR parity of column i
lodsd                       ; t  = st[i     ];
xor    eax, [esi+ 5*4-4]    ; t ^= st[i +  5];
xor    eax, [esi+10*4-4]    ; t ^= st[i + 10];
xor    eax, [esi+15*4-4]    ; t ^= st[i + 15];
xor    eax, [esi+20*4-4]    ; t ^= st[i + 20];
stosd                       ; bc[i] = t;
loop   theta_l0
xor    eax, eax             ; i = 0
theta_l1:
; Theta, pass 2: D = bc[(i+4) mod 5] ^ ROTL(bc[(i+1) mod 5], 1)
movzx  ebp, byte[ebx+eax+4] ; ebp = m[(i + 4)];
mov    ebp, [edi+ebp*4]     ; t   = bc[m[(i + 4)]];
movzx  edx, byte[ebx+eax+1] ; edx = m[(i + 1)];
mov    edx, [edi+edx*4]     ; edx = bc[m[(i + 1)]];
rol    edx, 1               ; t  ^= ROTL32(edx, 1);
xor    ebp, edx
theta_l2:
; NOTE(review): no "add al, 5" appears here — as listed this loop
; repeats with the same index; the C code steps j by 5.
xor    [esi+eax*4], ebp     ; st[j] ^= t;
cmp    al, 25               ; j<25
jb     theta_l2
sub    al, 24               ; i=i+1 (AL is expected to end at i+25)
loop   theta_l1             ; i<5
; *************************************
; Rho Pi
; *************************************
; On entry AL = 5 (i+5 form) and ECX = 0 from the LOOP above.
mov    ebp, [esi+1*4]       ; t = st[1];
rho_l0:
lea    ecx, [ecx+eax-4]     ; r = r + i + 1; (EAX = i+5, so this adds i+1)
rol    ebp, cl              ; t = ROTL32(t, r);
movzx  edx, byte[ebx+eax+7] ; edx = p[i]; (pi table starts at ebx+12 = ebx+(i+5)+7-i)
xchg   [esi+edx*4], ebp     ; XCHG(st[p[i]], t);
inc    eax                  ; i++
cmp    al, 24+5             ; i<24
jnz    rho_l0               ;
; *************************************
; Chi
; *************************************
xor    ecx, ecx             ; i = 0
chi_l0:
; memcpy(&bc, &st[i], 5*4);  — snapshot one row before overwriting it
; NOTE(review): REP MOVSD advances ESI/EDI by 20 and leaves ECX = 0, so
; the row bookkeeping below ("cmp cl, 25") cannot terminate as listed.
lea    esi, [esi+ecx*4]     ; esi = &st[i];
mov    cl, 5
rep    movsd
xor    eax, eax
chi_l1:
; st[j + i] ^= ~bc[(j+1) mod 5] & bc[(j+2) mod 5]
movzx  ebp, byte[ebx+eax+1]
movzx  edx, byte[ebx+eax+2]
mov    ebp, [edi+ebp*4]     ; t = ~bc[m[(i + 1)]]
not    ebp
and    ebp, [edi+edx*4]
lea    edx, [eax+ecx]       ; edx = j + i
xor    [esi+edx*4], ebp     ; st[j + i] ^= t;
inc    eax                  ; j++
cmp    al, 5                ; j<5
jnz    chi_l1
cmp    cl, 25               ; i<25
jb     chi_l0
; Iota — the 8-bit LFSR from rc() in the C version, inlined.
; lfsr lives in the saved-EDX slot of the PUSHAD frame (kws_t.lfsr);
; +4 compensates for the round count pushed at k800_l1.
mov    al, [esp+kws_t+lfsr+4]; al = t = *LFSR
mov    dl, 1                ; i = 1
xor    ebp, ebp
iota_l0:
test   al, 1                ; t & 1
je     iota_l1
lea    ecx, [edx-1]         ; ecx = (i - 1)
cmp    cl, 32               ; skip if (ecx >= 32)
jae    iota_l1
btc    ebp, ecx             ; c ^= 1UL << (i - 1)
iota_l1:
add    al, al               ; t << 1, CF = old top bit
sbb    ah, ah               ; ah = CF ? 0xFF : 0x00
and    ah, 0x71             ; apply feedback 0x71 only when the bit carried out
xor    al, ah
add    dl, dl               ; i += i
jns    iota_l0              ; while (i != 128)
mov    [esp+kws_t+lfsr+4], al; save t
xor    [esi], ebp           ; st[0] ^= rc(&lfsr);
pop    eax                  ; restore round count
dec    eax
jnz    k800_l1              ; rnds<22
ret


### Summary

The x86 assembly generated by MSVC is approx. 342 bytes while the hand written x86 assembly is currently 236 bytes.

Posted in assembly, cryptography, encryption, programming | Tagged , , , | 1 Comment

## Gimli: a cross-platform permutation function

Introduction

Gimli, named after the Lord Of The Rings character, is a 384-bit permutation function, designed specifically to be lightweight, high performance, and secure across multiple architectures.

Gimli can easily be used to build high-security block ciphers, tweakable block ciphers, stream ciphers, message-authentication codes, authenticated ciphers and hash functions.

What follows here is a compact implementation in C and x86 assembly. For performance enhanced implementations, please look at reference code written by Frank Denis here.

So happy was Frank with this function, that the LibHydrogen library is built atop Gimli.

However, Mike Hamburg points out in Cryptanalysis of 22 1/2 rounds of Gimli, there are weaknesses in how the Linear layer is applied which may lead to an update of this function in the future.

The Gimli team responded to Hamburg in Statement regarding “Cryptanalysis of 22 1/2 rounds of Gimli”

## C code

The only thing to mention here is the cyclic left shift by 24 bits is replaced with cyclic right shift by 8 bits. Same outcome, different direction.

// Gimli 384-bit permutation: 24 rounds over 12 x 32-bit words.
// The reference cyclic left rotate by 24 of the top row is written as a
// right rotate by 8 (same result). Requires file-level ROTR32/ROTL32.
void gimli(void *state)
{
    uint32_t *s = (uint32_t*)state;
    uint32_t a, b, c, swp;
    int      round, col;

    // the loop counter doubles as the round constant: 0x9e377900 | round
    for (round = 0x9e377918; round != 0x9e377900; round--) {
        // SP-box, applied to each of the four columns
        for (col = 0; col < 4; col++) {
            a = ROTR32(s[col    ], 8);   // row 0, rotated left 24
            b = ROTL32(s[col + 4], 9);   // row 1, rotated left 9
            c = s[col + 8];              // row 2, unrotated

            s[col + 8] = a ^ (c << 1) ^ ((b & c) << 2);
            s[col + 4] = b ^ a        ^ ((a | c) << 1);
            s[col    ] = c ^ b        ^ ((a & b) << 3);
        }

        switch (round & 3) {
        case 0:
            // small swap, then inject the round constant
            swp = s[0]; s[0] = s[1]; s[1] = swp;
            swp = s[2]; s[2] = s[3]; s[3] = swp;
            s[0] ^= round;
            break;
        case 2:
            // big swap
            swp = s[0]; s[0] = s[2]; s[2] = swp;
            swp = s[1]; s[1] = s[3]; s[3] = swp;
            break;
        }
    }
}


## x86 Assembly

The following code includes optimizations from Peter Ferrie. It's currently 112 bytes.

; Register aliases for the column pass. Note t0 aliases y (ebp) and the
; s0..s3 aliases reuse x/j/y/s for the swap stage after the columns are
; done — the two register maps are never live at the same time.
%define j  ebx
%define x  eax
%define y  ebp
%define z  edx

%define s  esi

%define t0 ebp
%define t1 edi

%define r  ecx

%define s0 eax
%define s1 ebx
%define s2 ebp
%define s3 esi

;-----------------------------------------------------------------------
; void gimlix(void *state)
; Gimli 384-bit permutation, 24 rounds, 32-bit x86 (NASM).
; ECX doubles as round counter and round constant: it starts at
; 0x9e377918 (= 0x9e377900 | 24) and its low byte counts down to zero.
; New first-row words are pushed during the column pass and popped into
; s0..s3 for the swap stage, then stored back with STOSD.
; NOTE(review): the [esp+32+4] argument offset implies a PUSHAD prologue
; (and matching POPAD) that does not appear in this listing — confirm
; against the original sources.
;-----------------------------------------------------------------------
gimlix:
_gimlix:
mov    r, 0x9e377900 + 24
g_l0:
mov    s, [esp+32+4]        ; esi = s (reloaded every round; see note)
push   s                    ; saved here, popped into t1 for the write-back
push   4
pop    j                    ; 4 columns
g_l1:
; x = ROTR32(s[    j], 8);
lodsd  ;mov    x, [s]       ; also advances esi to the next column
ror    x, 8

; y = ROTL32(s[4 + j], 9);
mov    y, [s + (4*3)]       ; esi already advanced, so +12 = &s[4+j]
rol    y, 9

; z = s[8 + j];
mov    z, [s + (4*7)]       ; likewise +28 = &s[8+j]

; s[8 + j] = x ^ (z << 1) ^ ((y & z) << 2);
push   x
push   y
lea    t1, [z + z]          ; z << 1 without disturbing z
and    y, z
shl    y, 2
xor    t1, y
xor    x, t1
mov    [s + (7*4)], x
pop    y
pop    x

; s[4 + j] = y ^ x        ^ ((x | z) << 1);
push   x
push   y
xor    y, x
or     x, z
shl    x, 1
xor    y, x
mov    [s + (3*4)], y
pop    y
pop    x

; s[j]     = z ^ y        ^ ((x & y) << 3);
xor    z, y
and    x, y
shl    x, 3
xor    z, x
push   z                    ; defer the first-row store until after the swaps

dec    j
jnz    g_l1

; pop the four new first-row words (pushed j = 0..3, so popped 3..0)
pop    s3
pop    s2
pop    s1
pop    s0

pop    t1                   ; t1 = saved state pointer (for STOSD below)

mov    dl, cl               ; dl = round mod 4
and    dl, 3
jnz    g_l2                 ; small swap only when round % 4 == 0

; XCHG (s[0], s[1]);
xchg   s0, s1
; XCHG (s[2], s[3]);
xchg   s2, s3
; s[0] ^= 0x9e377900 ^ r;   (ecx already holds 0x9e377900 | round)
xor    s0, r
g_l2:
cmp    dl, 2                ; big swap when round % 4 == 2
jnz    g_l3
; XCHG (s[0], s[2]);
xchg   s0, s2
; XCHG (s[1], s[3]);
xchg   s1, s3
g_l3:
; write the (possibly swapped) first row back: s[0..3]
stosd
xchg   eax, s1
stosd
xchg   eax, s2
stosd
xchg   eax, s3
stosd
dec    cl                   ; low byte of r counts the 24 rounds
jnz    g_l0
ret


See sources here

Posted in assembly, cryptography, encryption, programming, security | Tagged , , , , , | 3 Comments

## Light Message Authentication Code (LightMAC)

### Introduction

A Message Authentication Code or MAC is a cryptographic primitive that enables two parties using a shared secret key to authenticate messages exchanged between them.

MACs are used daily to secure online communications and will inevitably become a critical component of embedded systems in the future that have wireless functionality; specifically passive, low-power and IoT devices.

Some MACs are constructed from cryptographic hash algorithms. The Hash-based message authentication code (HMAC) published in 1996 for example recommended using MD5 or SHA-1. However, due to resources required for these hash functions, they were never suitable for microcontrollers.

Other MACs are constructed from block ciphers and this is more ideal for a developer where ROM and RAM are in short supply since there’s already a number of lightweight block ciphers available.

There’s no shortage of cryptographic solutions for desktop computers and mobile devices, however there still remains a significant problem with some of the current standardized algorithms when attempting to implement for resource constrained software or hardware devices.

A draft Report on Lightweight Cryptography published by NIST in August 2016 states that none of the NIST approved hash functions are suitable for resource constrained environments based on the findings in A Study on RAM Requirements of Various SHA-3 Candidates on Low-cost 8-bit CPUs

There’s currently an effort on part of engineers, cryptographers and organizations to design, evaluate and standardize lightweight cryptographic primitives for the embedded industry.

Whether we agree or disagree on the need to embed wireless devices in everyday objects, there will be many that have networking or wireless functionality in future which will require robust encryption solutions.

### LightMAC

A MAC Mode for Lightweight Block Ciphers was designed by Atul Luykx, Bart Preneel, Elmar Tischhauser, Kan Yasuda and published in 2016.

$\begin{array}{l} \\\hline \textbf{Algorithm 1: }\text{LightMAC} _{K_1,K_2}(M) \\\hline \quad\textbf{Input: } K_1, K_2\in \{0, 1\}^k,\: M\in \{0,1\}^{\leq 2^s(n-s)}\\ \quad\textbf{Output: } T\in\{0, 1\}^t\\ \textbf{\scriptsize{1}}\;\; V\leftarrow 0^n\in\{0, 1\}^n\\ \textbf{\scriptsize{2}}\;\; M[1]M[2]\cdots M[\ell]\xleftarrow{n-s}M\\ \textbf{\scriptsize{3}}\;\; \textbf{for } i = 1\textbf{ to } \ell - 1\textbf{ do}\\ \textbf{\scriptsize{4}}\;\; \mid V\leftarrow V\oplus E_{K_1}(i_s M[i])\\ \textbf{\scriptsize{5}}\;\; \textbf{end}\\ \textbf{\scriptsize{6}}\;\; V\leftarrow V\oplus (M[\ell]10^\ast)\\ \textbf{\scriptsize{7}}\;\; T\leftarrow \lfloor E_{K_2}(V)\rfloor_t\\ \textbf{\scriptsize{8}}\;\; \textbf{return } T \\\hline \end{array}$

The other lightweight MAC algorithms under evaluation by NIST are Chaskey and TuLp. I’ve already covered Chaskey in a previous post.

All code and details of the algorithm shown here are derived from reference implementations provided by Atul here on github

Any developers that require a reference implementation should use those source codes instead of what’s shown here.

Instead of using PRESENT block cipher, I’ll use SPECK64/128 due to the ARX design which is much easier to implement on x86 architecture.

### Block Cipher Parameters

LightMAC is a mode of operation and depends directly on an external block cipher.

The only other block ciphers that come close to the size of Speck would be XTEA or Chaskey. The following is a list of parameters recommended for use with Speck.

### LightMAC Parameters

The authors define the following in reference material which are based on the block cipher.

• COUNTER_LENGTH
• Length of the protected counter sum in bytes. Not greater than N/2.

• TAG_LENGTH
• Length of tag in bytes. Should be at least 64-bits but not greater than N.

• BLOCK_LENGTH
• Length of block in bytes. Same as N.

• BC_KEY_LENGTH
• Length of block cipher key. The MAC key is twice this length.

The following table shows some example parameters using existing lightweight block ciphers.

Cipher (E) Block Length (N) Cipher Key Length MAC Key Length (K) Counter Length (S) Tag Length (T)
PRESENT 64-bits 128-bits 256-bits 32-bits 64-bits
SPECK 64-bits 128-bits 256-bits 32-bits 64-bits
AES 128-bits 128-bits 256-bits 32-bits 128-bits

### Initialization

In the specification, V denotes an intermediate/local variable of cipher block size N. It is initialized to zero and updated after every encryption using an XOR operation with ciphertext before returning the result in T (truncated if required)

But in my own implementation, I assume T to be of cipher block size N and initialize it to zero. I then update T instead which is why I prefer readers use the reference implementation instead of what’s shown here. 🙂

My reason for not allocating space for V and using T directly is simply to reduce the amount of code required for the assembly code.

### Updating

The update process is very similar to what you see used in cryptographic hash algorithms. I was gonna have a more detailed description here but I think comments should be clear enough.

### Finalization

An end bit (0x80) is appended to M buffer along with any data remaining or none if the input length was a multiple of the block cipher length.

This is then XORed with any previous cipher block state before being encrypted with the 2nd key before returning.

### x86 Assembly code

First, here’s the SPECK block cipher using 64-bit block and 128-bit key.

%define SPECK_RNDS    27
; *****************************************
; Light MAC parameters based on SPECK64-128
;
; N = 64-bits
; K = 128-bits
;
%define COUNTER_LENGTH 4  ; should be <= N/2
%define BLOCK_LENGTH   8  ; equal to N
%define TAG_LENGTH     8  ; >= 64 && <= N
%define BC_KEY_LENGTH 16  ; K

%define ENCRYPT speck64_encryptx
%define LIGHTMAC_KEY_LENGTH BC_KEY_LENGTH*2 ; K*2

%define k0 edi
%define k1 ebp
%define k2 ecx
%define k3 esi

%define x0 ebx
%define x1 edx

; esi = input
; ebp = key

;-----------------------------------------------------------------------
; SPECK64/128 encryption, key schedule computed on the fly.
; In:    esi = 8-byte block (encrypted in place), ebp = 16-byte key
; Round: x = (ROTR32(x, 8) + y) ^ k;  y = ROTL32(y, 3) ^ x
; The three l-words (k1..k3) are rotated each round so the schedule
; update always operates on k0/k1.
; NOTE(review): the caller (lightmac_tagx) relies on ESI/EDI/EBX/ECX/EDX
; surviving this call, which suggests the original wrapped the body in
; PUSHAD/POPAD — confirm against the original sources.
;-----------------------------------------------------------------------
speck64_encryptx:

    push    esi            ; save M (destination for the ciphertext)

    lodsd                  ; x0 = x->w[0]
    xchg    eax, x0
    lodsd                  ; x1 = x->w[1]
    xchg    eax, x1

    mov     esi, ebp       ; esi = key
    lodsd
    xchg    eax, k0        ; k0 = key[0]
    lodsd
    xchg    eax, k1        ; k1 = key[1]
    lodsd
    xchg    eax, k2        ; k2 = key[2]
    lodsd
    xchg    eax, k3        ; k3 = key[3]
    xor     eax, eax       ; i = 0
spk_el:
    ; x0 = (ROTR32(x0, 8) + x1) ^ k0;
    ror     x0, 8
    add     x0, x1         ; fix: addition was missing from the listing
    xor     x0, k0
    ; x1 = ROTL32(x1, 3) ^ x0;
    rol     x1, 3
    xor     x1, x0
    ; k1 = (ROTR32(k1, 8) + k0) ^ i;
    ror     k1, 8
    add     k1, k0         ; fix: addition was missing from the listing
    xor     k1, eax
    ; k0 = ROTL32(k0, 3) ^ k1;
    rol     k0, 3
    xor     k0, k1
    xchg    k3, k2         ; rotate (k1, k2, k3) so each round uses the
    xchg    k3, k1         ; next l-word of the key schedule
    ; i++
    inc     eax
    cmp     al, SPECK_RNDS
    jnz     spk_el

    pop     edi            ; edi = M
    xchg    eax, x0        ; x->w[0] = x0
    stosd
    xchg    eax, x1        ; x->w[1] = x1
    stosd
    ret


Now LightMAC.

You might notice how ctr and idx variables are initialized to zero at the same time using CDQ instruction. Once PUSHAD is executed, it preserves EDX on the stack and is then used as the protected counter sum S.

Although we convert the counter to big endian format before saving in block buffer, it wouldn’t affect the security to skip this. I’ve retained it for compatibility with reference but might remove it later.

; void lightmac_tag(const void *msg, uint32_t msglen,
;     void *tag, void* mkey)
;-----------------------------------------------------------------------
; void lightmac_tag(const void *msg, uint32_t msglen, void *tag, void *mkey)
; LightMAC over SPECK64/128 (ENCRYPT), 32-bit x86 (NASM), cdecl args.
; The PUSHAD below serves two purposes: the bottom 8 bytes of the saved
; register area are reused as the block buffer M (counter at M[0..3],
; message bytes at M[4..7]) and the saved-EDX slot (_edx) holds the
; protected counter sum S (EDX was zeroed by CDQ before the PUSHAD).
; NOTE(review): the [esp+32+4] offset implies a register-saving PUSHAD
; immediately after the labels (with a matching extra POPAD at the end),
; and no `add ebp, BC_KEY_LENGTH` appears before the final ENCRYPT even
; though the comment says K2 — both look like lines lost in
; transcription; confirm against lmx32.asm.
;-----------------------------------------------------------------------
lightmac_tagx:
_lightmac_tagx:
lea     esi, [esp+32+4]; esi = argv (assumes PUSHAD frame, see note)
lodsd
xchg    eax, ebx       ; ebx = msg
lodsd
cdq                    ; ctr = 0, idx = 0,
xchg    eax, ecx       ; ecx = msglen
lodsd
xchg    eax, edi       ; edi = tag
lodsd
xchg    eax, ebp       ; ebp = mkey (K1 || K2)
pushad                 ; allocate N-bytes for M (and park ctr in _edx slot)
; zero initialize T
mov     [edi+0], edx   ; t->w[0] = 0;
mov     [edi+4], edx   ; t->w[1] = 0;
; while we have msg data
lmx_l0:
mov     esi, esp       ; esi = M (reset; ENCRYPT moves it)
jecxz   lmx_l2         ; exit loop if msglen == 0
lmx_l1:
mov     al, [ebx]      ; al = *data++
inc     ebx
mov     [esi+edx+COUNTER_LENGTH], al ; M[idx + 4] = al
inc     edx            ; idx++
; M filled?
cmp     dl, BLOCK_LENGTH - COUNTER_LENGTH
; --msglen; repeat while msglen != 0 and block not yet full
loopne  lmx_l1
jne     lmx_l2         ; message ran out mid-block -> finalize
; add S counter in big endian format
inc     dword[esp+_edx]; ctr++ (counter lives in the saved-EDX slot)
mov     eax, [esp+_edx]
; reset index
cdq                    ; idx = 0
bswap   eax            ; m.ctr = SWAP32(ctr)
mov     [esi], eax     ; M[0..3] = big-endian block counter
; encrypt M with E using K1
call    ENCRYPT
; update T with the ciphertext block (V ^= E_K1(i || M[i]))
lodsd                  ; t->w[0] ^= m.w[0];
xor     [edi+0], eax
lodsd                  ; t->w[1] ^= m.w[1];
xor     [edi+4], eax
jmp     lmx_l0         ; keep going
lmx_l2:
; append the 10* end marker after the remaining message bytes
mov     byte[esi+edx+COUNTER_LENGTH], 0x80
xchg    esi, edi       ; swap T and M (esi = tag, edi = M)
lmx_l3:
; update T with any msg data remaining (idx bytes + the 0x80 pad)
mov     al, [edi+edx+COUNTER_LENGTH]
xor     [esi+edx], al
dec     edx
jns     lmx_l3
; encrypt T with E using K2 (see NOTE above re: key pointer)
call    ENCRYPT
popad                  ; release memory for M
ret


### Summary

The x86 assembly generated by MSVC using /O2 /Os is 238 bytes. The x86 assembly written by hand is 152 bytes.

In order for developers to benefit from LightMAC on microcontrollers, they should choose a lightweight block cipher but not necessarily SPECK. It’s only used here for illustration.

See lmx32.asm and lightmac.c for any future updates.

Posted in assembly, cryptography, encryption, programming | Tagged , , , , , , , , , | Leave a comment

## PRESENT Block Cipher

### Introduction

PRESENT is a 64-bit block cipher published in 2007 which provides support for key lengths of 80 and 128-bits. It was designed specifically for hardware implementation and uses a Substitution Permutation Network (SPN) structure which is similar to AES but at the same time completely different.

The difference is that PRESENT is bit oriented whereas AES is byte oriented. For that reason, PRESENT is not suitable for software implementation although I found the bit-permutation layer interesting and was just curious to see results using x86 assembly.

It was designed by Andrey Bogdanov, Lars R. Knudsen, Gregor Leander, Christof Paar, Axel Poschmann, Matthew J. B. Robshaw, Yannick Seurin, and C. Vikkelsoe.

PRESENT is intended to be used in situations where low-power consumption and high chip efficiency is desired. It has been standardized in ISO-29192 along with CLEFIA which was designed by Sony Corporation.

Today, we’ll focus on the 128-bit key version since it has more potential for future applications than something supporting only 80-bit keys.

I’ve no doubt you can implement PRESENT in 32-bit x86 assembly but because it’s a hardware design, I decided to stick with x86-64 assembly and only implement encryption.

In future blog entries, depending on the design of an algorithm, it will be either 32 or 64-bit assembly and maybe both if suitable but I’ll avoid using MMX which was used in some past entries.

You can find a number of implementations in C and python here.

### Key whitening

A linear operation seen in pretty much every block cipher designed since being used by Ron Rivest in 1984 to strengthen the DES block cipher.

It uses a simple eXclusive OR operation with the key and data.

$\textbf{addRoundKey. } \text{Given round key } K_i=\kappa_{63}^i\dots \kappa_0^i \text{ for } 1\leq i\leq 32 \text{ and current }\\ \textsc{state } b_{63}\dots b_0, \text{ addRoundKey consists of the operation for } 0\leq j\leq 63,$
$b_j\rightarrow b_j\oplus\kappa_j^i.$

### Byte substitution

The non-linear layer is based on a single 4-bit S-box which was designed with hardware optimizations in mind. It replaces each nibble of 64-bits with 4-bit value from predefined table.

$\textbf{sBoxlayer. }\text{The S-box used in }\textsc{present }\text{is a 4-bit to 4-bit S-box } S:\mathbb{F}^4_2 \rightarrow\mathbb{F}_2^4.\\ \text{The action of this box in hexadecimal notation is given by the following table.}$

$\begin{array}{|c||c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|} \hline i & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & A & B & C & D & E & F \\\hline P(i) & C & 5 & 6 & B & 9 & 0 & A & D & 3 & E & F & 8 & 4 & 7 & 1 & 2 \\\hline \end{array}$

Here’s code generated by MSVC

; 62   :     hi.b[0] = sub_byte(hi.b[0]);

movzx edx, cl
mov QWORD PTR hi$[rbp-48], rcx
mov eax, edx
and edx, 15
shr rax, 4
mov cl, BYTE PTR sbox$3697[rbp+rax-48]
shl cl, 4
or  cl, BYTE PTR sbox$3697[rbp+rdx-48]

The assembly code loads a memory pointer for sbox into the RBX register and expects the AL register to hold the value we're replacing with XLATB.

For those of you who don't know, the one-byte instruction XLATB locates a byte entry from a table in memory which is pointed to by the base register (BX). Using the contents of the AL register as a table index, it then copies the contents of the table entry back into the AL register.

lsb:
    pop   rbx              ; rbx = sbox
    mov   cl, al           ; cl = x
    shr   al, 4            ; al = sbox[x >> 4] << 4
    xlatb
    shl   al, 4
    xchg  al, cl
    and   al, 15           ; al |= sbox[x & 0x0F]
    xlatb
    or    al, cl
    pop   rcx
    pop   rbx
    ret
sub_byte:
    push  rbx
    push  rcx
    call  lsb
    ; sbox
    db 0xc, 0x5, 0x6, 0xb, 0x9, 0x0, 0xa, 0xd
    db 0x3, 0xe, 0xf, 0x8, 0x4, 0x7, 0x1, 0x2

### Bit Permutation

A linear operation that permutes 64 bits of the ciphertext state. Each bit is repositioned according to a predefined table.

$\textbf{pLayer. } \text{The bit permutation used in }\textsc{present} \text{ is given by the following table.}\\ \text{Bit i of }\textsc{state } \text{is moved to bit position } P(i).$

$\begin{array}{|c||c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|} \hline i & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 & 11 & 12 & 13 & 14 & 15\\ P(i) & 0 & 16 & 32 & 48 & 1 & 17 & 33 & 49 & 2 & 18 & 34 & 50 & 3 & 19 & 35 & 51 \\\hline \hline i & 16 & 17 & 18 & 19 & 20 & 21 & 22 & 23 & 24 & 25 & 26 & 27 & 28 & 29 & 30 & 31\\ P(i) & 4 & 20 & 36 & 52 & 5 & 21 & 37 & 53 & 6 & 22 & 38 & 54 & 7 & 23 & 39 & 55 \\\hline \hline i & 32 & 33 & 34 & 35 & 36 & 37 & 38 & 39 & 40 & 41 & 42 & 43 & 44 & 45 & 46 & 47\\ P(i) & 8 & 24 & 40 & 56 & 9 & 25 & 41 & 57 & 10 & 26 & 42 & 58 & 11 & 27 & 43 & 59 \\\hline \hline i & 48 & 49 & 50 & 51 & 52 & 53 & 54 & 55 & 56 & 57 & 58 & 59 & 60 & 61 & 62 & 63\\ P(i) & 12 & 28 & 44 & 60 & 13 & 29 & 45 & 61 & 14 & 30 & 46 & 62 & 15 & 31 & 47 & 63 \\\hline \end{array}$

As you might already tell from the table, each bit position increases sequentially by 16 modulo 63.
To obtain the correct bit position during permutation without the use of a table, we can multiply the loop counter by 16 and modulo the result by 63.

The approach I use is to initialize a 32-bit variable R to 0x30201000. Imagine this to be a 4 byte array initialized to 0,16,32,48 which are used for shifting the first 4 bits of ciphertext state. By incrementing each byte of the R variable every round and rotating by 8 bits, this will generate every shift value required for the permutation operation.

Here's assembly output from MSVC for the C code above.

; 83   :     t.q = 0;

    xor   eax, eax

; 84   :     r = 0x30201000;

    mov   r9d, 807407616           ; 30201000H

; 85   :     for (j=0; j<64; j++) {

    xor   ebx, ebx
$LL3@present128@2:

; 86   :       // test bit at j'th position and shift left
; 87   :       t.q |= ((p.q >> j) & 1) << (r & 255);

    mov   rdx, QWORD PTR p$[rbp-16]
    mov   ecx, ebx
    inc   ebx
    shr   rdx, cl
    mov   ecx, r9d
    and   edx, 1
    shl   rdx, cl
    or    rax, rdx

; 88   :       r = ROTR32(r+1, 8);

    inc   r9d
    ror   r9d, 8
    cmp   ebx, 64                  ; 00000040H
    jl    SHORT $LL3@present128@2


One thing C can’t directly do is take advantage of the x86 status flags so the assembly code is much different.

If bit 0 of P variable is 1, a right shift by 1 will set the carry flag. Then we just avoid setting bits of t variable with Jump If Not Carry (JNC)

Bit Test and Set (BTS) will set bit in RBP based on value in CL.

Here’s handwritten assembly taking advantage of status flags and bit manipulation instructions.

;
mov     ecx, 0x30201000  ; r   = 0x30201000;
xor     ebp, ebp         ; t.q = 0;
xor     ebx, ebx         ; j   = 0;
pse_l2:
shr     rax, 1           ; CF = (p.q >> j) & 1
jnc     pse_l3           ; no bit carried
bts     rbp, rcx         ; set bit in rbp at position in cl
pse_l3:
inc     cl               ; r = ROTR32(r+1, 8);
ror     ecx, 8

add     bl, 4            ; j++, j < 64
jne     pse_l2


The only other thing to mention is how the j counter represented by the BL register is updated. Notice how it’s being increased by 4 instead of 1.

64 x 4 = 256 so once we’ve permutated 64-bits of ciphertext, BL register will be zero again, hence the use of Jump If Not Equal (JNE) which is testing the Zero Flag (ZF)

While describing the loop above, I rewrote the code to use the following instead which takes advantage of ECX being zero after the loop you can see in full routine.

;
mov     ebx, 0x30201000  ; r   = 0x30201000;
xor     ebp, ebp
mov     cl, 64
pse_l2:
shr     rax, 1           ; CF = (p.q >> j) & 1
jnc     pse_l3           ; no bit carried
bts     rbp, rbx         ; set bit in rbp at position in bl
pse_l3:
inc     bl               ; r = ROTR32(r+1, 8);
ror     ebx, 8
loop    pse_l2


### Key scheduling

Function creates 32 64-bit subkeys using linear/non-linear operations from a 128-bit key.

; rcx = present_ctx
; rdx = key
;-----------------------------------------------------------------------
; void present128_setkeyx(present_ctx *ctx, const void *key)
; PRESENT-128 key schedule: derives PRESENT_RNDS 64-bit subkeys.
; In:  rcx = ctx, rdx = 128-bit key
; Per round: store the top 64 bits as the subkey, XOR the round counter
; into the key register, rotate the whole 128-bit register (hi:lo) left
; by 61 bits, and apply the S-box to the top byte (top two nibbles).
; The counter value (rnd+1) is XORed into lo bits 1..5 BEFORE the
; rotate; those bits land at key bits 62..66 afterwards, matching the
; specification's k66..k62 injection.
;-----------------------------------------------------------------------
present128_setkeyx:
push    rsi
push    rdi
push    rbp
push    rbx

push    rcx
pop     rdi              ; rdi = ctx

mov     rax, [rdx+8]     ; hi = key[1] (bits 127..64)
mov     rdx, [rdx  ]     ; lo = key[0] (bits  63..0)

xor     ecx, ecx         ; rnd = 0
psk:
stosq                    ; ctx->key[rnd] = hi.q;

lea     ebx, [rcx*2+2]   ; lo.q  ^= (rnd + rnd) + 2;  i.e. (rnd+1) << 1
xor     rdx, rbx

push    rax              ; t.q    = hi.q;
pop     rbx
push    rdx              ; rbp = lo.q (scratch for the rotate)
pop     rbp

; 128-bit rotate left by 61 == each half keeps its low 3 bits on top
; and takes the other half shifted right by 3
shl     rax, 61          ; hi.q   = (hi.q & 7) << 61;
shr     rbp, 3           ; hi.q  |= (lo.q >> 3);
or      rax, rbp

shl     rdx, 61          ; lo.q    = (lo.q & 7) << 61;
shr     rbx, 3           ; lo.q   |= ( t.q >> 3);
or      rdx, rbx

rol     rax, 8           ; bring the top byte of hi.q into AL
call    sub_byte         ; S-box both nibbles of that byte
ror     rax, 8           ; and put it back
inc     cl
cmp     cl, PRESENT_RNDS
jne     psk
jmp     pse_l4           ; tail-jump into the shared epilogue of
                         ; present128_encryptx (restores regs, RET)


### Assembly

; rcx = present_ctx
; rdx = buf
;-----------------------------------------------------------------------
; void present128_encryptx(present_ctx *ctx, void *buf)
; PRESENT-128 encryption of one 64-bit block, x86-64 (NASM).
; In:  rcx = ctx (PRESENT_RNDS subkeys from present128_setkeyx)
;      rdx = buf (8-byte block, encrypted in place)
; Each round: addRoundKey, sBoxLayer (per byte via sub_byte), pLayer.
; pLayer trick: EBX starts at 0x30201000 — four packed shift amounts
; 0,16,32,48. INC BL / ROR EBX,8 generates every target bit position
; (16*i mod 63 pattern); BTS deposits the carried-out bit there.
; The pse_l4 label is a shared epilogue also used by present128_setkeyx.
;-----------------------------------------------------------------------
present128_encryptx:
push    rsi
push    rdi
push    rbp
push    rbx

push    rcx              ; rdi = present_ctx
pop     rdi

mov     rax, [rdx]       ; p.q = buf[0]

push    PRESENT_RNDS-1
pop     rcx              ; rcx = number of full rounds
pse_l0:
xor     rax, [rdi]       ; p.q ^= ctx->key[i]
add     rdi, 8           ; fix: advance to the next subkey (the listing
                         ; never moved RDI, contradicting its comments)
push    rcx              ; save i
mov     cl, 8            ; j = 0
pse_l1:
call    sub_byte         ; p.b[j] = sub_byte(p.b[j]);
ror     rax, 8           ; p.q    = ROTR64(s.q, 8);
loop    pse_l1           ; j++ < 8
; pLayer
mov     ebx, 0x30201000  ; r   = 0x30201000;
xor     ebp, ebp         ; t.q = 0
mov     cl, 64           ; fix: 64 bits to permute (ECX is 0 after the
                         ; LOOP above, as described in the article text)
pse_l2:
shr     rax, 1           ; CF = (p.q >> j) & 1
jnc     pse_l3           ; no bit carried
bts     rbp, rbx         ; set bit in rbp at position in bl
pse_l3:
inc     bl               ; r = ROTR32(r+1, 8);
ror     ebx, 8
loop    pse_l2           ; fix: was JNE, which never terminates — CL
                         ; counts the 64 bits (matches the snippet above)
xchg    rax, rbp         ; p.q = t.q
pop     rcx
loop    pse_l0

; post whitening
xor     rax, [rdi]  ; p.q ^= ctx->key[PRESENT_RNDS-1];
mov     [rdx], rax  ; buf[0] = p.q
pse_l4:
pop     rbx
pop     rbp
pop     rdi
pop     rsi
ret


### Summary

PRESENT is clearly intended for hardware and not terribly efficient in software but has some interesting features/ideas that could be used elsewhere.

The size of x86-64 assembly is approx. 188 bytes.

Thanks to 0x4d_ for $\LaTeX$ formulas.

## LEA Block Cipher

### Introduction

LEA is a 128-bit block cipher with support for 128, 192 and 256-bit keys published in 2014. It was designed by Deukjo Hong, Jung-Keun Lee, Dong-Chan Kim, Daesung Kwon, Kwon Ho Ryu, and Dong-Geon Lee.

The only operations used for encryption and the key schedule are 32-bit Addition, eXclusive OR and Rotation (ARX structure): the designers state “the usage of 32-bit and 64-bit processors will grow rapidly compared to 8-bit and 16-bit ones”

Today I’ll just focus on an implementation using 128-bit keys referred to as LEA-128. This just about fits onto the 32-bit x86 architecture. The 256-bit version requires additional registers and is probably better suited for 64-bit mode.

### Key Schedule

During generation of subkeys, a number of predefined constants are used.

$\textbf{Constants. }$
$\text{The key schedule uses several constants for generating round keys, which are defined as}$

$\delta[0] = 0xc3efe9db, \quad \delta[1] = 0x44626b02,$
$\delta[2] = 0x79e27c8a, \quad \delta[3] = 0x78df30ec,$
$\delta[4] = 0x715ea49e, \quad \delta[5] = 0xc785da0a,$
$\delta[6] = 0xe04ef22a, \quad \delta[7] = 0xe5c40957.$

$\text{They are obtained from hexadecimal expression of } \sqrt{766965},$
$\text{ where 76, 69 and 65 are ASCII codes of 'L', 'E' and 'A'.}$

You can obtain the values using a tool like speedcrunch.

There are 3 different key schedule functions but I only focus on the 128-bit variant for now.

$\textbf{Key Schedule with a 128-Bit Key. }$
$\text{Let } K = (K[0], K[1], K[2], K[3]) \text{ be a 128-bit key. We set } T[i] = K[i]\: for \: 0\leq i < 4. \text{ Round keys } RK_i = (RK_i[0], RK_i[1],\dots , RK_i[5])\: for\: 0\leq i < 24 \text{ are produced through the following relations:}$

$T[0]\leftarrow ROL_1(T[0]\oplus ROL_i(\delta[i\:mod\: 4])),\\ T[1]\leftarrow ROL_3(T[1]\oplus ROL_{i+1}(\delta[i\:mod\: 4])),\\ T[2]\leftarrow ROL_6(T[2]\oplus ROL_{i+2}(\delta[i\:mod\: 4])),\\ T[3]\leftarrow ROL_{11}(T[3]\oplus ROL_{i+3}(\delta[i\:mod\: 4])),\\ RK_i\leftarrow (T[0], T[1], T[2], T[1], T[3], T[1]).$

### Encryption

This is a really simple algorithm to implement and there’s not much to say that can’t be found in the specification published by the authors.

So here’s the function in C to perform encryption on 128-bits of plaintext using 128-bit key in one single call. I’d suggest using something like this with counter (CTR) mode so it doesn’t require the decryption procedure.

// LEA-128 block encryption: 24 rounds over a 128-bit block with round
// keys derived on the fly from a 128-bit key (no precomputed schedule).
// Requires the file-level ROTR32 macro. Intended for use in modes that
// need only the forward direction (e.g. CTR).
void lea128_encryptx(void *key, void *data) {
    uint32_t *blk = (uint32_t*)data;
    uint32_t *kw  = (uint32_t*)key;
    uint32_t rk0, rk1, rk2, rk3;
    uint32_t b0, b1, b2, b3;
    uint32_t d, tmp, r;
    uint32_t delta[4];

    // delta constants; delta[1..3] are stored pre-rotated left by 1, 2
    // and 3 bits respectively, so each round only rotates by a fixed 4
    delta[0] = 0xc3efe9db; delta[1] = 0x88c4d604;
    delta[2] = 0xe789f229; delta[3] = 0xc6f98763;

    rk0 = kw[0]; rk1 = kw[1]; rk2 = kw[2]; rk3 = kw[3];
    b0  = blk[0]; b1 = blk[1]; b2 = blk[2]; b3 = blk[3];

    for (r = 0; r < 24; r++) {
        d            = delta[r & 3];
        delta[r & 3] = ROTR32(d, 28);       // advance the constant (ROL 4)

        // key schedule step: T rotations are ROL 1, 3, 6, 11
        rk0 = ROTR32(rk0 + d, 31);
        rk1 = ROTR32(rk1 + ROTR32(d, 31), 29);
        rk2 = ROTR32(rk2 + ROTR32(d, 30), 26);
        rk3 = ROTR32(rk3 + ROTR32(d, 29), 21);

        // round function; rk1 serves as RK[1], RK[3] and RK[5]
        tmp = b0;
        b0  = ROTR32((b0 ^ rk0) + (b1 ^ rk1), 23);
        b1  = ROTR32((b1 ^ rk2) + (b2 ^ rk1),  5);
        b2  = ROTR32((b2 ^ rk3) + (b3 ^ rk1),  3);
        b3  = tmp;
    }
    blk[0] = b0; blk[1] = b1;
    blk[2] = b2; blk[3] = b3;
}


### Full x86 assembly

You might notice the constants are different from C sources. For whatever reason, the last 3 are rotated a number of bits left before entering the encryption loop as you can see above.

Obviously a compiler will be smart enough to see this and automatically optimize but for assembly code, we must rotate them manually. They’re stored on the stack using PUSHAD. So, EDI, ESI, EBP and ESP are used for TD array.

ESP has to be initialized after the PUSHAD for obvious reasons. We don’t want to cause an exception.

bits 32

; Layout of the 32 bytes pushed by PUSHAD, lowest address first.
; PUSHAD pushes EAX,ECX,EDX,EBX,ESP,EBP,ESI,EDI, so EDI lands at [esp].
; The opening "struc" line was missing, leaving a stray "endstruc".
struc pushad_t
_edi resd 1
_esi resd 1
_ebp resd 1
_esp resd 1
_ebx resd 1
_edx resd 1
_ecx resd 1
_eax resd 1
.size:
endstruc

; round-key words live in registers
%define k0 ebx
%define k1 edx
%define k2 edi
%define k3 ebp

; block words are operated on in place via esi
%define x0 dword[esi+4*0]
%define x1 dword[esi+4*1]
%define x2 dword[esi+4*2]
%define x3 dword[esi+4*3]

%define t0 ecx

%define LEA128_RNDS 24

; ---------------------------------------------------------------
; lea128_encryptx(void *key, void *data) -- cdecl
; Encrypts the 16-byte block at data in place with the 16-byte key.
; Two PUSHADs: the first saves registers, the second doubles as the
; td[] constant array on the stack (hence args at [esp+64+...]).
; ---------------------------------------------------------------
lea128_encryptx:
_lea128_encryptx:
pushad                   ; save registers
; initialize first 3 constants in registers; PUSHAD then stores them
; into the _edi/_esi/_ebp slots of a second frame used as td[]
mov    edi, 0xc3efe9db   ; td[0]
mov    esi, 0x88c4d604   ; td[1]
mov    ebp, 0xe789f229   ; td[2]
pushad                   ; td[0..2] -> [esp+_edi.._ebp]
; the ESP slot can only be written after the PUSHAD
mov    dword[esp+_esp], 0xc6f98763   ; td[3]
mov    esi, [esp+64+4]   ; esi = key
lodsd
xchg   eax, k0
lodsd
xchg   eax, k1
lodsd
xchg   eax, k2
lodsd
xchg   eax, k3
mov    esi, [esp+64+8]   ; esi = block
xor    eax, eax          ; i = 0
lea_l0:
push   eax               ; save round counter
; t0 = c[i % 4];
and    al, 3
mov    t0, [esp+eax*4+4]
; **************************************
; create sub key
; **************************************
; k0 = ROTR32(k0 + t0, 31);
add    k0, t0
ror    k0, 31
; k1 = ROTR32(k1 + ROTR32(t0, 31), 29);
rol    t0, 1             ; t0 = ROTR32(t0, 31)
add    k1, t0
rol    k1, 3             ; = ROTR32(k1, 29)
; k2 = ROTR32(k2 + ROTR32(t0, 30), 26);
rol    t0, 1             ; t0 = ROTR32(t0, 30)
add    k2, t0
rol    k2, 6             ; = ROTR32(k2, 26)
; k3 = ROTR32(k3 + ROTR32(t0, 29), 21);
rol    t0, 1             ; t0 = ROTR32(t0, 29)
add    k3, t0
rol    k3, 11            ; = ROTR32(k3, 21)
; c[i & 3] = ROTR32(c[i & 3], 28);
rol    t0, 1             ; net ROTL32 by 4 of the loaded constant
mov    [esp+eax*4+4], t0
; **************************************
; encrypt block
; **************************************
; t = x0; (parked on the stack)
push   x0
; x0 = ROTR32((x0 ^ k0) + (x1 ^ k1), 23);
mov    t0, x1
xor    x0, k0
xor    t0, k1
add    x0, t0
ror    x0, 23
; x1 = ROTR32((x1 ^ k2) + (x2 ^ k1), 5);
mov    t0, x2
xor    x1, k2
xor    t0, k1
add    x1, t0
ror    x1, 5
; x2 = ROTR32((x2 ^ k3) + (x3 ^ k1), 3);
mov    t0, x3
xor    x2, k3
xor    t0, k1
add    x2, t0
ror    x2, 3
; x3 = t; (the original x0)
pop    x3
; i++;
pop    eax
inc    eax
; i<LEA128_RNDS
cmp    al, LEA128_RNDS
jnz    lea_l0
; release td[] frame, restore registers
popad
popad
ret


### Summary

The assembly generated by MSVC using /O2 /Os is 232 bytes for the single call (encryption only) and 266 for 2 separate functions. The hand written x86 assembly for just the single call is currently 149 bytes.

See lx.asm for x86 assembly or lea.c for C source

Thanks to 0x4d_ for $\LaTeX$ formulas.

Posted in assembly, cryptography, encryption, programming | Tagged , , , | 1 Comment

## SM3 Cryptographic Hash Algorithm (Chinese Standard)

### Introduction

In December of 2007, the Chinese National Cryptographic Administration Bureau released the specification of a Trusted Cryptography Module, detailing a cryptoprocessor to be used within the Trusted Computing framework in China.

The module specifies a set of cryptographic algorithms that include the SM4 block cipher, the SM2 asymmetric algorithm and SM3 cryptographic hash algorithm.

I’ve discussed SM4 in an earlier post here.

SM3 uses a Merkle-Damgard construction similar to MD5 or SHA-2 that processes 512-bit input message blocks and returns a 256-bit hash value.

It was designed by Mathematician and cryptographer Xiaoyun Wang who is responsible for discovering attacks against many cryptographic hash functions, most notably MD5 and SHA-1.

At the CRYPTO 2004 conference, she and co-authors demonstrated collision attacks against MD5 and SHA-0. Attacks against SHA-1 were published later in 2005.

Unfortunately there’s no explanation for how the changes made to SHA-2 strengthen it against attacks so I’ll briefly cover some of the minor differences between SM3 and SHA-2

### Initialization

In the original SHA-2 specification, the 8 initialization values are fractional parts of square roots for the first 8 primes 2..19). See here for more detailed description on how to create these.

$H_0^{(0)}\: =\: 6a09e667$
$H_1^{(0)}\: =\: bb67ae85$
$H_2^{(0)}\: =\: 3c6ef372$
$H_3^{(0)}\: =\: a54ff53a$
$H_4^{(0)}\: =\: 510e527f$
$H_5^{(0)}\: =\: 9b05688c$
$H_6^{(0)}\: =\: 1f83d9ab$
$H_7^{(0)}\: =\: 5be0cd19$

In SM3, the following 8 values are used:

$H_0^{(0)}\: =\: 7380166f$
$H_1^{(0)}\: =\: 4914b2b9$
$H_2^{(0)}\: =\: 172442d7$
$H_3^{(0)}\: =\: da8a0600$
$H_4^{(0)}\: =\: a96f30bc$
$H_5^{(0)}\: =\: 163138aa$
$H_6^{(0)}\: =\: e38dee4d$
$H_7^{(0)}\: =\: b0fb0e4e$

Unfortunately, there’s no explanation for why or how these values were selected.

// Initialize an SM3 context: load the eight standard initial state
// words and reset the processed-message length to zero.
void SM3_Init (SM3_CTX *c) {
    static const uint32_t iv[8] = {
        0x7380166f, 0x4914b2b9, 0x172442d7, 0xda8a0600,
        0xa96f30bc, 0x163138aa, 0xe38dee4d, 0xb0fb0e4e };
    int i;

    for (i = 0; i < 8; i++) {
        c->s.w[i] = iv[i];
    }
    c->len = 0;
}


### Update and Finalization

The update and final processing are exactly the same as SHA-2 which is based on the original design for MD4 by Ron Rivest. I will not explain here since the main changes take place inside the compression function.

### Message Expansion

In SHA-2, the 512-bit message is expanded into 64 32-bit words. For SM3, it’s expanded into 68 32-bit words. The following 2 linear functions are defined:

$P_1$ for expansion, $P_0$ for compression

$P_0(X) = X\oplus (X\lll 9)\oplus (X\lll 17)$
$P_1(X) = X\oplus (X\lll 15)\oplus (X\lll 23)$

$\text{for}\: i=16\: \text{to}\: 67\\ \hphantom{---}W_i\leftarrow P_1(W_{i-16}\oplus W_{i-9}\oplus(W_{i-3}\lll 15))\oplus (W_{i-13}\lll 7)\oplus W_{i-6}\\ \text{endfor}$

$\text{for}\: i = 0\: \text{to}\: 63\\ \hphantom{---}W_i'=W_i\oplus W_{i+4}\\ \text{endfor}$

// NOTE(review): standalone excerpt — these two loops also appear inside
// SM3_Transform further down; i, w[], x, y are declared there.

// load data in big endian format
for (i=0; i<16; i++) {
w[i] = SWAP32(ctx->buf.w[i]);
}

// expand message
// W[i] = P1(W[i-16] ^ W[i-9] ^ (W[i-3] <<< 15)) ^ (W[i-13] <<< 7) ^ W[i-6]
for (i=16; i<68; i++) {
x = ROTL32(w[i- 3], 15);
y = ROTL32(w[i-13],  7);

x ^= w[i-16];
x ^= w[i- 9];
y ^= w[i- 6];

w[i] = P1(x) ^ y;
}


The additional 4 words in SM3 are required for a separate expansion that I’ve inlined with the compression function to try minimize code usage.

$W_i'=W_i\oplus W_{i+4}\: 0\leq i < 64.$

You can see in main compression loop while creating TT1 and TT2.

### Compression

For SHA-2, 64 words are used during compression which are described here in detail.

Both use the same sequence of sixty-four constant 32-bit words, K0{256}, K1{256}, …,K63{256} These words represent the first thirty-two bits of the fractional parts of the cube roots of the first sixty-four prime numbers. In hex, these constant words are (from left to right)

For SM3, only 2 constants are used. One is selected depending on the position of loop counter, then a circular shift by loop counter is applied before adding into the state. This certainly helps make SM3 more compact than SHA-2 although I can’t say if it strengthens in some way.

$T_i = \begin{cases} 79cc4519 & 0\leq i \leq 15 \\[1ex] 7a879d8a & 16\leq i \leq 63 \end{cases}$

There are 3 boolean operations used during the compression which are similar to bitwise majority MAJ and bitwise choice CH operations in SHA-2.

$FF_i(X,Y,Z) = \begin{cases} X\oplus Y\oplus Z & 0\leq i \leq 15 \\[1ex] (X\wedge Y)\vee (X\wedge Z)\vee (Y\wedge Z) & 16\leq i \leq 63 \end{cases}$

$GG_i(X,Y,Z) = \begin{cases} X\oplus Y\oplus Z & 0\leq i \leq 15 \\[1ex] (X\wedge Y) \vee (\neg X \wedge Z) & 16\leq i \leq 63 \end{cases}$

The GG operation is changed to use less instructions, the original is.

The new one along with others.

$ABCDEFGH\leftarrow V^{(i)}$
\begin{aligned} \text{for}\; & i= 0\; \text{to}\: 63\\ & SS_1\leftarrow ((A\lll 12)+E+(T_i\lll i))\lll 7\\ & SS_2\leftarrow SS_1\oplus(A\lll 12)\\ & TT_1\leftarrow FF_i(A,B,C)+D+SS_2+W_i'\\ & TT_2\leftarrow GG_i(E,F,G)+H+SS_1+W_i\\ & D\leftarrow C\\ & C\leftarrow B\lll 9\\ & B\leftarrow A\\ & A\leftarrow TT_1\\ & H\leftarrow G\\ & G\leftarrow F\lll 19\\ & F\leftarrow E\\ & E\leftarrow P_0(TT_2)\\ \text{endfor}&\; \end{aligned}

### Updating state

At the end of compression function in SHA-2, the input hash is updated by adding the output from compression.

$H_0^{(i)}=a+H_0^{(i-1)}$
$H_1^{(i)}=b+H_1^{(i-1)}$
$H_2^{(i)}=c+H_2^{(i-1)}$
$H_3^{(i)}=d+H_3^{(i-1)}$
$H_4^{(i)}=e+H_4^{(i-1)}$
$H_5^{(i)}=f+H_5^{(i-1)}$
$H_6^{(i)}=g+H_6^{(i-1)}$
$H_7^{(i)}=h+H_7^{(i-1)}$

In SM3, it is instead XOR’d which follows a Davies-Meyer idea for compression functions.

$H_0^{(i)}=a\oplus H_0^{(i-1)}$
$H_1^{(i)}=b\oplus H_1^{(i-1)}$
$H_2^{(i)}=c\oplus H_2^{(i-1)}$
$H_3^{(i)}=d\oplus H_3^{(i-1)}$
$H_4^{(i)}=e\oplus H_4^{(i-1)}$
$H_5^{(i)}=f\oplus H_5^{(i-1)}$
$H_6^{(i)}=g\oplus H_6^{(i-1)}$
$H_7^{(i)}=h\oplus H_7^{(i-1)}$

The output state from compression is XORed with the previous hash value to produce the next hash value. In the first round, when there is no previous hash value, a set of pre-specified initial values is used instead.

$H_i=E_{m_i}(H_{i-1})\oplus H_{i-1}.$

Here’s the full compression function.

// SM3 compression function. Mixes the 64-byte block in ctx->buf into
// the 256-bit hash state ctx->s over 64 rounds, then xors the result
// back into the state (Davies-Meyer style feed-forward).
void SM3_Transform (SM3_CTX *ctx)
{
uint32_t tt1, tt2, i, t, ss1, ss2, x, y;
uint32_t w[68], s[8];   // expanded message schedule and working state

// aliases for the eight working variables
#define a s[0]
#define b s[1]
#define c s[2]
#define d s[3]
#define e s[4]
#define f s[5]
#define g s[6]
#define h s[7]

// load state into local buffer
memcpy((uint8_t*)&s[0], (uint8_t*)&ctx->s.w[0], 8*4);

// load data in big endian format
for (i=0; i<16; i++) {
w[i] = SWAP32(ctx->buf.w[i]);
}

// expand message
// W[i] = P1(W[i-16] ^ W[i-9] ^ (W[i-3] <<< 15)) ^ (W[i-13] <<< 7) ^ W[i-6]
for (i=16; i<68; i++) {
x = ROTL32(w[i- 3], 15);
y = ROTL32(w[i-13],  7);

x ^= w[i-16];
x ^= w[i- 9];
y ^= w[i- 6];

w[i] = P1(x) ^ y;
}

// compression function
for (i=0; i<64; i++) {
// round constant: T1 for rounds 0..15, T2 for 16..63
t  = (i < 16) ? 0x79cc4519 : 0x7a879d8a;

// SS1 = ((a <<< 12) + e + (t <<< i)) <<< 7; SS2 = SS1 ^ (a <<< 12)
// NOTE(review): for i >= 32 this relies on ROTL32 reducing the count
// mod 32 — confirm the macro's definition.
ss2 = ROTL32(a, 12);
ss1 = ROTL32(ss2 + e + ROTL32(t, i), 7);
ss2 ^= ss1;

// w[i] ^ w[i+4] is W'[i], computed inline instead of a second array
tt1 = d + ss2 + (w[i] ^ w[i+4]);
tt2 = h + ss1 + w[i];

if (i < 16) {
// rounds 0..15: both boolean functions are plain xor
tt1 += F(a, b, c);
tt2 += F(e, f, g);
} else {
// rounds 16..63: FF = majority, GG = choice
tt1 += FF(a, b, c);
tt2 += GG(e, f, g);
}
// rotate the working variables
d = c;
c = ROTL32(b, 9);
b = a;
a = tt1;
h = g;
g = ROTL32(f, 19);
f = e;
e = P0(tt2);
}

// Davies–Meyer idea for compression function
for (i=0; i<8; i++) {
ctx->s.w[i] ^= s[i];
}
#undef a
#undef b
#undef c
#undef d
#undef e
#undef f
#undef g
#undef h
}


And assembly just to illustrate

; working variables a..h alias the local state copy that
; _SM3_Transformx keeps at [edi] (the bottom of its stack buffer)
%define _a dword[edi+0*4]
%define _b dword[edi+1*4]
%define _c dword[edi+2*4]
%define _d dword[edi+3*4]
%define _e dword[edi+4*4]
%define _f dword[edi+5*4]
%define _g dword[edi+6*4]
%define _h dword[edi+7*4]

; SM3 compression function (x86).
; in: ebx = context base, hash state at [ebx+state]
; locals: s[8] at [esp], w[68] at [esp+32] (512-byte frame)
_SM3_Transformx:

lea     esi, [ebx+state]
push    esi  ; save for later

; allocate 512 bytes
push    64
pop     ecx
shl     ecx, 3
sub     esp, ecx

; load state into local buffer
mov     edi, esp
push    8
pop     ecx
rep     movsd

; store message in big endian format
mov     cl, 16
s3t_l0:
lodsd
bswap   eax
stosd
loop    s3t_l0

; expand message
mov     cl, 68-16
s3t_l1:
; x = ROTL32(w[i-3], 15);
mov     eax, [edi- 3*4]
rol     eax, 15
; y = ROTL32(w[i-13], 7);
mov     ebx, [edi-13*4]
rol     ebx, 7
; x ^= w[i-16];
xor     eax, [edi-16*4]
; x ^= w[i-9];
xor     eax, [edi- 9*4]
; y ^= w[i-6];
xor     ebx, [edi- 6*4]
; x ^= ROTL32(x, 15) ^ ROTL32(x, 23);
mov     edx, eax
rol     edx, 15
xor     eax, edx
rol     edx, 23-15
xor     eax, edx
; x ^= y;
xor     eax, ebx
; w[i] = x;
stosd
loop    s3t_l1

; compress message (ecx = i = 0 after the loop above)
mov     edi, esp
s3t_l2:
; t  = (i < 16) ? 0x79cc4519 : 0x7a879d8a;
cmp     ecx, 16
sbb     eax, eax         ; eax = (i < 16) ? -1 : 0
and     eax, 0x79cc4519 - 0x7a879d8a
add     eax, 0x7a879d8a  ; select the round constant
; ss2 = ROTL32(a, 12);
mov     ebx, _a
rol     ebx, 12
; ss1 = ROTL32(ss2 + e + ROTL32(t, i), 7);
rol     eax, cl          ; ROTL32(t, i) (cpu reduces count mod 32)
add     eax, _e
add     eax, ebx
rol     eax, 7
; ss2 ^= ss1;
xor     ebx, eax
; tt1 = d + ss2 + (w[i] ^ w[i+4]);
mov     ebp, eax         ; save ss1
mov     esi, [edi+4*ecx+32]       ; esi = w[i]
mov     edx, esi         ; save w[i]
xor     esi, [edi+4*ecx+32+16]    ; esi ^= w[i+4]
lea     eax, [esi+ebx]   ; tt1 = ss2 + (w[i] ^ w[i+4])
add     eax, _d          ; tt1 += d
; tt2 = h + ss1 + w[i];
lea     ebx, [edx+ebp]
add     ebx, _h          ; tt2 += h
; if (i < 16) {
cmp     ecx, 16
jae     s3t_l3
; tt1 += F(a, b, c);
mov     edx, _c
xor     edx, _b
xor     edx, _a
add     eax, edx
; tt2 += F(e, f, g);
mov     edx, _g
xor     edx, _f
xor     edx, _e
add     ebx, edx
jmp     s3t_l4
s3t_l3:
; tt1 += FF(a, b, c);  (majority)
mov     edx, _b
or      edx, _a
mov     ebp, _b
and     edx, _c
and     ebp, _a
or      edx, ebp
add     eax, edx
; tt2 += GG(e, f, g);  (choice: ((f ^ g) & e) ^ g)
mov     edx, _g
xor     edx, _f
and     edx, _e
xor     edx, _g
add     ebx, edx
s3t_l4:
; d = c;
mov     edx, _c
mov     _d, edx
; c = ROTL32(b, 9);
mov     edx, _b
rol     edx, 9
mov     _c, edx
; b = a;
mov     edx, _a
mov     _b, edx
; a = tt1;
mov     _a, eax
; h = g;
mov     edx, _g
mov     _h, edx
; g = ROTL32(f, 19);
mov     edx, _f
rol     edx, 19
mov     _g, edx
; f = e;
mov     edx, _e
mov     _f, edx
; e = P0(tt2);
; e = x ^ ROTL32(x,  9) ^ ROTL32(x, 17)
mov     edx, ebx
rol     edx, 9
xor     ebx, edx
rol     edx, 17-9
xor     ebx, edx
mov     _e, ebx

inc     ecx
cmp     ecx, 64
jnz     s3t_l2

; release the 512-byte frame (ecx = 64 here)
mov     esi, esp
lea     esp, [esi+ecx*8]

; ctx->s.w[i] ^= s[i]  (Davies-Meyer feed-forward)
pop     edi
mov     cl, 8
s3t_l5:
lodsd
xor     eax, [edi]
stosd
loop    s3t_l5
ret


### Summary

Size of C generated assembly using /O2 /Os switches is approx. 609 bytes. The x86 assembly is approx. 470 bytes. See sources here.

Thanks to 0x4d_ for $\LaTeX$ equations.

### Introduction

The Chaskey cipher is a 128-bit block, 128-bit key symmetric encryption algorithm which is the underlying function used for the Chaskey Message Authentication Code (MAC).

It’s based on an Even-Mansour construction which makes it very simple to implement and because of its permutation function derived from SipHash, using only ARX instructions makes it highly suitable for many architectures.

Shimon Even and Yishay Mansour published a paper in 1997 titled A Construction of a Cipher From a Single Pseudorandom Permutation which suggested an incredibly simple but provably secure design for a cryptographic algorithm.

Key Whitening, a technique invented by Ron Rivest in 1984 is performed before and after the application of F which is a publicly known permutation function.

Key Whitening consists of using a simple XOR operation of the key with data which is intended to increase resistance to brute force attack and is used in many modern ciphers today.

### F function

The permutation function uses 16 rounds of ADD/ROL/XOR (ARX) instructions for encryption and is derived from the SipHash algorithm.

SipHash is a fast short-input Pseudo-Random-Function(PRF) designed and published in 2012 by Jean-Philippe Aumasson and Daniel J. Bernstein.

The decryption of ciphertext is simply reversing the permutation function using SUB/ROR/XOR.

// Chaskey block operation built on an Even-Mansour construction:
// whiten with the key, apply a 16-round ARX permutation (forwards or
// in reverse), whiten again.
//
// enc : nonzero = encrypt, zero = decrypt (inverse permutation)
// key : 128-bit key (4 x uint32_t)
// buf : 128-bit block, transformed in place
//
// Fallback rotate macros so this unit is self-contained; the project
// headers normally provide them.
#ifndef ROTL32
#define ROTL32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
#endif
#ifndef ROTR32
#define ROTR32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))
#endif

void chas_encrypt(int enc, void *key, void *buf)
{
    int      i;
    uint32_t *v=(uint32_t*)buf;
    uint32_t *k=(uint32_t*)key;

    // pre-whiten
    for (i=0; i<4; i++) {
        v[i] ^= k[i];
    }

    // apply permutation function (16 rounds = Chaskey-LTS)
    // BUG FIX: the branch condition was missing, leaving a bare block
    // followed by "else" — encrypt when enc is nonzero, as the asm does.
    for (i=0; i<16; i++) {
        if (enc) {
            // forward round (ADD/ROL/XOR)
            v[0] += v[1];
            v[1]=ROTL32(v[1], 5);
            v[1] ^= v[0];
            v[0]=ROTL32(v[0],16);
            v[2] += v[3];
            v[3]=ROTL32(v[3], 8);
            v[3] ^= v[2];
            v[0] += v[3];
            v[3]=ROTL32(v[3],13);
            v[3] ^= v[0];
            v[2] += v[1];
            v[1]=ROTL32(v[1], 7);
            v[1] ^= v[2];
            v[2]=ROTL32(v[2],16);
        } else {
            // inverse round (SUB/ROR/XOR)
            v[2]=ROTR32(v[2],16);
            v[1] ^= v[2];
            v[1]=ROTR32(v[1], 7);
            v[2] -= v[1];
            v[3] ^= v[0];
            v[3]=ROTR32(v[3],13);
            v[0] -= v[3];
            v[3] ^= v[2];
            v[3]=ROTR32(v[3], 8);
            v[2] -= v[3];
            v[0]=ROTR32(v[0],16);
            v[1] ^= v[0];
            v[1]=ROTR32(v[1], 5);
            v[0] -= v[1];
        }
    }
    // post-whiten
    for (i=0; i<4; i++) {
        v[i] ^= k[i];
    }
}


The assembly is straight forward. We load buffer into ESI, key into EDI and enc into ECX. Load 4 32-bit registers with 128-bit data, apply pre-whitening with 128-bit key. Test ECX for zero, then save flag status with PUSHFD. This then frees ECX to use as a loop counter which is set to 16 (for LTS).

After each round of permutation, restore the flag status with POPFD and keep looping until ECX is zero. Finally apply post-whitening using 128-bit key, save and return.

%define v0 eax
%define v1 ebx
%define v2 edx
%define v3 ebp

; chas_encryptx(int enc, void *key, void *buf) -- cdecl
; Chaskey-LTS: 16 rounds; enc nonzero = encrypt, zero = decrypt.
chas_encryptx:
_chas_encryptx:
pushad                    ; save registers (args then sit at [esp+32+...])
lea     esi, [esp+32+4]   ; esi = &arg1
lodsd
xchg    ecx, eax          ; ecx = enc
lodsd
xchg    edi, eax          ; edi = key
lodsd
xchg    eax, esi          ; esi = buf
push    esi               ; save buf for the final store
lodsd
xchg    eax, v3           ; park buf[0] in v3 for now
lodsd
xchg    eax, v1           ; v1 = buf[1]
lodsd
xchg    eax, v2           ; v2 = buf[2]
lodsd
xchg    eax, v3           ; v3 = buf[3], v0 (eax) = buf[0]
; pre-whiten
xor     v0, [edi   ]
xor     v1, [edi+ 4]
xor     v2, [edi+ 8]
xor     v3, [edi+12]
test    ecx, ecx          ; ZF = (enc == 0) -> decrypt
mov     cl, 16            ; 16 rounds (LTS); mov leaves flags intact
ck_l0:
pushfd                    ; keep encrypt/decrypt flag across the round
jz      ck_l1
; encrypt
add     v0, v1
rol     v1, 5
xor     v1, v0
rol     v0, 16
add     v2, v3
rol     v3, 8
xor     v3, v2
add     v0, v3
rol     v3, 13
xor     v3, v0
add     v2, v1
rol     v1, 7
xor     v1, v2
rol     v2, 16
jmp     ck_l2
ck_l1:
; decrypt
ror     v2, 16
xor     v1, v2
ror     v1, 7
sub     v2, v1
xor     v3, v0
ror     v3, 13
sub     v0, v3
xor     v3, v2
ror     v3, 8
sub     v2, v3
ror     v0, 16
xor     v1, v0
ror     v1, 5
sub     v0, v1
ck_l2:
popfd
loop    ck_l0
ck_l3:
; post-whiten
xor     v0, [edi   ]
xor     v1, [edi+ 4]
xor     v2, [edi+ 8]
xor     v3, [edi+12]
pop     edi               ; edi = buf
; save buf
stosd
xchg    eax, v1
stosd
xchg    eax, v2
stosd
xchg    eax, v3
stosd
popad                     ; restore registers
ret


### Encryption in one call

To use with CTR mode as a stream cipher.

; -----------------------------------------------
; Chaskey-LTS block cipher in x86 assembly (encryption only)
;
; size: 89 bytes
;
; global calls use cdecl convention
;
; -----------------------------------------------

bits 32

%ifndef BIN
global chas_encryptx
global _chas_encryptx
%endif

%define v0 eax
%define v1 ebx
%define v2 edx
%define v3 ebp

; chas_encryptx -- encryption-only Chaskey-LTS (for CTR mode), cdecl.
; Arguments keep the 3-arg layout (enc, key, buf); enc is unused.
chas_encryptx:
_chas_encryptx:
pushad                   ; save registers
mov     edi, [esp+32+ 8] ; edi = key (arg 2)
mov     esi, [esp+32+12] ; esi = buf (arg 3)
push    esi              ; save buf for the final store
lodsd
xchg    eax, v3          ; park buf[0] in v3 for now
lodsd
xchg    eax, v1          ; v1 = buf[1]
lodsd
xchg    eax, v2          ; v2 = buf[2]
lodsd
xchg    eax, v3          ; v3 = buf[3], v0 (eax) = buf[0]
; pre-whiten
xor     v0, [edi   ]
xor     v1, [edi+ 4]
xor     v2, [edi+ 8]
xor     v3, [edi+12]
; 16 rounds
push    16
pop     ecx
ck_l0:
; apply permutation
add     v0, v1
rol     v1, 5
xor     v1, v0
rol     v0, 16
add     v2, v3
rol     v3, 8
xor     v3, v2
add     v0, v3
rol     v3, 13
xor     v3, v0
add     v2, v1
rol     v1, 7
xor     v1, v2
rol     v2, 16
loop    ck_l0
; post-whiten
xor     v0, [edi   ]
xor     v1, [edi+ 4]
xor     v2, [edi+ 8]
xor     v3, [edi+12]
pop     edi              ; edi = buf
; save buf
stosd
xchg    eax, v1
stosd
xchg    eax, v2
stosd
xchg    eax, v3
stosd
popad                    ; restore registers
ret


A version for ARMv6

k  .req r0
x  .req r1

k0 .req r2
k1 .req r3
k2 .req r4
k3 .req r5

x0 .req r6
x1 .req r7
x2 .req r8
x3 .req r9

i  .req r10

// chas_encryptx(void *key, void *data);
// Chaskey-LTS encryption (16 rounds) for ARMv6, data in place.
chas_encryptx:

// save registers
push   {r0-r12,lr}

// load 128-bit key
ldm    k, {k0, k1, k2, k3}

// load 128-bit plaintext
ldm    x, {x0, x1, x2, x3}

// xor plaintext with key
eor    x0, x0, k0          // x[0] ^= k[0];
eor    x1, x1, k1          // x[1] ^= k[1];
eor    x2, x2, k2          // x[2] ^= k[2];
eor    x3, x3, k3          // x[3] ^= k[3];
mov    i, #16              // i = 16
chas_loop:
add    x0, x0, x1          // x[0] += x[1];
eor    x1, x0, x1, ror #27 // x[1]=ROTL32(x[1],  5) ^ x[0];
add    x2, x2, x3          // x[2] += x[3];
eor    x3, x2, x3, ror #24 // x[3]=ROTL32(x[3],  8) ^ x[2];
add    x2, x2, x1          // x[2] += x[1];
add    x0, x3, x0, ror #16 // x[0]=ROTL32(x[0], 16) + x[3];
eor    x3, x0, x3, ror #19 // x[3]=ROTL32(x[3], 13) ^ x[0];
eor    x1, x2, x1, ror #25 // x[1]=ROTL32(x[1],  7) ^ x[2];
mov    x2, x2, ror #16     // x[2]=ROTL32(x[2], 16);
subs   i, i, #1            // i--
bne    chas_loop           // repeat until all 16 rounds done

// xor ciphertext with key
eor    x0, x0, k0          // x[0] ^= k[0];
eor    x1, x1, k1          // x[1] ^= k[1];
eor    x2, x2, k2          // x[2] ^= k[2];
eor    x3, x3, k3          // x[3] ^= k[3];

// save ciphertext
stm    x, {x0, x1, x2, x3}

// restore registers
pop    {r0-r12,pc}


### Summary

C code is approx. 185 bytes using /O2 /Os flags. Assembly is currently 132 bytes.

See sources here

Posted in assembly, cryptography, encryption, programming, security | Tagged , , | 2 Comments

## SM4 Block Cipher (Chinese Standard for WAPI)

### Introduction

SM4 (formerly SMS4) is a 128-bit block cipher with a 128-bit user key and 32 rounds. It’s used in the WLAN Authentication and Privacy Infrastructure (WAPI), a Chinese WLAN national standard.

It was published or rather declassified in 2006 and standardized in 2012.

I’m not clear what the SM stands for. There are other specifications like SM2 which describes Elliptic Curve algorithms for digital signatures and key exchange. SM3 which is a cryptographic hash algorithm and of course, SM4 a symmetric block cipher which I’ll implement here in C and x86 assembly optimized for size.

The only other specification to mention is SM9 which documents a set of identity-based cryptographic schemes from pairings.

English translations of the specifications for SM2, SM3 have been provided by Sean Shen and Xiaodong Lee at China Internet Network Information Center (CNNIC) a non-profit organization based in China.

But the only english translation for SM4 was by Whitfield Diffie and Dr. George Ledin.

If you want high performance implementations of SM4 in C and x86/x86-x64 assembly, please look at GMSSL which appears to be a fork of OpenSSL but includes SM2, SM3 SM4 and SM9.

### Mixer-substitution T

T is a substitution that generates 32 bits of output from 32 bits of input. Source code includes the full 256-byte array which obviously increases the size of code considerably.

• Non-Linear function

$(b_0, b_1, b_2, b_3) = \tau (A) = (Sbox(a_0),Sbox(a_1),Sbox(a_2),Sbox(a_3))$

The Sbox is a 256-byte array and there’s no description of how these elements were chosen. If anyone knows, please leave a comment.

• Linear function (for key setup)

$L'(B) = B \oplus (B \lll 13)\oplus (B \lll 23)$

• Linear function (for encryption)

$C=L(B)=B\oplus(B\lll 2)\oplus (B\lll 10)\oplus (B\lll 18)\oplus(B\lll 24)$

// SM4 S-box (256 bytes). The specification gives no derivation for
// these values — see the note in the surrounding text.
static const uint8_t S[256] = {
0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7,
0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05,
0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3,
0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a,
0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62,
0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95,
0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6,
0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba,
0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8,
0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b,
0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35,
0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2,
0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87,
0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52,
0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e,
0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5,
0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1,
0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55,
0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3,
0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60,
0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f,
0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f,
0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51,
0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f,
0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8,
0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd,
0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0,
0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e,
0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84,
0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20,
0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48 };

// Mixer-substitution T
//
// T is a substitution that generates 32 bits from 32 bits T : Z32
// This substitution is a reversible process.
// It consists of linear and non-linear substitution operations.

// Mixer-substitution T: byte-wise S-box pass followed by a linear
// diffusion layer.
// x : 32-bit input word
// t : 0 = key-schedule linear map L', nonzero = encryption linear map L
uint32_t T(uint32_t x, int t)
{
    int   n;
    w32_t v;

    v.w = x;

    // non-linear layer: substitute each byte through the S-box
    for (n = 0; n < 4; n++) {
        v.b[n] = S[v.b[n]];
    }

    // linear layer
    if (t == 0) {
        // key setup: L'(B) = B ^ (B <<< 13) ^ (B <<< 23)
        return v.w ^ ROTL32(v.w, 13) ^ ROTL32(v.w, 23);
    }
    // encryption: L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24)
    return v.w ^ ROTL32(v.w,  2) ^ ROTL32(v.w, 10) ^
           ROTL32(v.w, 18) ^ ROTL32(v.w, 24);
}


The assembly sticks close to the original in C.

; Mixer-substitution T for SM4.
; in:  eax = round-key/constant word; CF (saved by T_function's pushfd)
;      selects the linear layer: CF=0 key setup (L'), CF=1 encryption (L)
; out: eax = T(...); also stored at [esp+_eax] — presumably the slot of
;      a surrounding PUSHAD frame; confirm against the full sources.
; NOTE(review): x1..x3 are %defines not visible in this listing —
; presumably register aliases so the input xors are folded in here.
t_l0:
pop    ebx               ; ebx = address of the sbox (pushed by the call)
xor    eax, x1
xor    eax, x2
xor    eax, x3
; apply non-linear substitution
mov    cl, 4             ; four bytes to substitute
t_l1:
xlatb                    ; al = S[al]
ror    eax, 8            ; rotate next byte into al
loop   t_l1
mov    ebx, eax
mov    ecx, eax
mov    edx, eax
mov    ebp, eax
; apply linear substitution
popfd                    ; restore CF saved at T_function entry
jc     t_l2
; for key setup
; L'(B) = B ^ (B <<< 13) ^ (B <<< 23)
rol    ebx, 13
rol    ecx, 23
xor    eax, ebx
xor    eax, ecx
jmp    t_l3
t_l2:
; for encryption
; L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24)
rol    ebx, 2
rol    ecx, 10
rol    edx, 18
rol    ebp, 24

xor    eax, ebx
xor    eax, ecx
xor    eax, edx
xor    eax, ebp
t_l3:
mov    [esp+_eax], eax
ret

; in:  eax
; out: eax
T_function:
pushfd                   ; save CF (set/cleared by caller to pick the mode)
call   t_l0  ; pushes address of sbox on stack
; sbox for SM4 goes here


### The round function F

$F(X_0,X_1,X_2,X_3,rk)=X_0\oplus T(X_1\oplus X_2\oplus X_3\oplus rk)$

The value of 1 in last parameter to T function tells it to use the linear function for encryption. In x86 assembly, I use the Carry Flag (CF) setting or clearing with STC and CLC instructions.

// The round function F
//
// This algorithm uses a nonlinear substitution structure,
// encrypting 32 bits at a time. This is called a one-round exchange.

// The round function F: one round of the SM4 substitution network.
// Mixes three state words with the round key, applies T, and xors
// the result into the fourth word.
uint32_t F(uint32_t x0, uint32_t x1,
uint32_t x2, uint32_t x3, uint32_t rk)
{
    uint32_t mixed = x1 ^ x2 ^ x3 ^ rk;  // combine state with round key
    return x0 ^ T(mixed, 1);             // 1 = encryption linear map
}


### The constant parameter CK

$(ck_{i,0},ck_{i,1},ck_{i,2},ck_{i,3}) \in \Big(Z_2^8\Big)^4,\quad then\quad ck_{ij}=(4i+j)\times 7\: (mod \: 256)$

You can include the precomputed array, or generate at runtime using the algorithm.

// The constant parameter CK
//
// generate CK constant for index i
#ifdef TABLE
// precomputed CK constants: ck[i] packs (4i+j)*7 mod 256 for j=0..3
uint32_t CK[]=
{ 0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269,
0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9,
0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249,
0x50575e65, 0x6c737a81, 0x888f969d, 0xa4abb2b9,
0xc0c7ced5, 0xdce3eaf1, 0xf8ff060d, 0x141b2229,
0x30373e45, 0x4c535a61, 0x686f767d, 0x848b9299,
0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279 };
#else
// Compute the CK constant for round index i at runtime:
// four bytes (4i+j)*7 mod 256, j = 0..3, packed big-endian.
uint32_t CK(int i)
{
    uint32_t ck = 0;
    int      j;

    for (j = 0; j < 4; j++) {
        ck = (ck << 8) | U8V((((i << 2) + j) * 7));
    }
    return ck;
}
#endif


The assembly computes at runtime, since it requires less ROM.

; Generate the SM4 CK constant for one round at runtime (saves ROM
; over the precomputed table): ck = big-endian pack of (4i+j)*7 mod 256.
; expects ecx to hold index
; returns constant in eax
; (also stored at [esp+_eax] — presumably for a surrounding PUSHAD
;  frame; confirm against the full sources)
CK:
xor    eax, eax          ; ck = 0
cdq                      ; j  = 0 (eax is zero, so edx becomes zero)
ck_l0:
shl    eax, 8            ; ck <<= 8
lea    ebx, [ecx*4+edx]  ; ebx = (i*4) + j
imul   ebx, ebx, 7       ; ebx *= 7
or     al, bl            ; ck |= ebx %= 256
inc    edx               ; j++
cmp    edx, 4            ; j<4
jnz    ck_l0
mov    [esp+_eax], eax   ; return ck
ret


### Key setup

// Expand a 128-bit user key into 32 round keys.
// c   : SM4 context receiving the round keys
// key : 16-byte user key
// enc : SM4_DECRYPT reverses the round-key order for decryption
void sm4_setkey(sm4_ctx *c, const void *key, int enc) {
    uint32_t t;                       // scratch word used by XCHG()
    uint32_t k0, k1, k2, k3;
    int      i;
    const uint32_t *k = (const uint32_t*)key;

    // load the key big-endian and whiten it with the FK constants
    k0 = SWAP32(k[0]) ^ 0xa3b1bac6;
    k1 = SWAP32(k[1]) ^ 0x56aa3350;
    k2 = SWAP32(k[2]) ^ 0x677d9197;
    k3 = SWAP32(k[3]) ^ 0xb27022dc;

    // generate 32 sub keys
    for (i = 0; i < 32; i++) {
#ifdef TABLE
        k0 ^= T(k1 ^ k2 ^ k3 ^ CK[i], 0);
#else
        k0 ^= T(k1 ^ k2 ^ k3 ^ CK(i), 0);
#endif
        c->rk[i] = k0;
        // rotate the four key words left by one position
        XCHG(k0, k1);
        XCHG(k1, k2);
        XCHG(k2, k3);
    }
    // decryption uses the round keys in reverse order
    if (enc == SM4_DECRYPT) {
        for (i = 0; i < 16; i++) {
            XCHG(c->rk[i], c->rk[31 - i]);
        }
    }
}


Assembly code..

; SM4 key schedule, encryption order only (the C version's reversal
; for decryption is not performed here).
; cdecl args at [esp+32+...] — presumably below a PUSHAD frame set up
; earlier; confirm against the full sources.
; NOTE(review): rk0..rk3 are register %defines not visible in this listing.
sm4_setkeyx:
_sm4_setkeyx:
mov    edi, [esp+32+4]  ; edi = ctx
mov    esi, [esp+32+8]  ; esi = 128-bit key
; load the key big-endian
lodsd
bswap  eax
xchg   eax, rk0
lodsd
bswap  eax
xchg   eax, rk1
lodsd
bswap  eax
xchg   eax, rk2
lodsd
bswap  eax
xchg   eax, rk3

; xor FK values
xor    rk0, 0xa3b1bac6
xor    rk1, 0x56aa3350
xor    rk2, 0x677d9197
xor    rk3, 0xb27022dc
xor    ecx, ecx          ; i = 0
sk_l1:
call   CK                ; eax = CK(i)
clc                      ; CF=0: T uses the key-setup linear map
call   T_function
xor    rk0, eax
mov    eax, rk0
stosd                ; rk[i] = rk0
; rotate the four key words left by one position
xchg   rk0, rk1
xchg   rk1, rk2
xchg   rk2, rk3
inc    ecx           ; i++
cmp    ecx, 32
jnz    sk_l1
ret


### Encryption

// swapping bytes *shouldn't* affect the cipher

// Encrypt one 16-byte block in place using the expanded round keys.
// c   : SM4 context holding 32 round keys
// buf : 16-byte block, transformed in place
void sm4_encrypt(sm4_ctx *c, void *buf)
{
    uint32_t b0, b1, b2, b3, i, t;    // t is scratch used by XCHG()
    uint32_t *rk = c->rk;
    uint32_t *p  = (uint32_t*)buf;

    // load the block big-endian
    b0 = SWAP32(p[0]); b1 = SWAP32(p[1]);
    b2 = SWAP32(p[2]); b3 = SWAP32(p[3]);

    // 32 rounds, rotating the state words after each
    for (i = 0; i < 32; i++) {
        b0 = F(b0, b1, b2, b3, rk[i]);
        XCHG(b0, b1);
        XCHG(b1, b2);
        XCHG(b2, b3);
    }
    // reverse substitution: store words in reverse order, big-endian
    p[0] = SWAP32(b3); p[1] = SWAP32(b2);
    p[2] = SWAP32(b1); p[3] = SWAP32(b0);
}


There’s swapping of bytes between big-endian/little-endian numbering convention which shouldn’t affect the security of ciphertext.

; SM4 encryption of one 16-byte block (cdecl: ctx, buf).
; Args at [esp+32+...] — presumably below a PUSHAD frame set up
; earlier; confirm against the full sources.
; NOTE(review): x0..x3 are register %defines not visible in this listing.
sm4_encryptx:
_sm4_encryptx:
mov    edi, [esp+32+4] ; edi = ctx
mov    esi, [esp+32+8] ; esi = buf
push   esi ; save buffer for later

; load the block big-endian
lodsd
bswap  eax
xchg   eax, x0
lodsd
bswap  eax
xchg   eax, x1
lodsd
bswap  eax
xchg   eax, x2
lodsd
bswap  eax
xchg   eax, x3

; do 32 rounds
push   32
pop    ecx
e_l0:
; apply F round
mov    eax, [edi] ; rk[i]
scasd             ; advance the round-key pointer
stc               ; CF=1: T uses the encryption linear map
call   T_function
xor    x0, eax
; rotate the four state words left by one position
xchg   x0, x1
xchg   x1, x2
xchg   x2, x3
loop   e_l0

; save data (reverse word order, big-endian)
pop    edi
xchg   eax, x3
bswap  eax
stosd
xchg   eax, x2
bswap  eax
stosd
xchg   eax, x1
bswap  eax
stosd
xchg   eax, x0
bswap  eax
stosd
ret


### Summary

If there was a way to generate the sbox on the fly, this could be a good cipher for resource constrained devices. The size of code using /O2 /Os switches resulted in 690 bytes. The assembly for just encryption is approx. 500 bytes.

If you want to contribute or just access full source code, see here

Thanks to 0x4d_ for $\LaTeX$ formulas.

Posted in assembly, cryptography, encryption, programming | Tagged , , , , , , | 3 Comments

## CubeMAC128 Message Authentication Code

### Introduction

CubeMAC128 is a cryptographic Message Authentication Code (MAC) designed for packet authentication that was proposed in 2010 by mathematician and cryptographer Daniel J. Bernstein.

The CubeMAC proposal was in response to NIST concerns about using CubeHash as a MAC function for small messages. The parameters for MAC were 16 initial rounds, a 32-byte block and 32 final rounds. The recommended key length is 512-bits or 64-bytes.

You can find a detailed discussion about it in CubeHash parameter tweak: 10 times smaller MAC overhead

### Permutation Function

Like other designs by Dan, the pseudo-random-function used to provide properties of confusion and diffusion only uses Add-Rotate-Xor operations which makes it suitable for many different architectures.

The following is taken from reference code.

// One call applies CUBEHASH_ROUNDS rounds of the CubeHash permutation
// to the 32-word state (reference-style implementation).
static void transform(hashState *state)
{
    int           j;
    int           round;
    crypto_uint32 tmp[16];

    for (round = 0; round < CUBEHASH_ROUNDS; ++round) {
        // first half-round: add, swap by 8, rotate 7, xor, swap by 2
        for (j = 0; j < 16; ++j) state->x[j + 16] += state->x[j];
        for (j = 0; j < 16; ++j) tmp[j ^ 8] = state->x[j];
        for (j = 0; j < 16; ++j) state->x[j] = ROTATE(tmp[j],7);
        for (j = 0; j < 16; ++j) state->x[j] ^= state->x[j + 16];
        for (j = 0; j < 16; ++j) tmp[j ^ 2] = state->x[j + 16];
        for (j = 0; j < 16; ++j) state->x[j + 16] = tmp[j];
        // second half-round: add, swap by 4, rotate 11, xor, swap by 1
        for (j = 0; j < 16; ++j) state->x[j + 16] += state->x[j];
        for (j = 0; j < 16; ++j) tmp[j ^ 4] = state->x[j];
        for (j = 0; j < 16; ++j) state->x[j] = ROTATE(tmp[j],11);
        for (j = 0; j < 16; ++j) state->x[j] ^= state->x[j + 16];
        for (j = 0; j < 16; ++j) tmp[j ^ 1] = state->x[j + 16];
        for (j = 0; j < 16; ++j) state->x[j + 16] = tmp[j];
    }
}


The changes made are to obviously reduce code at the expense of performance but not intentionally. Rather than move upwards in steps of one for variable i, it’s set to 15 and moved down until less than zero.

This should eliminate a CMP opcode and depends on Sign status flag (SF) to indicate when to end loop.

// permutation function
// CubeMAC permutation: 16 rounds, each made of two half-rounds that
// differ only in swap distance (swp*4 then swp) and rotation amount
// (7 then 11). Loops count down so the compiler can test the sign
// flag instead of a compare.
void permute(cube_state *s)
{
    uint32_t tmp[16];
    int      idx, rot, swp, rnd;

    for (rnd = 16; rnd > 0; rnd--)
    {
        for (rot = 7, swp = 2; swp > 0; rot += 4, swp--)
        {
            for (idx = 15; idx >= 0; --idx) s->w[idx + 16] += s->w[idx];
            for (idx = 15; idx >= 0; --idx) tmp[idx ^ (swp * 4)] = s->w[idx];
            for (idx = 15; idx >= 0; --idx) s->w[idx] = ROTL32(tmp[idx], rot);

            for (idx = 15; idx >= 0; --idx) s->w[idx] ^= s->w[idx + 16];
            for (idx = 15; idx >= 0; --idx) tmp[idx ^ swp] = s->w[idx + 16];
            for (idx = 15; idx >= 0; --idx) s->w[idx + 16] = tmp[idx];
        }
    }
}


The assembly works similar except we’re using the LOOP instruction quite a lot with PUSHAD/POPAD.

; ==================================
; permutation function
;
; In:  edi = s (128-byte state), esp = 16-dword scratch y[]
;
; NOTE(review): several instructions appear to have been lost when this
; listing was posted -- verify against the original source before use:
;   - pm_l2 stores nothing (the MOV/SCASD pair only advances EDI), so
;     the s->w[i + 16] += s->w[i] step described below never happens;
;   - ECX is zero after every LOOP, yet pm_l3/pm_l5/pm_l6 and the final
;     REP MOVSD are entered without a count reload (e.g. mov cl, 16);
;   - after XCHG ECX, EDX at pm_l4 the count in EDX is the exhausted
;     counter, and ESI/EDI are never rewound between passes;
;   - EBP is scaled to j*4 (SHL EBP, 2) but never scaled back before
;     the i ^ j indexing at pm_l6 or the DEC EBP at the loop tail.
;
; edi = s
mov    esi, esp          ; esi = y[16]
; for (n=16; n>0; n--)
push   16
pop    ecx
; for (k=7, j=2; j>0; k+=4, j--)
pm_l0:
push   ecx               ; save n
mov    cl, 16
push   2
pop    ebp               ; j=2
push   7
pop    edx               ; k=7
pm_l1:
; **************************
; for (i=15; i>=0; --i)
;   s->w[i + 16] += s->w[i];
; **************************
pm_l2:
mov    eax, [edi]
scasd
loop   pm_l2
; **************************
; for (i=15; i>=0; --i)
;   y[i ^ (j*4)] = s->w[i];
; **************************
shl    ebp, 2
pm_l3:
lea    ebx, [ecx-1]
mov    eax, [edi+ebx*4]
xor    ebx, ebp
mov    [esi+ebx*4], eax
loop   pm_l3
; **************************
; for (i=15; i>=0; --i)
;   s->w[i] = ROTL32(y[i], k);
; **************************
xchg   ecx, edx
pm_l4:
lodsd
rol    eax, cl
stosd
dec    edx
jnz    pm_l4
; **************************
; for (i=15; i>=0; --i)
;   s->w[i] ^= s->w[i + 16];
; **************************
pm_l5:
mov    eax, [edi]
xor    eax, [edi+64]
stosd
loop   pm_l5
; **************************
; for (i=15; i>=0; --i)
;   y[i ^ j] = s->w[i + 16];
; **************************
pm_l6:
lea    ebx, [ecx-1]
mov    eax, [edi+ebx*4+64]
xor    ebx, ebp
mov    [esi+ebx*4], eax
loop   pm_l6
; **************************
; for (i=15; i>=0; --i)
;   s->w[i + 16] = y[i];
; **************************
rep    movsd

add    edx, 4          ; k += 4
dec    ebp             ; j--
jnz    pm_l1

pop    ecx
; NOTE(review): LOOP does not modify flags; the CF=0 assumed by the
; absorb code presumably comes from the ADD EDX, 4 above -- confirm.
loop   pm_l0           ; will set CF to 0

ret


### Sponge Function

We absorb 32-byte blocks of data into the state before applying the permutation function.

// absorb data into the state, permuting after each full 32-byte block;
// returns the position reached within the current block (len mod 32)
uint32_t absorb(cube_state *s,
const void *msg, uint32_t len)
{
    const uint8_t *data = (const uint8_t*)msg;
    uint32_t       pos, ofs;

    for (ofs = 0, pos = 0; pos < len; pos++) {
        s->b[ofs] ^= data[pos];
        if (++ofs == 32) {
            // full block absorbed; mix it in and start a new one
            permute(s);
            ofs = 0;
        }
    }
    return ofs;
}


We inline this to minimize the overall size of code.

;
; ==================================
; absorb data into state
; (fragment -- shown standalone here, inlined into cube_macx below)
;
; ebx = data
; ecx = len
; edi = s
; ebp = address of permute (for "call ebp")
; Out: eax = index into the current block (idx)
;
; The trailing POPFD/CMC/JC pairs with the PUSHFD at cm_l0 in the full
; routine: CF is flipped on each pass, so the absorb sequence runs
; exactly twice (key, then message) -- assumes CF=0 on first entry.
absorb:
xor    eax, eax      ; idx = 0
jecxz  abs_l1        ; exit if len == 0
abs_l0:
mov    dl, [ebx]
xor    [edi+eax], dl ; s->b[idx] ^= *data
inc    eax           ; idx++
inc    ebx           ; data++
cmp    al, 32        ; absorbed block?
loopne abs_l0        ; while (al != 32 && ecx != 0)
jne    abs_l1        ; if (al != 32 && ecx == 0) goto abs_l1
call   ebp           ; permute(s)
jmp    absorb        ; keep going (idx restarts at 0)
abs_l1:
popfd
cmc                  ; CF = !CF
jc     cm_l0         ; loop twice


### MAC Function

This is purely intended for small messages; authenticating network packets for example. It takes a 512-bit key, a message and returns 128-bit MAC.

The length of message shouldn’t exceed 4GB but that should be obvious. I was naturally thinking about packets of 512-bytes or less 🙂

// cube message authentication code
//
// mkey, keylen : secret key (the proposal recommends 64 bytes / 512 bits)
// msg, msglen  : message to authenticate (intended for short packets)
// tag          : receives the 16-byte (128-bit) MAC
void cube_macx(const void *mkey, uint32_t keylen,
const void *msg, uint32_t msglen, void *tag)
{
    uint32_t   idx;
    cube_state s;

    // 1. initialize state
    memset(&s, 0, sizeof(cube_state));

    s.w[0] = 16; // 16-byte output
    s.w[1] = 32; // 32-byte block size
    s.w[2] = 16; // 16 rounds per block

    permute(&s);

    // 2. absorb key
    // FIX: honor the keylen parameter instead of a hard-coded 64; the
    // old code read out of bounds for keys shorter than 64 bytes (the
    // assembly version already absorbs keylen bytes).
    absorb(&s, mkey, keylen);

    // 3. absorb message
    idx = absorb(&s, msg, msglen);

    // 4. absorb the padding bit after the last message byte
    s.b[idx] ^= 0x80;
    permute(&s);

    // 5. flip the finalization bit in the last state word
    s.w[31] ^= 1;

    // 32 finalization rounds (2 x 16)
    permute(&s);
    permute(&s);

    // 6. return 128-bit tag
    memcpy(tag, s.b, 16);
}


So here’s the rest of assembly code with permutation function stripped out.

cube_macx:
_cube_macx:
; CALL pushes the address of ld_pm; the POP below then yields the
; address of the permutation code, so "call ebp" invokes permute(s).
call   ld_pm
; permutation function goes here
ld_pm:
pop    ebp           ; ebp = permute
; NOTE(review): the +32 in the argument offsets implies 32 bytes of
; saved registers (a PUSHAD wrapper) not shown in this listing -- confirm.
lea    esi, [esp+32+4]
xor    eax, eax
xor    ecx, ecx
mov    cl, 128
sub    esp, ecx      ; reserve 128 bytes for the cube_state
; 1. initialize local state
; memset(&s, 0, sizeof(cube_state));
mov    edi, esp
rep    stosb         ; eax = 0, ecx = 128: zero the state

mov    edi, esp
push   edi
; s.w[0] = 16;  (output length)
mov    al, 16
stosd
; s.w[1] = 32;  (block size)
mov    al, 32
stosd
; s.w[2] = 16;  (rounds per block)
mov    al, 16
stosd
pop    edi
; permute(&s);
call   ebp
; 2. absorb key
; 3. absorb message
; Two passes over the (ptr, len) argument pairs at ESI: the PUSHFD here
; pairs with POPFD/CMC/JC below so cm_l0 executes exactly twice.
cm_l0:
pushfd
lodsd
xchg   ebx, eax        ; ebx = data
lodsd
xchg   ecx, eax        ; ecx = len
; ==================================
; absorb data into state
;
; ebx = data
; ecx = len
; edi = s
absorb:
xor    eax, eax      ; idx = 0
jecxz  abs_l1        ; exit if len == 0
abs_l0:
mov    dl, [ebx]
xor    [edi+eax], dl ; s->b[idx] ^= *data
inc    eax           ; idx++
inc    ebx           ; data++
cmp    al, 32        ; absorbed block?
loopne abs_l0        ; while (al != 32 && ecx != 0)
jne    abs_l1        ; if (al != 32 && ecx == 0) goto abs_l1
call   ebp           ; permute(s)
jmp    absorb        ; keep going (idx restarts at 0)
abs_l1:
popfd
cmc                  ; CF = !CF
jc     cm_l0         ; loop twice

; 4. absorb end bit (padding byte at current index)
xor    byte[edi+eax], 0x80
call   ebp

; 5. absorb final bit (s.w[31] ^= 1)
xor    byte[edi+31*4], 1
call   ebp
call   ebp

; 6. return 128-bit tag
lodsd
xchg   eax, edi        ; edi = tag
xchg   eax, esi        ; esi = s
mov    cl, 16
rep    movsb

; release stack
; NOTE(review): only 4 of the 128 reserved bytes are released before
; RET, which would return into the state buffer; an "add esp, 128"
; (or equivalent) appears to have been lost in transcription -- verify
; against the original source.
pop    eax
ret


### Summary

The size of C code using /Os /O2 switches was approx. 380 bytes. The assembly code is currently 197 bytes which I doubt can be reduced much more.

I’ve documented Poly1305 here which is another MAC function designed by the same author.

If I had to choose a compact function for authentication of small packets, I’d probably pick CubeMAC instead although it hasn’t received as much scrutiny by the cryptographic community as Poly1305.

Posted in assembly, cryptography, encryption, programming, security | Tagged , , , , , | Leave a comment

## Speck Block Cipher

### Introduction

Speck is a family of lightweight block ciphers publicly released by the National Security Agency (NSA) in June 2013. It’s an ARX (add-rotate-xor) design optimized for performance in software implementations and has been suggested for use on resource constrained devices.

Speck supports a variety of block and key sizes. A block is always two words, but the words may be 16, 24, 32, 48 or 64 bits in size. The corresponding key is 2, 3 or 4 words. The round function consists of two rotations, adding the right word to the left word, xoring the key into the left word, and then xoring the left word into the right word. The number of rounds depends on the parameters selected.

There’s some interesting speculation on the reasons for publishing Simon and Speck in post to a crypto mailing list Speculation on the origin of Speck and Simon

There are two implementations provided here. One uses a 64-bit block size, 128-bit key and 27 rounds, since this fits easily into 32-bit mode of the x86 architecture. The other uses a 128-bit block size, 256-bit key and 34 rounds, since this fits easily into 64-bit (x86-64) mode.

### Key schedule

// Speck64/128 key schedule: expands the 128-bit key at `in`
// into 27 32-bit round keys at `out`.
void speck64_setkey(const void *in, void *out)
{
    uint32_t *key = (uint32_t*)in;
    uint32_t *rk  = (uint32_t*)out;
    uint32_t  a, b, c, d, t, r;

    // load the 128-bit key into four words
    a = key[0]; b = key[1];
    c = key[2]; d = key[3];

    for (r = 0; r < 27; r++) {
        // current round key
        rk[r] = a;

        b = (ROTR32(b, 8) + a) ^ r;
        a =  ROTL32(a, 3) ^ b;

        // cycle (b, c, d) <- (c, d, new b)
        t = b;
        b = c;
        c = d;
        d = t;
    }
}


x86 assembly

%define SPECK_RNDS 27

%define k0 eax
%define k1 ebx
%define k2 ebp
%define k3 edx

; ==================================
; void speck64_setkey(const void *in, void *out)
;
; Expands a 128-bit key (in) into SPECK_RNDS 32-bit round keys (out).
; The [esp+32+N] offsets imply 32 bytes of saved registers (a PUSHAD
; wrapper) above the return address -- as in the other listings.
speck_setkeyx:
_speck_setkeyx:
mov    esi, [esp+32+4]   ; esi = in
mov    edi, [esp+32+8]   ; edi = ks
lodsd
xchg   eax, k3           ; k3 holds k[0] for now
lodsd
xchg   eax, k1           ; k1 = k[1]
lodsd
xchg   eax, k2           ; k2 = k[2]
lodsd
xchg   eax, k3           ; k3 = k[3], eax (k0) = k[0]
xor    ecx, ecx          ; i = 0
spk_sk:
; ((uint32_t*)ks)[i] = k0;
stosd
; k1 = (ROTR32(k1, 8) + k0) ^ i;
ror    k1, 8
add    k1, k0            ; FIX: restore the missing addition of k0
xor    k1, ecx
; k0 = ROTL32(k0, 3) ^ k1;
rol    k0, 3
xor    k0, k1
; rotate (k1, k2, k3) left 32-bits
xchg   k3, k2
xchg   k3, k1
; i++
inc    ecx
cmp    cl, SPECK_RNDS
jnz    spk_sk
ret


### Encryption/Decryption

// Speck64/128: encrypt or decrypt one 64-bit block in place,
// using the 27 round keys produced by speck64_setkey.
// enc == SPECK_DECRYPT selects decryption; anything else encrypts.
void speck64_encrypt(const void *keys, int enc, void *data)
{
    uint32_t *rk = (uint32_t*)keys;
    uint32_t *v  = (uint32_t*)data;
    uint32_t  a, b, r;

    // load the block
    a = v[0]; b = v[1];

    if (enc == SPECK_DECRYPT) {
        // apply the inverse round function with keys in reverse order
        for (r = 27; r > 0; r--) {
            b = ROTR32(b ^ a, 3);
            a = ROTL32((a ^ rk[r - 1]) - b, 8);
        }
    } else {
        for (r = 0; r < 27; r++) {
            a = (ROTR32(a, 8) + b) ^ rk[r];
            b =  ROTL32(b, 3) ^ a;
        }
    }
    // store the result
    v[0] = a; v[1] = b;
}


x86 assembly

%define x0 eax
%define x1 ebx

; ==================================
; void speck64_encrypt(const void *keys, int enc, void *data)
;
; Encrypts (enc == 0) or decrypts (enc != 0) one 64-bit block in place
; using the round keys produced by speck_setkeyx.
speck_encryptx:
_speck_encryptx:
lea    esi, [esp+32+4]    ; esi = &args (32 saved bytes + return addr)
lodsd
xchg   edi, eax           ; edi = ks
lodsd
xchg   eax, ecx           ; ecx = enc
lodsd
xchg   eax, esi           ; esi = in
push   esi                ; save data pointer for the store at the end
lodsd
xchg   eax, x1            ; x1 = in[0] (temporarily)
lodsd
xchg   eax, x1            ; x1 = in[1], x0 (eax) = in[0]
test   ecx, ecx           ; enc == 0 ? (MOV below preserves flags)
mov    cl, SPECK_RNDS
jz     spk_e0
spk_d0:
; x1 = ROTR32(x1 ^ x0, 3);
xor    x1, x0
ror    x1, 3
; x0 = ROTL32((x0 ^ ks[SPECK_RNDS-1-i]) - x1, 8);
xor    x0, [edi+4*ecx-4]
sub    x0, x1
rol    x0, 8
loop   spk_d0
jmp    spk_end
spk_e0:
; x0 = (ROTR32(x0, 8) + x1) ^ ks[i];
ror    x0, 8
add    x0, x1             ; FIX: restore the missing addition of x1
xor    x0, [edi]
scasd                     ; edi += 4 (next round key)
; x1 = ROTL32(x1, 3) ^ x0;
rol    x1, 3
xor    x1, x0
loop   spk_e0
spk_end:
pop    edi                ; edi = data pointer
; ((uint32_t*)in)[0] = x0;
stosd
xchg   eax, x1
; ((uint32_t*)in)[1] = x1;
stosd
ret


### Just encryption

Most block ciphers are used with Counter Mode (CTR) which turns a block cipher into a stream cipher. Here’s the function with key scheduling and encryption integrated.

// Speck64/128 with the key schedule fused into encryption:
// encrypts one 64-bit block in place, deriving each round key
// on the fly from the 128-bit key.
void speck64_encryptx(const void *key, void *data)
{
    uint32_t *kw = (uint32_t*)key;
    uint32_t *v  = (uint32_t*)data;
    uint32_t  a, b, c, d;   // key words (a = current round key)
    uint32_t  x, y, l, r;

    // load the 128-bit key
    a = kw[0]; b = kw[1];
    c = kw[2]; d = kw[3];

    // load the block
    x = v[0]; y = v[1];

    for (r = 0; r < 27; r++) {
        // encrypt with the current round key a
        x = (ROTR32(x, 8) + y) ^ a;
        y =  ROTL32(y, 3) ^ x;

        // derive the next round key and cycle (b, c, d)
        l = (ROTR32(b, 8) + a) ^ r;
        a =  ROTL32(a, 3) ^ l;
        b = c;
        c = d;
        d = l;
    }
    // store the result
    v[0] = x; v[1] = y;
}


x86 assembly..

; ==================================
; void speck64_encryptx(const void *key, void *data)
; Fused key schedule + encryption of one 64-bit block in place.
;
; NOTE(review): as printed this routine cannot work with the %defines
; shown earlier in the post (k0 and x0 both map to EAX, k1 and x1 both
; to EBX); the original source must define x0/x1 as other registers
; for this function -- confirm before use.
; NOTE(review): the additions stated in the comments ("+ x1" for x0,
; "+ k0" for k1) have no corresponding ADD instructions below; they
; appear to have been lost in transcription (the ARM listing further
; down does include them).
speck64_encryptx:
_speck64_encryptx:
mov    esi, [esp+32+8]   ; esi = in
push   esi               ; save

lodsd
xchg   eax, x0           ; x0 = in[0]
lodsd
xchg   eax, x1           ; x1 = in[1]

; after the PUSH above, the same displacement now addresses the
; previous argument slot, i.e. key at original [esp+32+4]
mov    esi, [esp+32+8]   ; esi = key
lodsd
xchg   eax, k0           ; k0 = key[0]
lodsd
xchg   eax, k1           ; k1 = key[1]
lodsd
xchg   eax, k2           ; k2 = key[2]
lodsd
xchg   eax, k3           ; k3 = key[3]
xor    eax, eax          ; i = 0
spk_el:
; x0 = (ROTR32(x0, 8) + x1) ^ k0;
ror    x0, 8
xor    x0, k0
; x1 = ROTL32(x1, 3) ^ x0;
rol    x1, 3
xor    x1, x0
; k1 = (ROTR32(k1, 8) + k0) ^ i;
ror    k1, 8
xor    k1, eax
; k0 = ROTL32(k0, 3) ^ k1;
rol    k0, 3
xor    k0, k1
xchg   k3, k2
xchg   k3, k1
; i++
inc    eax
cmp    al, SPECK_RNDS
jnz    spk_el

pop    edi
xchg   eax, x0
stosd
xchg   eax, x1
stosd
ret


The x86-64 version to support 256-bit keys and 128-bit blocks is only 24 bytes more.

;
; speck128/256 encryption in 88 bytes
;
; void speck128_encryptx(const void *key, void *data)
; (Microsoft x64 convention: rcx = key, rdx = data; block encrypted
;  in place; all touched callee-saved registers are preserved.)
;
; NOTE(review): SPECK_RNDS must be 34 for Speck128/256 (the value 27
; defined for Speck64/128 elsewhere is wrong for this variant) --
; confirm the %define in the original 64-bit source.
%ifndef BIN
global speck128_encryptx
%endif

%define k0 rdi
%define k1 rbp
%define k2 rsi
%define k3 rcx

%define x0 rbx
%define x1 rdx

speck128_encryptx:
push   rbp               ; save callee-saved registers
push   rbx
push   rdi
push   rsi

mov    k0, [rcx   ]      ; k0 = key[0]
mov    k1, [rcx+ 8]      ; k1 = key[1]
mov    k2, [rcx+16]      ; k2 = key[2]
mov    k3, [rcx+24]      ; k3 = key[3] (overwrites the base reg last)

push   rdx               ; save data pointer
mov    x0, [rdx  ]       ; x0 = in[0]
mov    x1, [rdx+8]       ; x1 = in[1] (overwrites the base reg last)

xor    eax, eax          ; i = 0
spk_el:
; x1 = (ROTR64(x1, 8) + x0) ^ k0;
ror    x1, 8
add    x1, x0            ; FIX: restore the missing addition of x0
xor    x1, k0
; x0 =  ROTL64(x0, 3) ^ x1;
rol    x0, 3
xor    x0, x1
; k1 = (ROTR64(k1, 8) + k0) ^ i;
ror    k1, 8
add    k1, k0            ; FIX: restore the missing addition of k0
xor    k1, rax
; k0 = ROTL64(k0, 3) ^ k1;
rol    k0, 3
xor    k0, k1

xchg   k3, k2
xchg   k3, k1
; i++
inc    eax               ; FIX: restore the missing increment (the
                         ; printed version loops forever without it)
cmp    al, SPECK_RNDS
jnz    spk_el

pop    rax               ; rax = data pointer
mov    [rax  ], x0
mov    [rax+8], x1

pop    rsi
pop    rdi
pop    rbx
pop    rbp
ret


And here’s some ARMv6 assembly to finish with.

// speck64_encryptx(void *key, void *data) -- ARMv6
// r0 = 128-bit key, r1 = 64-bit block (encrypted in place)
// Fused key schedule + encryption, 27 rounds.
// Note: after the LDM below, r0 (alias k) is reused as the round
// counter i -- the key pointer is no longer needed by then.

// key
k0 .req r2
k1 .req r3
k2 .req r4
k3 .req r5

// plaintext
x0 .req r6
x1 .req r7

// parameters
k  .req r0
x  .req r1
i  .req r0
t  .req r8

// speck64_encryptx(void *key, void *data);
speck64_encryptx:
// save registers (all of r0-r12, so no clobber concerns for callers)
push   {r0-r12,lr}

// k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
ldm    k, {k0, k1, k2, k3}
ldm    x, {x0, x1}          // x0 = x[0]; x1 = x[1];
mov    i, #0                // i=0
speck_loop:
add    x0, x1, x0, ror #8   // x0 = (ROTR32(x0, 8) + x1) ^ k0;
eor    x0, x0, k0           //
eor    x1, x0, x1, ror #29  // x1 = ROTL32(x1, 3) ^ x0;  (ror 29 == rol 3)
mov    t, k3                // backup k3
add    k3, k0, k1, ror #8   // k3 = (ROTR32(k1, 8) + k0) ^ i;
eor    k3, k3, i            //
eor    k0, k3, k0, ror #29  // k0 = ROTL32(k0, 3) ^ k3;  (ror 29 == rol 3)
mov    k1, k2               // k1 = k2;
mov    k2, t                // k2 = t;
add    i, i, #1             // i++;
cmp    i, #27               // i<27;
bne    speck_loop

// save result
stm    x, {x0, x1}          // x[0] = x0; x[1] = x1;

// restore registers and return
pop    {r0-r12,pc}


### Summary

instruction set setkey + encrypt + decrypt (bytes) setkey + encrypt (bytes)
x86 105 64
x86-64 132 86

MSVC generated code is 139 bytes using /O2 /Os flags. x86 assembly is 105 bytes for enc+dec or 64 bytes for just enc. Nice algorithm that would be fun for those new to cryptography. Not suggesting it’s secure though.

See sources here

Posted in assembly, cryptography, encryption, programming, security | Tagged , , , , | 1 Comment