Keccak Permutation Function


Permutation functions are incredibly useful, and Keccak, which is used to implement the SHA-3 standard, is like a Swiss army knife of cryptographic primitives.

Designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche, Keccak can be used to construct a stream cipher, a cryptographic hash function, a pseudorandom number generator (PRNG), a message authentication code (MAC) or authenticated encryption (AE).

Although SHA-3 (FIPS 202) was covered in a previous post, the underlying permutation function (Keccak-p[1600, 24]) was for the most part overlooked.

A lightweight implementation of Keccak-p[800, 22] for 32-bit architectures will be shown here, specifically the x86 CPU.

Keccak Parameters

F Target Architecture Capacity Rounds
p200 8-bit 200-bits 18
p400 16-bit 400-bits 20
p800 32-bit 800-bits 22
p1600 64-bit 1600-bits 24

Keccak Modules

Module Description
Theta Arranges the internal state as a 5-by-5 array of n-bit elements. Computes the parity of each column and combines them with an exclusive-or (XOR) operator. Then it XORs the resulting parity into each state bit as follows.
Rho Rotates each element by a triangular number.
Pi Permutes the elements.
Chi Adds a non-linear aspect to the permutation round. Combines the row elements using only three bitwise operators: AND, NOT, and XOR.
Iota Breaks up any symmetry caused by the other modules. This is done by XORing one of the array elements to a round constant. Without iota, the round mapping would be symmetric. Without iota, all rounds would be the same.

Keccak-p800 in C

// round constant function
// Primitive polynomial over GF(2): x^8+x^6+x^5+x^4+1
// Computes the next Keccak round constant, truncated to a 32-bit
// lane for Keccak-p800, from an 8-bit LFSR state. The LFSR is
// stepped 7 times per call and advanced in place via *LFSR.
uint32_t rc (uint8_t *LFSR) {
    uint32_t c;   // accumulated round constant
    int8_t   t;   // working copy of the LFSR
    uint8_t  i;   // walks bit positions 1,2,4,...,64

    c = 0;
    t = *LFSR;
    for (i=1; i<128; i += i) {
      if (t & 1) {
        // bits at positions >= 32 fall outside a 32-bit lane,
        // so only set c when the shift value is < 32
        if ((i-1) < 32) {
          c ^= 1UL << (i - 1);
        }
      }
      // step the LFSR; feedback mask 0x71 encodes x^6+x^5+x^4+1
      t = (t & 0x80) ? (t << 1) ^ 0x71 : t << 1;
    }
    *LFSR = (uint8_t)t;
    return c;
}

// Keccak-p[800, 22] permutation for 32-bit targets.
// state points to the 25-lane (25 x 32-bit = 100-byte) Keccak
// state, permuted in place over 22 rounds.
// Requires ROTL32 and XCHG macros plus rc() defined elsewhere in
// this file.
// NOTE(review): r grows well past 31 in the Rho step, so ROTL32
// must reduce its rotation count mod 32 (e.g. a _rotl-style
// intrinsic) -- confirm against the macro definition.
void k800_permute (void *state) {
    uint32_t i, j, r, t, u, bc[5];
    uint8_t  lfsr=1;               // LFSR state feeding the iota constants
    uint32_t *st=(uint32_t*)state;
    uint8_t  *p, *m;
    int      rnd;
    // Pi lane order: 24 indices packed 4 per 32-bit word
    uint32_t piln[6]=
    { 0x110b070a, 0x10050312, 0x04181508, 
      0x0d13170f, 0x0e14020c, 0x01060916 };
    // packed byte table: m[i] = i mod 5 for 0 <= i < 10
    uint32_t m5[3]=
    { 0x03020100, 0x02010004, 0x00000403 };

    p = (uint8_t*)piln;
    m = (uint8_t*)m5;
    for (rnd=0; rnd<22; rnd++) {
      // Theta: parity of each of the five columns
      for (i=0; i<5; i++) {
        t  = st[i];
        t ^= st[i +  5];
        t ^= st[i + 10];
        t ^= st[i + 15];
        t ^= st[i + 20];
        bc[i] = t;
      }
      // Theta: fold combined parities back into every lane
      for (i=0; i<5; i++) {
        t  = bc[m[(i + 4)]];             // bc[(i + 4) % 5]
        t ^= ROTL32(bc[m[(i + 1)]], 1);  // bc[(i + 1) % 5] <<< 1
        for (j=i; j<25; j+=5) {
          st[j] ^= t;
        }
      }
      // Rho + Pi: rotate each lane by a triangular number while
      // walking the pi cycle that starts at st[1]
      u = st[1];
      for (i=0, r=0; i<24; i++) {
        r = r + i + 1;          // next triangular number 1,3,6,10,...
        u = ROTL32(u, r);
        XCHG(st[p[i]], u);      // u picks up the displaced lane
        bc[0] = u;
      }
      // Chi: combine row elements using AND, NOT and XOR only
      for (i=0; i<25; i+=5) {
        memcpy(&bc, &st[i], 5*4);  // snapshot the row before updating it
        for (j=0; j<5; j++) {
          t  = ~bc[m[(j + 1)]];
          t &=  bc[m[(j + 2)]];
          st[j + i] ^= t;
        }
      }
      // Iota: break symmetry by XORing a round constant into st[0]
      st[0] ^= rc(&lfsr);
    }
}

32-bit x86 Assembly

The following code includes optimizations from Peter Ferrie.

; =================================================================
; Keccak-p[800, 22] permutation -- 32-bit x86, NASM syntax.
; In: one stack argument, a pointer to the 100-byte (25 x 32-bit
; lane) state, permuted in place over 22 rounds.
;
; NOTE(review): this listing is a blog excerpt. The definitions of
; labels referenced below (k800_l0, k800_l1, theta_l0..theta_l2,
; rho_l0, chi_l0, chi_l1, iota_l0, iota_l1), the "endstruc"
; terminator, the %endif matching %ifndef BIN, the routine's entry
; label with its PUSHAD prologue, and the final "ret" appear to
; have been lost in extraction -- confirm against the original
; repository before assembling.
; =================================================================
bits   32
; Names for the 8 dword slots created by the second PUSHAD below;
; the register saved in each slot is noted at the right.
struc kws_t
  bc1  resd 1 ; edi
  bc2  resd 1 ; esi
  bc3  resd 1 ; ebp
  bc4  resd 1 ; esp
  bc5  resd 1 ; ebx
  lfsr resd 1 ; edx
  ; ecx
  ; eax
    %ifndef BIN
      global k800_permutex
      global _k800_permutex
; void k800_permutex(void *state);    
    ; [esp+32+4] skips 32 bytes of registers saved by a PUSHAD
    ; prologue (not shown) plus the return address
    mov    esi, [esp+32+4]      ; esi = st
    ; call-over-data idiom: the "return address" pushed by CALL is
    ; really a pointer to the tables that follow
    call   k800_l0
    ; modulo 5    
    dd     003020100h, 002010004h, 000000403h
    ; rho pi
    dd     0110b070ah, 010050312h, 004181508h 
    dd     00d13170fh, 00e14020ch, 001060916h
    pop    ebx                  ; m + p
    push   22
    pop    eax
    inc    edx                  ; lfsr = 1    
    pushad                      ; create local space
    mov    edi, esp             ; edi = bc   
    push   eax    
    push   5 
    pop    ecx    
    ; Theta
    lodsd                       ; t  = st[i     ];  
    xor    eax, [esi+ 5*4-4]    ; t ^= st[i +  5];
    xor    eax, [esi+10*4-4]    ; t ^= st[i + 10];
    xor    eax, [esi+15*4-4]    ; t ^= st[i + 15];
    xor    eax, [esi+20*4-4]    ; t ^= st[i + 20];
    stosd                       ; bc[i] = t;
    loop   theta_l0        
    xor    eax, eax    
    movzx  ebp, byte[ebx+eax+4] ; ebp = m[(i + 4)];
    mov    ebp, [edi+ebp*4]     ; t   = bc[m[(i + 4)]];    
    movzx  edx, byte[ebx+eax+1] ; edx = m[(i + 1)];
    mov    edx, [edi+edx*4]     ; edx = bc[m[(i + 1)]];
    rol    edx, 1               ; t  ^= ROTL32(edx, 1);
    xor    ebp, edx
    xor    [esi+eax*4], ebp     ; st[j] ^= t;
    add    al, 5                ; j+=5 
    cmp    al, 25               ; j<25
    jb     theta_l2    
    sub    al, 24               ; i=i+1
    loop   theta_l1             ; i<5 
    ; *************************************
    ; Rho Pi
    ; *************************************
    mov    ebp, [esi+1*4]       ; t = st[1];
    lea    ecx, [ecx+eax-4]     ; r = r + i + 1;
    rol    ebp, cl              ; t = ROTL32(t, r); 
    movzx  edx, byte[ebx+eax+7] ; edx = p[i];
    xchg   [esi+edx*4], ebp     ; XCHG(st[p[i]], t);
    inc    eax                  ; i++
    cmp    al, 24+5             ; i<24
    jnz    rho_l0               ; 
    ; *************************************
    ; Chi
    ; *************************************
    xor    ecx, ecx             ; i = 0   
    ; memcpy(&bc, &st[i], 5*4);
    lea    esi, [esi+ecx*4]     ; esi = &st[i];
    mov    cl, 5
    rep    movsd
    xor    eax, eax
    movzx  ebp, byte[ebx+eax+1]
    movzx  edx, byte[ebx+eax+2]
    mov    ebp, [edi+ebp*4]     ; t = ~bc[m[(i + 1)]] 
    not    ebp            
    and    ebp, [edi+edx*4]
    lea    edx, [eax+ecx]       ; edx = j + i    
    xor    [esi+edx*4], ebp     ; st[j + i] ^= t;  
    inc    eax                  ; j++
    cmp    al, 5                ; j<5
    jnz    chi_l1        
    add    cl, al               ; i+=5;
    cmp    cl, 25               ; i<25
    jb     chi_l0
    ; Iota -- inlined rc() round-constant LFSR
    mov    al, [esp+kws_t+lfsr+4]; al = t = *LFSR
    mov    dl, 1                ; i = 1
    xor    ebp, ebp
    test   al, 1                ; t & 1
    je     iota_l1    
    lea    ecx, [edx-1]         ; ecx = (i - 1)
    cmp    cl, 32               ; skip if (ecx >= 32)
    jae    iota_l1    
    btc    ebp, ecx             ; c ^= 1UL << (i - 1)
    add    al, al               ; t << 1; CF = old sign bit
    sbb    ah, ah               ; ah = (t & 0x80) ? 0xFF : 0x00
    and    ah, 0x71             ; ah = (ah == 0xFF) ? 0x71 : 0x00  
    xor    al, ah  
    add    dl, dl               ; i += i
    jns    iota_l0              ; while (i != 128)
    mov    [esp+kws_t+lfsr+4], al; save t
    xor    [esi], ebp           ; st[0] ^= rc(&lfsr);      
    pop    eax
    dec    eax
    jnz    k800_l1              ; rnds<22    
    popad                       ; release bc
    popad                       ; restore registers 


The x86 assembly generated by MSVC is approx. 342 bytes while the hand written x86 assembly is currently 236 bytes.

Posted in assembly, cryptography, encryption, programming | Tagged , , , | Leave a comment

Gimli: a cross-platform permutation function


Gimli, named after the Lord Of The Rings character, is a 384-bit permutation function, designed specifically to be lightweight, high performance, and secure across multiple architectures.

It was designed by Daniel J. Bernstein, Stefan Kölbl, Stefan Lucks, Pedro Maat Costa Massolino, Florian Mendel, Kashif Nawaz, Tobias Schneider, Peter Schwabe, François-Xavier Standaert, Yosuke Todo, and Benoît Viguier.

Gimli can easily be used to build high-security block ciphers, tweakable block ciphers, stream ciphers, message-authentication codes, authenticated ciphers and hash functions.

What follows here is a compact implementation in C and x86 assembly. For performance enhanced implementations, please look at reference code written by Frank Denis here.

So happy was Frank with this function, that the LibHydrogen library is built atop Gimli.

However, Mike Hamburg points out in Cryptanalysis of 22 1/2 rounds of Gimli, there are weaknesses in how the Linear layer is applied which may lead to an update of this function in the future.

The Gimli team responded to Hamburg in Statement regarding “Cryptanalysis of 22 1/2 rounds of Gimli”

Mike responded further here.

C code

The only thing to mention here is the cyclic left shift by 24 bits is replaced with cyclic right shift by 8 bits. Same outcome, different direction.

// Rotation / swap helpers; guarded so they do not clash with
// definitions elsewhere in the file.
#ifndef ROTL32
#define ROTL32(v,n)(((v)<<(n))|((v)>>(32-(n))))
#endif
#ifndef ROTR32
#define ROTR32(v,n)(((v)>>(n))|((v)<<(32-(n))))
#endif
#ifndef XCHG
#define XCHG(a,b) { uint32_t _t = (a); (a) = (b); (b) = _t; }
#endif

// Gimli permutation over a 384-bit state (12 x 32-bit words),
// applied in place over 24 rounds.
void gimli(void *state)
{
  int      r, j;
  uint32_t t, x, y, z;
  uint32_t *s=(uint32_t*)state;

  // r runs 0x9e377918 down to 0x9e377901: 24 rounds, with the low
  // byte of r holding the round number 24..1.
  for (r=0x9e377918; r!=0x9e377900; r--) {
    // SP-box applied to each of the four columns; the spec's
    // ROTL32(x, 24) is replaced by the equivalent ROTR32(x, 8).
    for (j=0; j<4; j++) {
      x = ROTR32(s[    j], 8);
      y = ROTL32(s[4 + j], 9);
      z =        s[8 + j];

      s[8 + j] = x ^ (z << 1) ^ ((y & z) << 2);
      s[4 + j] = y ^ x        ^ ((x | z) << 1);
      s[j]     = z ^ y        ^ ((x & y) << 3);
    }

    t = r & 3;
    // small swap every 4th round, plus the round constant
    if (t == 0) {
      XCHG(s[0], s[1]);
      XCHG(s[2], s[3]);
      // add constant: r == 0x9e377900 + round, which equals the
      // spec's 0x9e377900 ^ round since the base's low byte is 0
      s[0] ^= r;
    }
    // big swap on rounds where round mod 4 == 2
    if (t == 2) {
      XCHG(s[0], s[2]);
      XCHG(s[1], s[3]);
    }
  }
}

x86 Assembly

The following code includes optimizations from Peter Ferrie. It’s currently 112 bytes.

; =================================================================
; Gimli 384-bit permutation -- 32-bit x86, NASM syntax.
; In: one stack argument, a pointer to the 48-byte (12 x 32-bit
; word) state, permuted in place over 24 rounds.
;
; NOTE(review): blog excerpt -- the routine's entry label with its
; PUSHAD prologue, the loop labels g_l0..g_l3, the write-back of
; s0..s3 to memory, and the POPAD/ret epilogue are not visible in
; this listing; confirm against the original sources.
; =================================================================
; register aliases for the column (SP-box) phase
%define j  ebx
%define x  eax
%define y  ebp
%define z  edx

%define s  esi

; temporaries
%define t0 ebp 
%define t1 edi

; round counter
%define r  ecx

; aliases for the swap phase: state words popped from the stack
%define s0 eax
%define s1 ebx
%define s2 ebp
%define s3 esi

    mov    r, 0x9e377900 + 24
    mov    s, [esp+32+4]        ; esi = s 
    push   s
    push   4
    pop    j
    ; x = ROTR32(s[    j], 8);
    lodsd  ;mov    x, [s]  
    ror    x, 8  
    ; y = ROTL32(s[4 + j], 9);
    mov    y, [s + (4*3)]   
    rol    y, 9
    ; z = s[8 + j];
    mov    z, [s + (4*7)]
    ; s[8 + j] = x ^ (z << 1) ^ ((y & z) << 2);
    push   x
    push   y
    lea    t1, [z + z]
    and    y, z
    shl    y, 2
    xor    t1, y
    xor    x, t1    
    mov    [s + (7*4)], x
    pop    y
    pop    x
    ; s[4 + j] = y ^ x        ^ ((x | z) << 1);
    push   x
    push   y
    xor    y, x
    or     x, z
    shl    x, 1
    xor    y, x
    mov    [s + (3*4)], y
    pop    y
    pop    x
    ; s[j]     = z ^ y        ^ ((x & y) << 3);    
    xor    z, y
    and    x, y
    shl    x, 3
    xor    z, x
    push   z                    ; first-row result kept on the stack
    dec    j
    jnz    g_l1

    ; pop the four first-row results pushed inside the loop
    pop    s3
    pop    s2
    pop    s1
    pop    s0

    pop    t1                   ; t1 = saved state pointer (push s above)

    mov    dl, cl
    and    dl, 3                ; t = r & 3; ZF set when t == 0
    jnz    g_l2
    ; XCHG (s[0], s[1]);
    xchg   s0, s1
    ; XCHG (s[2], s[3]);
    xchg   s2, s3
    ; s[0] ^= 0x9e377900 ^ r;
    xor    s0, r    
    cmp    dl, 2
    jnz    g_l3  
    ; XCHG (s[0], s[2]);
    xchg   s0, s2
    ; XCHG (s[1], s[3]);
    xchg   s1, s3
    xchg   eax, s1
    xchg   eax, s2
    xchg   eax, s3
    dec    cl   
    jnz    g_l0    

See sources here

Posted in assembly, cryptography, encryption, programming, security | Tagged , , , , , | 2 Comments

Light Message Authentication Code (LightMAC)


A Message Authentication Code or MAC is a cryptographic primitive that enables two parties using a shared secret key to authenticate messages exchanged between them.

MACs are used daily to secure online communications and will inevitably become a critical component of embedded systems in the future that have wireless functionality; specifically passive, low-power and IoT devices.

Some MACs are constructed from cryptographic hash algorithms. The Hash-based message authentication code (HMAC) published in 1996 for example recommended using MD5 or SHA-1. However, due to resources required for these hash functions, they were never suitable for microcontrollers.

Other MACs are constructed from block ciphers and this is more ideal for a developer where ROM and RAM are in short supply since there’s already a number of lightweight block ciphers available.

There’s no shortage of cryptographic solutions for desktop computers and mobile devices, however there still remains a significant problem with some of the current standardized algorithms when attempting to implement for resource constrained software or hardware devices.

A draft Report on Lightweight Cryptography published by NIST in August 2016 states that none of the NIST approved hash functions are suitable for resource constrained environments based on the findings in A Study on RAM Requirements of Various SHA-3 Candidates on Low-cost 8-bit CPUs

There’s currently an effort on part of engineers, cryptographers and organizations to design, evaluate and standardize lightweight cryptographic primitives for the embedded industry.

Whether we agree or disagree on the need to embed wireless devices in everyday objects, there will be many that have networking or wireless functionality in future which will require robust encryption solutions.


A MAC Mode for Lightweight Block Ciphers was designed by Atul Luykx, Bart Preneel, Elmar Tischhauser, Kan Yasuda and published in 2016.

\begin{array}{l}  \\\hline   \textbf{Algorithm 1: }\text{LightMAC} _{K_1,K_2}(M)  \\\hline  \quad\textbf{Input: } K_1, K_2\in \{0, 1\}^k,\: M\in \{0,1\}^{\leq 2^s(n-s)}\\  \quad\textbf{Output: } T\in\{0, 1\}^t\\  \textbf{\scriptsize{1}}\;\; V\leftarrow 0^n\in\{0, 1\}^n\\  \textbf{\scriptsize{2}}\;\; M[1]M[2]\cdots M[\ell]\xleftarrow{n-s}M\\  \textbf{\scriptsize{3}}\;\; \textbf{for } i = 1\textbf{ to } \ell - 1\textbf{ do}\\  \textbf{\scriptsize{4}}\;\; \mid V\leftarrow V\oplus E_{K_1}(i_s M[i])\\  \textbf{\scriptsize{5}}\;\; \textbf{end}\\  \textbf{\scriptsize{6}}\;\; V\leftarrow V\oplus (M[\ell]10^\ast)\\  \textbf{\scriptsize{7}}\;\; T\leftarrow \lfloor E_{K_2}(V)\rfloor_t\\  \textbf{\scriptsize{8}}\;\; \textbf{return } T    \\\hline  \end{array}

The other lightweight MAC algorithms under evaluation by NIST are Chaskey and TuLP. I’ve already covered Chaskey in a previous post.

All code and details of the algorithm shown here are derived from reference implementations provided by Atul here on github

Any developers that require a reference implementation should use those source codes instead of what’s shown here.

Instead of using PRESENT block cipher, I’ll use SPECK64/128 due to the ARX design which is much easier to implement on x86 architecture.

Block Cipher Parameters

LightMAC is a mode of operation and depends directly on an external block cipher.

The only other block ciphers that come close to the size of Speck would be XTEA or Chaskey. The following is a list of parameters recommended for use with Speck.

LightMAC Parameters

The authors define the following in reference material which are based on the block cipher.

  • Length of the protected counter sum in bytes. Not greater than N/2.

  • Length of tag in bytes. Should be at least 64-bits but not greater than N.

  • Length of block in bytes. Same as N.

  • Length of block cipher key. The MAC key is twice this length.

The following table shows some example parameters using existing lightweight block ciphers.

Cipher (E) Block Length (N) Cipher Key Length MAC Key Length (K) Counter Length (S) Tag Length (T)
PRESENT 64-bits 128-bits 256-bits 32-bits 64-bits
SPECK 64-bits 128-bits 256-bits 32-bits 64-bits
AES 128-bits 128-bits 256-bits 32-bits 128-bits


In the specification, V denotes an intermediate/local variable of cipher block size N. It is initialized to zero and updated after every encryption using an XOR operation with ciphertext before returning the result in T (truncated if required)

But in my own implementation, I assume T to be of cipher block size N and initialize it to zero. I then update T instead which is why I prefer readers use the reference implementation instead of what’s shown here. 🙂

My reason for not allocating space for V and using T directly is simply to reduce the amount of code required for the assembly code.


The update process is very similar to what you see used in cryptographic hash algorithms. I was gonna have a more detailed description here but I think comments should be clear enough.


An end bit (0x80) is appended to M buffer along with any data remaining or none if the input length was a multiple of the block cipher length.

This is then XORed with any previous cipher block state before being encrypted with the 2nd key before returning.

x86 Assembly code

First, here’s the SPECK block cipher using 64-bit block and 128-bit key.

; =================================================================
; SPECK64/128 block cipher, encryption only -- 32-bit x86 (NASM).
; In:  esi = 8-byte block, ebp = 16-byte key (per the register
;      notes below); the block is encrypted in place.
; NOTE(review): blog excerpt -- the speck64_encryptx entry label,
; the round-loop label spk_el, and the trailing stores/ret after
; "pop edi" (the comments suggest STOSD writes of x0/x1) are not
; visible here; confirm against the original sources.
; =================================================================
%define SPECK_RNDS    27
; *****************************************
; Light MAC parameters based on SPECK64-128
; N = 64-bits
; K = 128-bits
%define COUNTER_LENGTH 4  ; should be <= N/2
%define BLOCK_LENGTH   8  ; equal to N
%define TAG_LENGTH     8  ; >= 64 && <= N
%define BC_KEY_LENGTH 16  ; K

%define ENCRYPT speck64_encryptx
%define k0 edi    
%define k1 ebp    
%define k2 ecx    
%define k3 esi

%define x0 ebx    
%define x1 edx

; esi = input
; ebp = key


      push    esi            ; save M
      lodsd                  ; x0 = x->w[0]
      xchg    eax, x0
      lodsd                  ; x1 = x->w[1]
      xchg    eax, x1
      mov     esi, ebp       ; esi = key
      xchg    eax, k0        ; k0 = key[0] 
      xchg    eax, k1        ; k1 = key[1]
      xchg    eax, k2        ; k2 = key[2]
      xchg    eax, k3        ; k3 = key[3]    
      xor     eax, eax       ; i = 0
      ; x0 = (ROTR32(x0, 8) + x1) ^ k0;
      ror     x0, 8
      add     x0, x1
      xor     x0, k0
      ; x1 = ROTL32(x1, 3) ^ x0;
      rol     x1, 3
      xor     x1, x0
      ; k1 = (ROTR32(k1, 8) + k0) ^ i;
      ror     k1, 8
      add     k1, k0
      xor     k1, eax
      ; k0 = ROTL32(k0, 3) ^ k1;
      rol     k0, 3
      xor     k0, k1    
      ; cycle the key words for the next round
      xchg    k3, k2
      xchg    k3, k1
      ; i++
      inc     eax
      cmp     al, SPECK_RNDS    
      jnz     spk_el
      pop     edi    
      xchg    eax, x0        ; x->w[0] = x0
      xchg    eax, x1        ; x->w[1] = x1
Now LightMAC.

You might notice how ctr and idx variables are initialized to zero at the same time using CDQ instruction. Once PUSHAD is executed, it preserves EDX on the stack and is then used as the protected counter sum S.

Although we convert the counter to big endian format before saving in block buffer, it wouldn’t affect the security to skip this. I’ve retained it for compatibility with reference but might remove it later.

; =================================================================
; LightMAC tag computation with SPECK64/128 as the block cipher E
; -- 32-bit x86, NASM syntax. C prototype below; arguments are
; read relative to a PUSHAD prologue that is not shown.
; NOTE(review): blog excerpt -- the lightmac_tag entry label, the
; initial PUSHAD, the loop labels lmx_l0..lmx_l3, the %define for
; the _edx PUSHAD-slot offset, the LODSD instructions that step
; through argv before each XCHG, the block-full comparison before
; LOOPNE, and the final ret all appear to be missing; confirm
; against the original lmx32.asm before assembling.
; =================================================================
; void lightmac_tag(const void *msg, uint32_t msglen, 
;     void *tag, void* mkey) 
      lea     esi, [esp+32+4]; esi = argv
      xchg    eax, ebx       ; ebx = msg
      cdq                    ; ctr = 0, idx = 0, 
      xchg    eax, ecx       ; ecx = msglen
      xchg    eax, edi       ; edi = tag
      xchg    eax, ebp       ; ebp = mkey      
      pushad                 ; allocate N-bytes for M
      ; zero initialize T
      mov     [edi+0], edx   ; t->w[0] = 0;
      mov     [edi+4], edx   ; t->w[1] = 0;
      ; while we have msg data
      mov     esi, esp       ; esi = M
      jecxz   lmx_l2         ; exit loop if msglen == 0
      ; add byte to M
      mov     al, [ebx]      ; al = *data++
      inc     ebx
      mov     [esi+edx+COUNTER_LENGTH], al           
      inc     edx            ; idx++
      ; M filled?
      ; --msglen
      loopne  lmx_l1
      jne     lmx_l2
      ; add S counter in big endian format
      inc     dword[esp+_edx]; ctr++
      mov     eax, [esp+_edx]
      ; reset index
      cdq                    ; idx = 0
      bswap   eax            ; m.ctr = SWAP32(ctr)
      mov     [esi], eax
      ; encrypt M with E using K1
      call    ENCRYPT
      ; update T
      lodsd                  ; t->w[0] ^= m.w[0];
      xor     [edi+0], eax      
      lodsd                  ; t->w[1] ^= m.w[1];
      xor     [edi+4], eax         
      jmp     lmx_l0         ; keep going
      ; add the end bit
      mov     byte[esi+edx+COUNTER_LENGTH], 0x80
      xchg    esi, edi       ; swap T and M
      ; update T with any msg data remaining    
      mov     al, [edi+edx+COUNTER_LENGTH]
      xor     [esi+edx], al
      dec     edx
      jns     lmx_l3
      ; advance key to K2
      add     ebp, BC_KEY_LENGTH
      ; encrypt T with E using K2
      call    ENCRYPT
      popad                  ; release memory for M
      popad                  ; restore registers


The x86 assembly generated by MSVC using /O2 /Os is 238 bytes. The x86 assembly written by hand is 152 bytes.

In order for developers to benefit from LightMAC on microcontrollers, they should choose a lightweight block cipher but not necessarily SPECK. It’s only used here for illustration.

See lmx32.asm and lightmac.c for any future updates.

Posted in assembly, cryptography, encryption, programming | Tagged , , , , , , , , , | Leave a comment

PRESENT Block Cipher


PRESENT is a 64-bit block cipher published in 2007 which provides support for key lengths of 80 and 128-bits. It was designed specifically for hardware implementation and uses a Substitution Permutation Network (SPN) structure which is similar to AES but at the same time completely different.

The difference is that PRESENT is bit oriented whereas AES is byte oriented. For that reason, PRESENT is not suitable for software implementation although I found the bit-permutation layer interesting and was just curious to see results using x86 assembly.

It was designed by Andrey Bogdanov, Lars R. Knudsen, Gregor Leander, Christof Paar, Axel Poschmann, Matthew J. B. Robshaw, Yannick Seurin, and C. Vikkelsoe.

PRESENT is intended to be used in situations where low-power consumption and high chip efficiency is desired. It has been standardized in ISO-29192 along with CLEFIA which was designed by Sony Corporation.

Today, we’ll focus on the 128-bit key version since it has more potential for future applications than something supporting only 80-bit keys.

I’ve no doubt you can implement PRESENT in 32-bit x86 assembly but because it’s a hardware design, I decided to stick with x86-64 assembly and only implement encryption.

In future blog entries, depending on the design of an algorithm, it will be either 32 or 64-bit assembly and maybe both if suitable but I’ll avoid using MMX which was used in some past entries.

You can find a number of implementations in C and python here.

Key whitening

A linear operation seen in pretty much every block cipher designed since being used by Ron Rivest in 1984 to strengthen the DES block cipher.

It uses a simple eXclusive OR operation with the key and data.

\textbf{addRoundKey. } \text{Given round key } K_i=\kappa_{63}^i\dots \kappa_0^i \text{ for } 1\leq i\leq 32 \text{ and current }\\  \textsc{state } b_{63}\dots b_0, \text{ addRoundKey consists of the operation for } 0\leq j\leq 63,
b_j\rightarrow b_j\oplus\kappa_j^i.

Byte substitution

The non-linear layer is based on a single 4-bit S-box which was designed with hardware optimizations in mind. It replaces each nibble of 64-bits with 4-bit value from predefined table.

\textbf{sBoxlayer. }\text{The S-box used in }\textsc{present }\text{is a 4-bit to 4-bit S-box } S:\mathbb{F}^4_2 \rightarrow\mathbb{F}_2^4.\\  \text{The action of this box in hexadecimal notation is given by the following table.}

\begin{array}{|c||c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|}  \hline i & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & A & B & C & D & E & F \\\hline    P(i)  & C & 5 & 6 & B & 9 & 0 & A & D & 3 & E & F & 8 & 4 & 7 & 1 & 2  \\\hline  \end{array}

Here’s code generated by MSVC

; 62   :     hi.b[0] = sub_byte(hi.b[0]);               

  movzx edx, cl
  mov QWORD PTR hi$[rbp-48], rcx
  mov eax, edx
  and edx, 15
  shr rax, 4
  mov cl, BYTE PTR sbox$3697[rbp+rax-48]
  shl cl, 4
  or  cl, BYTE PTR sbox$3697[rbp+rdx-48]

The assembly code loads a memory pointer for sbox into the RBX register and expects the AL register to hold the value we’re replacing with XLATB.

For those of you who don’t know, the one-byte instruction XLATB locates a byte entry from a table in memory which is pointed to by the base register (BX).

Using the contents of the AL register as a table index, it then copies the contents of the table entry back into the AL register.

    ; =============================================================
    ; PRESENT sub_byte helper (x86-64, NASM): substitutes both
    ; nibbles of AL through the 4-bit S-box using XLATB.
    ; NOTE(review): fragment -- the sub_byte entry label, the
    ; "lsb" routine it tail-calls, the ret, and the call-over-data
    ; setup that pushes the address of the sbox table below are
    ; not visible in this excerpt.
    ; =============================================================
    pop     rbx              ; rbx = sbox
    mov     cl, al           ; cl = x
    shr     al, 4            ; al = sbox[x >> 4] << 4
    xlatb                    ; al = sbox[al]
    shl     al, 4    
    xchg    al, cl
    and     al, 15           ; al |= sbox[x & 0x0F]  
    or      al, cl    
    pop     rcx
    pop     rbx    
    push    rbx 
    push    rcx 
    call    lsb
    ; sbox
    db      0xc, 0x5, 0x6, 0xb, 0x9, 0x0, 0xa, 0xd
    db      0x3, 0xe, 0xf, 0x8, 0x4, 0x7, 0x1, 0x2

Bit Permutation

A linear operation that permutates 64-bits of the ciphertext state. Each bit is repositioned according to a predefined table.

\textbf{pLayer. } \text{The bit permutation used in }\textsc{present} \text{ is given by the following table.}\\  \text{Bit i of }\textsc{state } \text{is moved to bit position } P(i).

\begin{array}{|c||c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|}  \hline i & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 & 11 & 12 & 13 & 14 & 15\\    P(i)  & 0 & 16 & 32 & 48 & 1 & 17 & 33 & 49 & 2 & 18 & 34 & 50 & 3 & 19 & 35 & 51  \\\hline  \hline i & 16 & 17 & 18 & 19 & 20 & 21 & 22 & 23 & 24 & 25 & 26 & 27 & 28 & 29 & 30 & 31\\    P(i)  & 4 & 20 & 36 & 52 & 5 & 21 & 37 & 53 & 6 & 22 & 38 & 54 & 7 & 23 & 39 & 55  \\\hline  \hline i & 32 & 33 & 34 & 35 & 36 & 37 & 38 & 39 & 40 & 41 & 42 & 43 & 44 & 45 & 46 & 47\\    P(i)  & 8 & 24 & 40 & 56 & 9 & 25 & 41 & 57 & 10 & 26 & 42 & 58 & 11 & 27 & 43 & 59  \\\hline  \hline i & 48 & 49 & 50 & 51 & 52 & 53 & 54 & 55 & 56 & 57 & 58 & 59 & 60 & 61 & 62 & 63\\    P(i)  & 12 & 28 & 44 & 60 & 13 & 29 & 45 & 61 & 14 & 30 & 46 & 62 & 15 & 31 & 47 & 63  \\\hline  \end{array}

As you might already tell from the table, each bit position increases sequentially by 16 modulo 63.

To obtain the correct bit position during permutation without the use of a table, we can multiply the loop counter by 16 and modulo the result by 63.

The approach I use is to initialize a 32-bit variable R to 0x30201000. Imagine this to be a 4 byte array initialized to 0,16,32,48 which are used for shifting the first 4 bits of ciphertext state.

By incrementing each byte of the R variable every round and rotating by 8 bits, this will generate every shift value required for the permutation operation.

Here’s assembly output from MSVC for the C code above.

; 83   :     t.q = 0;    

  xor eax, eax

; 84   :     r   = 0x30201000;    

  mov r9d, 807407616        ; 30201000H

; 85   :     for (j=0; j<64; j++) {

  xor ebx, ebx

; 86   :       // test bit at j'th position and shift left
; 87   :       t.q |= ((p.q >> j) & 1) << (r & 255);      

  mov rdx, QWORD PTR p$[rbp-16]
  mov ecx, ebx
  inc ebx
  shr rdx, cl
  mov ecx, r9d
  and edx, 1
  shl rdx, cl
  or  rax, rdx

; 88   :       r = ROTR32(r+1, 8);

  inc r9d
  ror r9d, 8
  cmp ebx, 64         ; 00000040H
  jl  SHORT $LL3@present128@2

One thing C can’t directly do is take advantage of the x86 status flags so the assembly code is much different.

If bit 0 of P variable is 1, a right shift by 1 will set the carry flag. Then we just avoid setting bits of t variable with Jump If Not Carry (JNC)

Bit Test and Set (BTS) will set bit in RBP based on value in CL.

Here’s handwritten assembly taking advantage of status flags and bit manipulation instructions.

    ; pLayer bit-permutation loop (x86-64): moves each bit of the
    ; state in RAX to position (16*j mod 63) in RBP, using the CF
    ; from SHR and BTS instead of a lookup table. The rotating
    ; byte array in ECX supplies the shift amounts 0,16,32,48,...
    ; NOTE(review): excerpt -- the loop label pse_l2 (top of loop)
    ; and skip target pse_l3 are defined in the full routine only.
    mov     ecx, 0x30201000  ; r   = 0x30201000;
    xor     ebp, ebp         ; t.q = 0;
    xor     ebx, ebx         ; j   = 0;
    shr     rax, 1           ; CF = (p.q >> j) & 1
    jnc     pse_l3           ; no bit carried
    bts     rbp, rcx         ; set bit in rbp at position in cl
    inc     cl               ; r = ROTR32(r+1, 8);
    ror     ecx, 8
    add     bl, 4            ; j++, j < 64
    jne     pse_l2

The only other thing to mention is how the j counter represented by the BL register is updated. Notice how it’s being increased by 4 instead of 1.

64 x 4 = 256 so once we’ve permutated 64-bits of ciphertext, BL register will be zero again, hence the use of Jump If Not Equal (JNE) which is testing the Zero Flag (ZF)

While describing the loop above, I rewrote the code to use the following instead which takes advantage of ECX being zero after the loop you can see in full routine.

    ; Reworked pLayer loop: shift amounts kept in EBX so ECX can
    ; serve as the 64-iteration counter for LOOP, leaving ECX zero
    ; afterwards for the code that follows in the full routine.
    ; NOTE(review): excerpt -- loop labels pse_l2/pse_l3 are not
    ; shown. LOOP is compact but microcoded/slow on modern Intel
    ; cores; acceptable here since code size is the stated goal.
    mov     ebx, 0x30201000  ; r   = 0x30201000;
    xor     ebp, ebp
    mov     cl, 64
    shr     rax, 1           ; CF = (p.q >> j) & 1
    jnc     pse_l3           ; no bit carried
    bts     rbp, rbx         ; set bit in rbp at position in bl
    inc     bl               ; r = ROTR32(r+1, 8);
    ror     ebx, 8
    loop    pse_l2

Key scheduling

Function creates 32 64-bit subkeys using linear/non-linear operations from a 128-bit key.

; =================================================================
; PRESENT-128 key schedule (x86-64, NASM): derives the round keys
; from a 128-bit key into present_ctx.
; In:  rcx = present_ctx, rdx = key (two 64-bit words)
; NOTE(review): excerpt -- the entry label, the round-loop label
; psk, the sub_byte helper, PRESENT_RNDS, and the pse_l4 target
; (apparently a shared epilogue in the encryption routine, per
; the trailing jmp) are not visible here.
; =================================================================
; rcx = present_ctx
; rdx = key    
    push    rsi
    push    rdi
    push    rbp
    push    rbx
    push    rcx              
    pop     rdi              ; rdi = ctx
    mov     rax, [rdx+8]     ; hi = key[1]
    mov     rdx, [rdx  ]     ; lo = key[0]
    xor     ecx, ecx         ; rnd = 0
    stosq                    ; ctx->key[rnd] = hi.q;  
    lea     ebx, [rcx*2+2]   ; lo.q  ^= (rnd + rnd) + 2;
    xor     rdx, rbx
    ; 61-bit left rotation of the 128-bit (hi:lo) register pair
    push    rax              ; t.q    = hi.q;
    pop     rbx
    push    rdx
    pop     rbp
    shl     rax, 61          ; hi.q   = (hi.q & 7) << 61;
    shr     rbp, 3           ; hi.q  |= (lo.q >> 3);
    or      rax, rbp
    shl     rdx, 61          ; lo.q    = (lo.q & 7) << 61;
    shr     rbx, 3           ; lo.q   |= ( t.q >> 3);
    or      rdx, rbx
    ; S-box the top nibble: rotate it into AL for sub_byte
    rol     rax, 8           ; hi.q    = ROTL64(hi.q, 8);
    call    sub_byte
    ror     rax, 8
    inc     cl
    cmp     cl, PRESENT_RNDS
    jne     psk
    jmp     pse_l4



; =================================================================
; PRESENT-128 encryption (x86-64, NASM): PRESENT_RNDS-1 rounds of
; addRoundKey, sBoxLayer and pLayer, then post-whitening with the
; final round key. Encrypts the 8-byte block in place.
; In:  rcx = present_ctx, rdx = buf
; NOTE(review): excerpt -- the entry label and the loop/skip
; labels pse_l0..pse_l4 referenced below, plus the sub_byte
; helper, are not visible in this listing; confirm against the
; original sources before assembling.
; =================================================================
; rcx = present_ctx
; rdx = buf    
    push    rsi
    push    rdi
    push    rbp
    push    rbx
    push    rcx              ; rdi = present_ctx
    pop     rdi
    mov     rax, [rdx]       ; p.q = buf[0]
    push    PRESENT_RNDS-1
    pop     rcx
    ; addRoundKey
    xor     rax, [rdi]       ; p.q ^= ctx->key[i]
    scasq                    ; advance key position
    push    rcx              ; save i
    ; sBoxLayer: substitute all 8 bytes of the state
    mov     cl, 8            ; j = 0
    call    sub_byte         ; p.b[j] = sub_byte(p.b[j]); 
    ror     rax, 8           ; p.q    = ROTR64(s.q, 8);
    loop    pse_l1           ; j++ < 8
    ; pLayer: bit permutation via SHR carry + BTS (see text above)
    mov     ebx, 0x30201000  ; r   = 0x30201000;
    xor     ebp, ebp
    shr     rax, 1           ; CF = (p.q >> j) & 1
    jnc     pse_l3           ; no bit carried
    bts     rbp, rbx         ; set bit in rbp at position in bl
    inc     bl               ; r = ROTR32(r+1, 8);
    ror     ebx, 8

    add     cl, 4            ; j++
    jne     pse_l2

    xchg    rax, rbp         ; p.q = t.q 
    pop     rcx
    loop    pse_l0 
    ; post whitening
    xor     rax, [rdi]  ; p.q ^= ctx->key[PRESENT_RNDS-1];
    mov     [rdx], rax  ; buf[0] = p.q
    pop     rbx    
    pop     rbp    
    pop     rdi    
    pop     rsi    


PRESENT is clearly intended for hardware and not terribly efficient in software but has some interesting features/ideas that could be used elsewhere.

The size of x86-64 assembly is approx. 188 bytes.

Thanks to 0x4d_ for \LaTeX formulas.

Posted in assembly, cryptography, programming, security | Tagged , , | Leave a comment

LEA-128 Block Cipher


LEA is a 128-bit block cipher with support for 128, 192 and 256-bit keys published in 2014. It was designed by Deukjo Hong, Jung-Keun Lee, Dong-Chan Kim, Daesung Kwon, Kwon Ho Ryu, and Dong-Geon Lee.

The only operations used for encryption and the key schedule are 32-bit Addition, eXclusive OR and Rotation (ARX structure): the designers state “the usage of 32-bit and 64-bit processors will grow rapidly compared to 8-bit and 16-bit ones”

Today I’ll just focus on an implementation using 128-bit keys referred to as LEA-128. This just about fits onto the 32-bit x86 architecture. The 256-bit version requires additional registers and is probably better suited for 64-bit mode.

Key Schedule

During generation of subkeys, a number of predefined constants are used.

\textbf{Constants. }
\text{The key schedule uses several constants for generating round keys, which are defined as}

\delta[0] = 0xc3efe9db, \quad \delta[1] = 0x44626b02,
\delta[2] = 0x79e27c8a, \quad \delta[3] = 0x78df30ec,
\delta[4] = 0x715ea49e, \quad \delta[5] = 0xc785da0a,
\delta[6] = 0xe04ef22a, \quad \delta[7] = 0xe5c40957.

\text{They are obtained from hexadecimal expression of } \sqrt{766965},
\text{ where 76, 69 and 65 are ASCII codes of 'L', 'E' and 'A'.}

You can obtain the values using a tool like speedcrunch.

There are 3 different key schedule functions but I only focus on the 128-bit variant for now.

\textbf{Key Schedule with a 128-Bit Key. }
\text{Let } K = (K[0], K[1], K[2], K[3]) \text{ be a 128-bit key. We set } T[i] = K[i]\: for \: 0\leq 4.   \text{ Round key} RK_i = (RK_i[0], RK_i[1],\dots , RK_i[5])\: for\: 0\leq i < 24 \text{ are produced through the following relations:}

T[0]\leftarrow ROL_1(T[0]\oplus ROL_i(\delta[i\:mod\: 4])),\\  T[1]\leftarrow ROL_3(T[1]\oplus ROL_{i+1}(\delta[i\:mod\: 4])),\\  T[2]\leftarrow ROL_6(T[2]\oplus ROL_{i+2}(\delta[i\:mod\: 4])),\\  T[3]\leftarrow ROL_{11}(T[3]\oplus ROL_{i+3}(\delta[i\:mod\: 4])),\\  RK_i\leftarrow (T[0], T[1], T[2], T[1], T[3], T[1]).


This is a really simple algorithm to implement and there’s not much to say that can’t be found in the specification published by the authors.

So here’s the function in C to perform encryption on 128-bits of plaintext using 128-bit key in one single call. I’d suggest using something like this with counter (CTR) mode so it doesn’t require the decryption procedure.

Full x86 assembly

You might notice the constants are different from C sources. For whatever reason, the last 3 are rotated a number of bits left before entering the encryption loop as you can see above.

Obviously a compiler will be smart enough to see this and automatically optimize but for assembly code, we must rotate them manually. They’re stored on the stack using PUSHAD. So, EDI, ESI, EBP and ESP are used for TD array.

ESP has to be initialized after the PUSHAD for obvious reasons. We don’t want to cause an exception.

bits 32
    %ifndef BIN
      global lea128_encryptx
      global _lea128_encryptx
; NOTE(review): blog excerpt — the matching %endif, the endstruc for
; pushad_t, the function entry (pushad), the loop label lea_l0, and the
; lodsd instructions implied by the xchg key/data loads are not visible
; here; they appear lost in extraction. Verify against the full lx.asm.
struc pushad_t
  _edi resd 1
  _esi resd 1
  _ebp resd 1
  _esp resd 1
  _ebx resd 1
  _edx resd 1
  _ecx resd 1
  _eax resd 1
%define k0 ebx
%define k1 edx
%define k2 edi
%define k3 ebp
%define p0 [esi+4*0]
%define p1 [esi+4*1]
%define p2 [esi+4*2]
%define p3 [esi+4*3]
%define t ecx
%define x eax
%define y ecx
%define LEA128_RNDS 24
    ; initialize 4 constants (delta values pre-rotated; see prose above)
    mov    edi, 0xc3efe9db   ; td[0]
    mov    esi, 0x88c4d604   ; td[1]
    mov    ebp, 0xe789f229   ; td[2]
    mov    dword[esp+_esp], 0xc6f98763   ; td[3] 
    mov    esi, [esp+64+4]   ; esi = key
    ; load key (presumably via lodsd before each xchg — TODO confirm)
    xchg   eax, k0
    xchg   eax, k1
    xchg   eax, k2
    xchg   eax, k3
    mov    esi, [esp+64+8]   ; esi = block
    xor    eax, eax          ; i = 0    
    push   eax
    ; top of round loop (label lost in extraction)
    and    al, 3
    mov    t, [esp+eax*4+4]  ; t = td[i % 4]
    rol    t, 4
    mov    [esp+eax*4+4], t  ; td[i & 3] = ROTL32(t, 4);
    ror    t, 4
    ; **************************************
    ; create subkey
    ; **************************************
    ; k0 = ROTL32(k0 + t, 1);
    add    k0, t             
    rol    k0, 1
    ; k1 = ROTL32(k1 + ROTL32(t, 1),  3);
    rol    t, 1              
    add    k1, t
    rol    k1, 3
    ; k2 = ROTL32(k2 + ROTL32(t, 2),  6);
    rol    t, 1              
    add    k2, t
    rol    k2, 6
    ; k3 = ROTL32(k3 + ROTL32(t, 3), 11);
    rol    t, 1              
    add    k3, t
    rol    k3, 11
    ; **************************************
    ; encrypt block
    ; **************************************
    ; p3 = ROTR32((p2 ^ k3) + (p3 ^ k1), 3);
    mov    x, p2             
    mov    y, p3
    xor    x, k3
    xor    y, k1
    add    x, y
    ror    x, 3
    mov    p3, x
    ; p2 = ROTR32((p1 ^ k2) + (p2 ^ k1), 5);
    mov    x, p1             
    mov    y, p2
    xor    x, k2
    xor    y, k1
    add    x, y
    ror    x, 5
    mov    p2, x
    ; p1 = ROTL32((p0 ^ k0) + (p1 ^ k1), 9);
    mov    x, p0             
    mov    y, p1
    xor    x, k0
    xor    y, k1
    add    x, y
    rol    x, 9
    mov    p1, x
    ; rotate block 32-bits
    mov    t, p3
    xchg   t, p2             ; XCHG(p0, p1); 
    xchg   t, p1             ; XCHG(p1, p2);
    xchg   t, p0             ; XCHG(p2, p3);
    mov    p3, t
    pop    eax
    inc    eax               ; i++
    cmp    al, LEA128_RNDS
    jnz    lea_l0


The assembly generated by MSVC using /O2 /Os is 232 bytes for the single call (encryption only) and 266 for 2 separate functions. The hand written x86 assembly for just the single call is currently 161 bytes.

See lx.asm for x86 assembly or lea.c for C source

Thanks to 0x4d_ for \LaTeX formulas.

Posted in assembly, cryptography, encryption, programming | Tagged , , , | Leave a comment

SM3 Cryptographic Hash Algorithm (Chinese Standard)


In December of 2007, the Chinese National Cryptographic Administration Bureau released the specification of a Trusted Cryptography Module, detailing a cryptoprocessor to be used within the Trusted Computing framework in China.

The module specifies a set of cryptographic algorithms that include the SM4 block cipher, the SM2 asymmetric algorithm and SM3 cryptographic hash algorithm.

I’ve discussed SM4 in an earlier post here.

SM3 uses a Merkle-Damgard construction similar to MD5 or SHA-2 that processes 512-bit input message blocks and returns a 256-bit hash value.

It was designed by Mathematician and cryptographer Xiaoyun Wang who is responsible for discovering attacks against many cryptographic hash functions, most notably MD5 and SHA-1.

At the CRYPTO 2004 conference, she and co-authors demonstrated collision attacks against MD5 and SHA-0. Attacks against SHA-1 were published later in 2005.

Unfortunately there’s no explanation for how the changes made to SHA-2 strengthen it against attacks, so I’ll briefly cover some of the minor differences between SM3 and SHA-2.


In the original SHA-2 specification, the 8 initialization values are the first 32 bits of the fractional parts of the square roots of the first 8 primes (2..19). See here for a more detailed description of how to create these.

H_0^{(0)}\: =\: 6a09e667
H_1^{(0)}\: =\: bb67ae85
H_2^{(0)}\: =\: 3c6ef372
H_3^{(0)}\: =\: a54ff53a
H_4^{(0)}\: =\: 510e527f
H_5^{(0)}\: =\: 9b05688c
H_6^{(0)}\: =\: 1f83d9ab
H_7^{(0)}\: =\: 5be0cd19

In SM3, the following 8 values are used:

H_0^{(0)}\: =\: 7380166f
H_1^{(0)}\: =\: 4914b2b9
H_2^{(0)}\: =\: 172442d7
H_3^{(0)}\: =\: da8a0600
H_4^{(0)}\: =\: a96f30bc
H_5^{(0)}\: =\: 163138aa
H_6^{(0)}\: =\: e38dee4d
H_7^{(0)}\: =\: b0fb0e4e

Unfortunately, there’s no explanation for why or how these values were selected.


Updating and Finalization

The update and final processing are exactly the same as SHA-2 which is based on the original design for MD4 by Ron Rivest. I will not explain here since the main changes take place inside the compression function.

Message Expansion

The 512-bit message is expanded into 68 32-bit words. The following 2 linear functions are defined:

P_1 for expansion, P_0 for compression

P_0(X) = X\oplus (X\lll 9)\oplus (X\lll 17)
P_1(X) = X\oplus (X\lll 15)\oplus (X\lll 23)


\text{for}\: i=16\: \text{to}\: 67\\  \hphantom{---}W_i\leftarrow P_1(W_{i-16}\oplus W_{i-9}\oplus(W_{i-3}\lll 15))\oplus (W_{i-13}\lll 7)\oplus   W_{i-6}\\  \text{endfor}

\text{for}\: i = 0\: \text{to}\: 63\\  \hphantom{---}W_i'=W_i\oplus W_{i+4}\\  \text{endfor}


The additional 4 words in SM3 are required for a separate expansion that I’ve inlined with the compression function to try minimize code usage.

W_i'=W_i\oplus W_{i+4}\: 0\leq i < 64.

You can see in main compression loop while creating TT1 and TT2.



For SHA-2, 64 words are used during compression which are described here in detail.

Both use the same sequence of sixty-four constant 32-bit words, K_0^{\{256\}}, K_1^{\{256\}}, \dots, K_{63}^{\{256\}}. These words represent the first thirty-two bits of the fractional parts of the cube roots of the first sixty-four prime numbers. In hex, these constant words are (from left to right)

For SM3, only 2 constants are used. One is selected depending on the position of loop counter, then a circular shift by loop counter is applied before adding into the state. This certainly helps make SM3 more compact than SHA-2 although I can’t say if it strengthens in some way.

T_i =  \begin{cases}  79cc4519 & 0\leq i \leq 15 \\[1ex]  7a879d8a & 16\leq i \leq 63  \end{cases}


There are 3 boolean operations used during the compression which are similar to bitwise majority MAJ and bitwise choice CH operations in SHA-2.

FF_i(X,Y,Z) =  \begin{cases}  X\oplus Y\oplus Z  & 0\leq i \leq 15 \\[1ex]  (X\wedge Y)\vee (X\wedge Z)\vee (Y\wedge Z) & 16\leq i \leq 63  \end{cases}

GG_i(X,Y,Z) =  \begin{cases}  X\oplus Y\oplus Z  & 0\leq i \leq 15 \\[1ex]  (X\wedge Y) \vee (\neg X \wedge Z) & 16\leq i \leq 63  \end{cases}

The GG operation is changed to use less instructions, the original is.


The new one along with others.


ABCDEFGH\leftarrow V^{(i)}
\begin{aligned}  \text{for}\; & i= 0\; \text{to}\: 63\\  & SS_1\leftarrow ((A\lll 12)+E+(T_i\lll i))\lll 7\\  & SS_2\leftarrow SS_1\oplus(A\lll 12)\\  & TT_1\leftarrow FF_i(A,B,C)+D+SS_2+W_i'\\  & TT_2\leftarrow GG_i(E,F,G)+H+SS_1+W_i\\  & D\leftarrow C\\  & C\leftarrow B\lll 9\\  & B\leftarrow A\\  & A\leftarrow TT_1\\  & H\leftarrow G\\  & G\leftarrow F\lll 19\\  & F\leftarrow E\\  & E\leftarrow P_0(TT_2)\\  \text{endfor}&\;  \end{aligned}

Updating state

At the end of compression function in SHA-2, the input hash is updated by adding the output from compression.


In SM3, it is instead XOR’d which follows a Davies-Meyer idea for compression functions.

H_0^{(i)}=a\oplus H_0^{(i-1)}
H_1^{(i)}=b\oplus H_1^{(i-1)}
H_2^{(i)}=c\oplus H_2^{(i-1)}
H_3^{(i)}=d\oplus H_3^{(i-1)}
H_4^{(i)}=e\oplus H_4^{(i-1)}
H_5^{(i)}=f\oplus H_5^{(i-1)}
H_6^{(i)}=g\oplus H_6^{(i-1)}
H_7^{(i)}=h\oplus H_7^{(i-1)}

The output state from compression is XORed with the previous hash value to produce the next hash value. In the first round when there is no previous hash value it uses a constant pre-specified initial values.

H_i=E_{m_i}(H_{i-1})\oplus H_{i-1}.

Here’s the full compression function.

// SM3 compression function.
// Processes one 512-bit block from ctx->buf and updates the 256-bit
// state ctx->s in place. Feed-forward is XOR (Davies-Meyer style)
// rather than the addition used in SHA-2.
// Relies on SM3_CTX, SWAP32, ROTL32, P0, P1, F, FF and GG being
// defined elsewhere in this file.
// NOTE(review): the closing braces below were missing in the posted
// listing (lost in extraction) and have been restored; the statements
// themselves are unchanged.
void SM3_Transform (SM3_CTX *ctx) 
{
    uint32_t tt1, tt2, i, t, ss1, ss2, x, y;
    uint32_t w[68], s[8];

    #define a s[0]
    #define b s[1]
    #define c s[2]
    #define d s[3]
    #define e s[4]
    #define f s[5]
    #define g s[6]
    #define h s[7]
    // load state into local buffer
    memcpy((uint8_t*)&s[0], (uint8_t*)&ctx->s.w[0], 8*4);
    // load data in big endian format
    for (i=0; i<16; i++) {
      w[i] = SWAP32(ctx->buf.w[i]);
    }
    // expand message: w[16..67]
    for (i=16; i<68; i++) {
      x = ROTL32(w[i- 3], 15);
      y = ROTL32(w[i-13],  7);
      x ^= w[i-16];
      x ^= w[i- 9];
      y ^= w[i- 6];
      w[i] = P1(x) ^ y; 
    }
    // compression function
    for (i=0; i<64; i++) {
      // round constant T_i, rotated by round index
      // (spec rotates by i mod 32 — assumes ROTL32 masks the count;
      // TODO confirm for i >= 32)
      t  = (i < 16) ? 0x79cc4519 : 0x7a879d8a;
      ss2 = ROTL32(a, 12);      
      ss1 = ROTL32(ss2 + e + ROTL32(t, i), 7);
      ss2 ^= ss1;
      // w'[i] = w[i] ^ w[i+4] is computed inline here
      tt1 = d + ss2 + (w[i] ^ w[i+4]);
      tt2 = h + ss1 + w[i];
      if (i < 16) {
        tt1 += F(a, b, c);
        tt2 += F(e, f, g);
      } else {
        tt1 += FF(a, b, c);
        tt2 += GG(e, f, g);       
      }
      d = c;
      c = ROTL32(b, 9);
      b = a;
      a = tt1;
      h = g;
      g = ROTL32(f, 19);
      f = e;
      e = P0(tt2); 
    }
    // Davies–Meyer idea for compression function: XOR feed-forward
    for (i=0; i<8; i++) {
      ctx->s.w[i] ^= s[i];
    }
    #undef a
    #undef b
    #undef c
    #undef d
    #undef e
    #undef f
    #undef g
    #undef h
}

And assembly just to illustrate

; SM3 compression function, x86 (NASM syntax) — illustration excerpt.
; State registers are addressed through edi via the _a.._h defines.
; NOTE(review): loop labels s3t_l0..s3t_l5 and the lodsd/stosd
; instructions of the byte-swap and store loops are not visible in
; this excerpt — they appear lost in extraction.
%define _a dword[edi+0*4]
%define _b dword[edi+1*4]
%define _c dword[edi+2*4]
%define _d dword[edi+3*4]
%define _e dword[edi+4*4]
%define _f dword[edi+5*4]
%define _g dword[edi+6*4]
%define _h dword[edi+7*4]


    ; load state into esi
    lea     esi, [ebx+state]
    push    esi  ; save for later

    ; allocate 512 bytes
    push    64
    pop     ecx
    shl     ecx, 3
    sub     esp, ecx

    ; load state into local buffer
    mov     edi, esp
    push    8
    pop     ecx
    rep     movsd

    ; store message in big endian format
    ; (loop body truncated: load/store around bswap missing here)
    mov     cl, 16
    bswap   eax
    loop    s3t_l0

    ; expand message
    mov     cl, 68-16
    ; x = ROTL32(w[i-3], 15);
    mov     eax, [edi- 3*4]
    rol     eax, 15
    ; y = ROTL32(w[i-13], 7);
    mov     ebx, [edi-13*4]
    rol     ebx, 7
    ; x ^= w[i-16];
    xor     eax, [edi-16*4]
    ; x ^= w[i-9];
    xor     eax, [edi- 9*4]
    ; y ^= w[i-6];
    xor     ebx, [edi- 6*4]
    ; x ^= ROTL32(x, 15) ^ ROTL32(x, 23);
    mov     edx, eax
    rol     edx, 15
    xor     eax, edx
    rol     edx, 23-15
    xor     eax, edx
    ; x ^= y;
    xor     eax, ebx
    ; w[i] = x;  (store instruction not visible in excerpt)
    loop    s3t_l1

    ; permute message
    mov     edi, esp
    ; t  = (i < 16) ? 0x79cc4519 : 0x7a879d8a;
    ; branchless select: sbb produces 0 or -1 mask from the compare
    cmp     ecx, 16
    sbb     eax, eax
    and     eax, 0x79cc4519 - 0x7a879d8a
    add     eax, 0x7a879d8a
    ; ss2 = ROTL32(a, 12);
    mov     ebx, _a
    rol     ebx, 12
    ; ss1 = ROTL32(ss2 + e + ROTL32(t, i), 7);
    rol     eax, cl
    add     eax, ebx
    add     eax, _e
    rol     eax, 7
    ; ss2 ^= ss1;
    xor     ebx, eax
    ; tt1 = d + ss2 + (w[i] ^ w[i+4]);
    mov     ebp, eax         ; save ss1
    mov     esi, [edi+4*ecx+32]       ; esi = w[i]
    mov     edx, esi         ; save w[i]
    xor     esi, [edi+4*ecx+32+16]    ; esi ^= w[i+4]
    add     esi, _d
    lea     eax, [esi+ebx]   ; set tt1
    ; tt2 = h + ss1 + w[i];
    lea     ebx, [edx+ebp]
    add     ebx, _h
    ; if (i < 16) {
    cmp     ecx, 16
    jae     s3t_l3
    ; tt1 += F(a, b, c);
    mov     edx, _c
    xor     edx, _b
    xor     edx, _a
    add     eax, edx
    ; tt2 += F(e, f, g);
    mov     edx, _g
    xor     edx, _f
    xor     edx, _e
    add     ebx, edx
    jmp     s3t_l4
    ; tt1 += FF(a, b, c);  (s3t_l3 label lost in extraction)
    mov     edx, _b
    or      edx, _a
    mov     ebp, _b
    and     edx, _c
    and     ebp, _a
    or      edx, ebp
    add     eax, edx
    ; tt2 += GG(e, f, g);
    mov     edx, _g
    xor     edx, _f
    and     edx, _e
    xor     edx, _g
    add     ebx, edx
    ; d = c;
    mov     edx, _c
    mov     _d, edx
    ; c = ROTL32(b, 9);
    mov     edx, _b
    rol     edx, 9
    mov     _c, edx
    ; b = a;
    mov     edx, _a
    mov     _b, edx
    ; a = tt1;
    mov     _a, eax
    ; h = g;
    mov     edx, _g
    mov     _h, edx
    ; g = ROTL32(f, 19);
    mov     edx, _f
    rol     edx, 19
    mov     _g, edx
    ; f = e;
    mov     edx, _e
    mov     _f, edx
    ; e = P0(tt2);
    ; e = x ^ ROTL32(x,  9) ^ ROTL32(x, 17)
    mov     edx, ebx
    rol     edx, 9
    xor     ebx, edx
    rol     edx, 17-9
    xor     ebx, edx
    mov     _e, ebx

    inc     ecx
    cmp     ecx, 64
    jnz     s3t_l2

    ; release the 512-byte local buffer (ecx == 64 here)
    mov     esi, esp
    lea     esp, [esi+ecx*8]

    ; XOR-feed-forward into the saved state pointer
    pop     edi
    mov     cl, 8
    xor     eax, [edi]
    loop    s3t_l5


Size of C generated assembly using /O2 /Os switches is approx. 609 bytes. The x86 assembly is approx. 470 bytes. See sources here.

Thanks to 0x4d_ for \LaTeX equations.

Posted in assembly, cryptography, programming, security | Tagged , , , , | Leave a comment

Chaskey-LTS Block Cipher


The Chaskey cipher is a 128-bit block, 128-bit key symmetric encryption algorithm which is the underlying function used for the Chaskey Message Authentication Code (MAC).

It’s based on an Even-Mansour construction which makes it very simple to implement and because of its permutation function derived from SipHash, using only ARX instructions makes it highly suitable for many architectures.

Shimon Even and Yishay Mansour published a paper in 1997 titled A Construction of a Cipher From a Single Pseudorandom Permutation which suggested an incredibly simple but provably secure design for a cryptographic algorithm.


Key Whitening, a technique invented by Ron Rivest in 1984 is performed before and after the application of F which is a publicly known permutation function.

Key Whitening consists of using a simple XOR operation of the key with data which is intended to increase resistance to brute force attack and is used in many modern ciphers today.

F function

The permutation function uses 16 rounds of ADD/ROL/XOR (ARX) instructions for encryption and is derived from the SipHash algorithm.

SipHash is a fast short-input Pseudo-Random-Function(PRF) designed and published in 2012 by Jean-Philippe Aumasson and Daniel J. Bernstein.

The decryption of ciphertext is simply reversing the permutation function using SUB/ROR/XOR.


void chas_encrypt(int enc, void *key, void *buf) 
   int      i;
   uint32_t *v=(uint32_t*)buf;
   uint32_t *k=(uint32_t*)key;
   // pre-whiten
   for (i=0; i<4; i++) {
     v[i] ^= k[i];

   // apply permutation function
   for (i=0; i<16; i++) {
     if (enc==CHASKEY_ENCRYPT)
       v[0] += v[1]; 
       v[1]=ROTL32(v[1], 5); 
       v[1] ^= v[0]; 
       v[2] += v[3]; 
       v[3]=ROTL32(v[3], 8); 
       v[3] ^= v[2];
       v[0] += v[3]; 
       v[3] ^= v[0];
       v[2] += v[1]; 
       v[1]=ROTL32(v[1], 7); 
       v[1] ^= v[2]; 
     } else {     
       v[1] ^= v[2];
       v[1]=ROTR32(v[1], 7);
       v[2] -= v[1];
       v[3] ^= v[0];
       v[0] -= v[3];
       v[3] ^= v[2];
       v[3]=ROTR32(v[3], 8);
       v[2] -= v[3];
       v[1] ^= v[0];
       v[1]=ROTR32(v[1], 5);
       v[0] -= v[1];
   // post-whiten
   for (i=0; i<4; i++) {
     v[i] ^= k[i];

The assembly is straight forward. We load buffer into ESI, key into EDI and enc into ECX. Load 4 32-bit registers with 128-bit data, apply pre-whitening with 128-bit key. Test ECX for zero, then save flag status with PUSHFD. This then frees ECX to use as a loop counter which is set to 16 (for LTS).

After each round of permutation, restore the flag status with POPFD and keep looping until ECX is zero. Finally apply post-whitening using 128-bit key, save and return.

; Chaskey-LTS permutation with Even-Mansour whitening (x86, NASM).
; NOTE(review): excerpt — the labels ck_l0/ck_l1/ck_l2, the lodsd loads
; implied by the xchg sequence, the PUSHFD/POPFD flag save/restore
; described in the surrounding prose, and the final stosd stores are
; not visible here; they appear lost in extraction.
%define v0 eax
%define v1 ebx
%define v2 edx
%define v3 ebp

    lea     esi, [esp+32+4]
    xchg    ecx, eax          ; ecx = enc
    xchg    edi, eax          ; edi = key
    xchg    eax, esi          ; esi = buf
    push    esi
    ; load buf (128 bits into v0..v3)
    xchg    eax, v3
    xchg    eax, v1
    xchg    eax, v2
    xchg    eax, v3
    ; pre-whiten
    xor     v0, [edi   ]
    xor     v1, [edi+ 4]
    xor     v2, [edi+ 8]
    xor     v3, [edi+12]
    test    ecx, ecx
    mov     cl, 16            ; 16 rounds (LTS)
    jz      ck_l1
    ; encrypt round (target of ck_l0)
    add     v0, v1
    rol     v1, 5
    xor     v1, v0
    rol     v0, 16
    add     v2, v3
    rol     v3, 8
    xor     v3, v2
    add     v0, v3
    rol     v3, 13
    xor     v3, v0
    add     v2, v1
    rol     v1, 7
    xor     v1, v2
    rol     v2, 16
    jmp     ck_l2
    ; decrypt round (inverse steps in reverse order; target of ck_l1)
    ror     v2, 16
    xor     v1, v2
    ror     v1, 7
    sub     v2, v1
    xor     v3, v0
    ror     v3, 13
    sub     v0, v3
    xor     v3, v2
    ror     v3, 8
    sub     v2, v3
    ror     v0, 16
    xor     v1, v0
    ror     v1, 5
    sub     v0, v1
    loop    ck_l0
    ; post-whiten
    xor     v0, [edi   ]
    xor     v1, [edi+ 4]
    xor     v2, [edi+ 8]
    xor     v3, [edi+12]
    pop     edi
    ; save buf
    xchg    eax, v1
    xchg    eax, v2
    xchg    eax, v3


C code is approx. 185 bytes using /O2 /Os flags. Assembly is currently 132 bytes although It’s difficult to say if that could be reduced any further.

See sources here

Posted in assembly, cryptography, encryption, programming, security | Tagged , , | Leave a comment

SM4 block cipher (Chinese Standard for WAPI)


SM4 (formerly SMS4) is a 128-bit block cipher with a 128-bit user key and 32 rounds. It’s used in the WLAN Authentication and Privacy Infrastructure (WAPI), a Chinese WLAN national standard.

It was published or rather declassified in 2006 and standardized in 2012.

I’m not clear what the SM stands for. There are other specifications like SM2 which describes Elliptic Curve algorithms for digital signatures and key exchange. SM3 which is a cryptographic hash algorithm and of course, SM4 a symmetric block cipher which I’ll implement here in C and x86 assembly optimized for size.

The only other specification to mention is SM9 which documents a set of identity-based cryptographic schemes from pairings.

English translations of the specifications for SM2, SM3 have been provided by Sean Shen and Xiaodong Lee at China Internet Network Information Center (CNNIC) a non-profit organization based in China.

But the only english translation for SM4 was by Whitfield Diffie and Dr. George Ledin.

About the C code

If you want high performance implementations of SM4 in C and x86/x86-x64 assembly, please look at GMSSL which appears to be a fork of OpenSSL but includes SM2, SM3 SM4 and SM9.

Mixer-substitution T

T is a substitution that generates 32 bits of output from 32 bits of input. Source code includes the full 256-byte array which obviously increases the size of code considerably.

  • Non-Linear function

(b_0, b_1, b_2, b_3) = \tau (A) = (Sbox(a_0),Sbox(a_1),Sbox(a_2),Sbox(a_3))

The Sbox is a 256-byte array and there’s no description of how these elements were chosen. If anyone knows, please leave a comment.


  • Linear function (for key setup)

L'(B) = B \oplus (B \lll 13)\oplus (B \lll 23)

  • Linear function (for encryption)

C=L(B)=B\oplus(B\lll 2)\oplus (B\lll 10)\oplus (B\lll 18)\oplus(B\lll 24)


    ; SM4 mixer-substitution T (x86, NASM) — excerpt.
    ; Carry flag selects the linear function: CF=0 -> key-setup L',
    ; CF=1 -> encryption L (see prose re STC/CLC below).
    ; NOTE(review): labels t_l0..t_l3, the sbox lookup loop body around
    ; "ror eax, 8", and the function epilogue are not visible here —
    ; they appear lost in extraction.
    pop    ebx
    xor    eax, x1
    xor    eax, x2
    xor    eax, x3
    ; apply non-linear substitution
    mov    cl, 4
    ror    eax, 8
    loop   t_l1
    mov    ebx, eax
    mov    ecx, eax
    mov    edx, eax
    mov    ebp, eax
    ; apply linear substitution
    jc     t_l2
    ; for key setup: L'(B) = B ^ (B <<< 13) ^ (B <<< 23)
    rol    ebx, 13
    rol    ecx, 23
    xor    eax, ebx
    xor    eax, ecx
    jmp    t_l3
    ; for encryption: L(B) = B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24)
    rol    ebx, 2
    rol    ecx, 10
    rol    edx, 18
    rol    ebp, 24
    xor    eax, ebx
    xor    eax, ecx
    xor    eax, edx
    xor    eax, ebp
    mov    [esp+_eax], eax    
; in:  eax
; out: eax  
    call   t_l0  ; pushes address of sbox on stack
    ; sbox for SM4 goes here

The round function F

F(X_0,X_1,X_2,X_3,rk)=X_0\oplus T(X_1\oplus X_2\oplus X_3\oplus rk)

The value of 1 in last parameter to T function tells it to use the linear function for encryption. In x86 assembly, I use the Carry Flag (CF) setting or clearing with STC and CLC instructions.


The constant parameter CK

(ck_{i,0},ck_{i,1},ck_{i,2},ck_{i,3}) \in \Big(\mathbb{Z}_2^8\Big)^4,\quad \text{where}\quad ck_{i,j}=(4i+j)\times 7 \pmod{256}

You can include the precomputed array.


Or generate at runtime using some code.


; SM4 round-constant generator: ck[i,j] = (4*i + j) * 7 mod 256,
; packed into one 32-bit word (one byte per j).
; expects ecx to hold index
; returns constant in eax
; NOTE(review): loop label ck_l0 (top of the j-loop, presumably at the
; "shl eax, 8" line) is not visible — lost in extraction.
    xor    eax, eax          ; ck = 0
    cdq                      ; j  = 0 (edx cleared; eax is 0 here)
    shl    eax, 8            ; ck <<= 8
    lea    ebx, [ecx*4+edx]  ; ebx = (i*4) + j
    imul   ebx, ebx, 7       ; ebx *= 7
    or     al, bl            ; ck |= ebx %= 256
    inc    edx               ; j++
    cmp    edx, 4            ; j<4
    jnz    ck_l0
    mov    [esp+_eax], eax   ; return ck

Key setup


    ; SM4 key schedule (x86, NASM) — excerpt.
    ; NOTE(review): the lodsd loads before each bswap, the round-loop
    ; label sk_l1, and the rk0..rk3 register %defines are not visible
    ; here — lost in extraction.
    mov    edi, [esp+32+4]  ; edi = ctx
    mov    esi, [esp+32+8]  ; esi = 128-bit key
    ; load the key (big-endian words)
    bswap  eax
    xchg   eax, rk0
    bswap  eax
    xchg   eax, rk1
    bswap  eax
    xchg   eax, rk2
    bswap  eax
    xchg   eax, rk3
    ; xor FK values (system parameters from the SM4 spec)
    xor    rk0, 0xa3b1bac6    
    xor    rk1, 0x56aa3350    
    xor    rk2, 0x677d9197    
    xor    rk3, 0xb27022dc
    xor    ecx, ecx
    ; round loop (sk_l1) starts here
    call   CK
    call   T_function 
    xor    rk0, eax
    mov    eax, rk0
    stosd                ; rk[i] = rk0
    xchg   rk0, rk1
    xchg   rk1, rk2
    xchg   rk2, rk3
    inc    ecx           ; i++
    cmp    ecx, 32
    jnz    sk_l1       



    ; SM4 encryption (x86, NASM) — excerpt.
    ; NOTE(review): the lodsd loads before each bswap on input, the
    ; round-loop label e_l0, the stosd stores on output, and the
    ; x0..x3 register %defines are not visible — lost in extraction.
    mov    edi, [esp+32+4] ; edi = ctx
    mov    esi, [esp+32+8] ; esi = buf
    push   esi ; save buffer for later
    ; load data (big-endian words)
    bswap  eax
    xchg   eax, x0
    bswap  eax    
    xchg   eax, x1
    bswap  eax    
    xchg   eax, x2
    bswap  eax    
    xchg   eax, x3
    ; do 32 rounds
    push   32
    pop    ecx
    ; apply F round (e_l0):  x0 ^= T(x1 ^ x2 ^ x3 ^ rk[i])
    mov    eax, [edi] ; rk[i]
    call   T_function     
    xor    x0, eax
    xchg   x0, x1
    xchg   x1, x2
    xchg   x2, x3
    loop   e_l0
    ; save data (reverse word order per the spec's final reversal R)
    pop    edi
    xchg   eax, x3
    bswap  eax    
    xchg   eax, x2
    bswap  eax    
    xchg   eax, x1
    bswap  eax    
    xchg   eax, x0
    bswap  eax    


If there was a way to generate the sbox on the fly, this could be a good cipher for resource constrained devices. The size of code using /O2 /Os switches resulted in 690 bytes. The assembly for just encryption is approx. 500 bytes.

If you want to contribute or just access full source code, see here

Thanks to 0x4d_ for \LaTeX formulas.

Posted in assembly, cryptography, encryption, programming | Tagged , , , , , , | 1 Comment

CubeMAC128 Message Authentication Code


CubeMAC128 is a cryptographic Message Authentication Code (MAC) designed for packet authentication that was proposed in 2010 by mathematician and cryptographer Daniel J. Bernstein.

The CubeMAC proposal was in response to NIST concerns about using CubeHash as a MAC function for small messages. The parameters for MAC were 16 initial rounds, a 32-byte block and 32 final rounds. The recommended key length is 512-bits or 64-bytes.

You can find a detailed discussion about it in CubeHash parameter tweak: 10 times smaller MAC overhead

Permutation Function

Like other designs by Dan, the pseudo-random-function used to provide properties of confusion and diffusion only uses Add-Rotate-Xor operations which makes it suitable for many different architectures.

The following is taken from reference code.

// CubeHash round function (taken from D. J. Bernstein's reference code).
// Applies CUBEHASH_ROUNDS rounds of the ARX permutation to the 32-word
// state state->x.
// NOTE(review): the closing braces were missing in the posted listing
// (lost in extraction) and have been restored; statements unchanged.
static void transform(hashState *state)
{
  int i;
  int r;
  crypto_uint32 y[16];

  for (r = 0;r < CUBEHASH_ROUNDS;++r) {
    for (i = 0;i < 16;++i) state->x[i + 16] += state->x[i];
    for (i = 0;i < 16;++i) y[i ^ 8] = state->x[i];
    for (i = 0;i < 16;++i) state->x[i] = ROTATE(y[i],7);
    for (i = 0;i < 16;++i) state->x[i] ^= state->x[i + 16];
    for (i = 0;i < 16;++i) y[i ^ 2] = state->x[i + 16];
    for (i = 0;i < 16;++i) state->x[i + 16] = y[i];
    for (i = 0;i < 16;++i) state->x[i + 16] += state->x[i];
    for (i = 0;i < 16;++i) y[i ^ 4] = state->x[i];
    for (i = 0;i < 16;++i) state->x[i] = ROTATE(y[i],11);
    for (i = 0;i < 16;++i) state->x[i] ^= state->x[i + 16];
    for (i = 0;i < 16;++i) y[i ^ 1] = state->x[i + 16];
    for (i = 0;i < 16;++i) state->x[i + 16] = y[i];
  }
}

The changes made here are intended to reduce code size; any cost to performance is incidental rather than intentional. Rather than move upwards in steps of one for variable i, it’s set to 15 and moved down until less than zero.

This should eliminate a CMP opcode and depends on Sign status flag (SF) to indicate when to end loop.

// permutation function
// Compact restructuring of the CubeHash transform above: the two
// half-rounds (rotate-by-7/swap-8 and rotate-by-11/swap-4) are folded
// into one inner loop parameterized by (k, j*4, j).
// Loops count downward so the compiled code can test the sign flag
// instead of comparing against a bound (see prose above).
// NOTE(review): closing braces restored — they were lost in extraction.
void permute(cube_state *s)
{
    int      i, j, k, n;
    uint32_t y[16];

    for (n=16; n>0; n--) {
      for (k=7, j=2; j>0; k+=4, j--) {
        for (i=15; i>=0; --i) s->w[i + 16] += s->w[i];
        for (i=15; i>=0; --i) y[i ^ (j*4)] = s->w[i];
        for (i=15; i>=0; --i) s->w[i] = ROTL32(y[i], k);
        for (i=15; i>=0; --i) s->w[i] ^= s->w[i + 16];
        for (i=15; i>=0; --i) y[i ^ j] = s->w[i + 16];
        for (i=15; i>=0; --i) s->w[i + 16] = y[i];
      }
    }
}

The assembly works similar except we’re using the LOOP instruction quite a lot with PUSHAD/POPAD.

; ==================================
      ; permutation function
      ; edi = s
      ; NOTE(review): loop labels pm_l0..pm_l6 are not visible in this
      ; excerpt — lost in extraction. Each "loop/jnz pm_*" below targets
      ; the start of the inner loop body directly above it.
      pushad                   ; save registers
      pushad                   ; allocate 32-bytes
      pushad                   ; allocate 32-bytes (64 total for y[16])
      mov    esi, esp          ; esi = y[16]
      ; for (n=16; n>0; n--)
      push   16
      pop    ecx
      ; for (k=7, j=2; j>0; k+=4, j--)
      push   ecx               ; save n
      mov    cl, 16
      push   2
      pop    ebp               ; j=2
      push   7
      pop    edx               ; k=7
      ; **************************
      ; for (i=15; i>=0; --i)
      ;   s->w[i + 16] += s->w[i];
      ; **************************
      mov    eax, [edi]
      add    [edi+64], eax
      loop   pm_l2
      ; **************************
      ; for (i=15; i>=0; --i)
      ;   y[i ^ (j*4)] = s->w[i];
      ; **************************
      shl    ebp, 2
      lea    ebx, [ecx-1]
      mov    eax, [edi+ebx*4]
      xor    ebx, ebp
      mov    [esi+ebx*4], eax
      loop   pm_l3
      ; **************************
      ; for (i=15; i>=0; --i)
      ;   s->w[i] = ROTL32(y[i], k);
      ; **************************
      xchg   ecx, edx          ; rotate count must be in cl
      rol    eax, cl
      dec    edx
      jnz    pm_l4
      ; **************************
      ; for (i=15; i>=0; --i)
      ;   s->w[i] ^= s->w[i + 16];
      ; **************************
      mov    eax, [edi]
      xor    eax, [edi+64]
      loop   pm_l5
      ; **************************
      ; for (i=15; i>=0; --i)
      ;   y[i ^ j] = s->w[i + 16];
      ; **************************
      lea    ebx, [ecx-1]
      mov    eax, [edi+ebx*4+64]
      xor    ebx, ebp
      mov    [esi+ebx*4], eax
      loop   pm_l6
      ; **************************
      ; for (i=15; i>=0; --i)
      ;   s->w[i + 16] = y[i];
      ; **************************
      add    edi, 64
      rep    movsd

      add    edx, 4          ; k += 4
      dec    ebp             ; j--
      jnz    pm_l1

      pop    ecx
      loop   pm_l0           ; will set CF to 0

      popad                  ; release 32-bytes
      popad                  ; release 32-bytes
      popad                  ; restore registers

Sponge Function

We absorb 32-byte blocks of data into the state before applying the permutation function.

// absorb data into the state
// XORs msg into the 32-byte rate portion of the state, applying the
// permutation each time a full block has been absorbed. Returns the
// index of the next free byte (0..31) so the caller can place padding.
// NOTE(review): the posted listing had lost its closing braces and the
// permute() call on a full block; both are restored here to match the
// inlined assembly version ("call ebp ; permute(s)") and the prose.
uint32_t absorb(cube_state *s, 
    const void *msg, uint32_t len)
{
    uint32_t i, idx=0;
    uint8_t  *p=(uint8_t*)msg;
    for (i=0; i<len; i++) {
      s->b[idx++] ^= p[i];
      if (idx == 32) {
        permute(s);   // full 32-byte block absorbed
        idx = 0;
      }
    }
    return idx;
}

We inline this to minimize the overall size of code.

      ; ==================================
      ; absorb data into state
      ; ebx = data
      ; ecx = len
      ; edi = s
      ; NOTE(review): labels absorb/abs_l0/abs_l1/cm_l0 are not visible
      ; in this excerpt — lost in extraction. abs_l0 is the top of the
      ; byte loop ("mov dl, [ebx]").
      xor    eax, eax      ; idx = 0
      jecxz  abs_l1        ; exit if len == 0
      mov    dl, [ebx]
      xor    [edi+eax], dl ; s->b[idx] ^= *data
      inc    eax           ; idx++
      inc    ebx           ; data++
      cmp    al, 32        ; absorbed block?
      loopne abs_l0        ; while (al != 32 && ecx != 0)
      jne    abs_l1        ; if (al != 32 && ecx == 0) goto abs_l1
      call   ebp           ; permute(s)
      jmp    absorb        ; keep going
      cmc                  ; CF = !CF 
      jc     cm_l0         ; loop twice

MAC Function

This is purely intended for small messages; authenticating network packets for example. It takes a 512-bit key, a message and returns 128-bit MAC.

The length of message shouldn’t exceed 4GB but that should be obvious. I was naturally thinking about packets of 512-bytes or less 🙂

// cube message authentication code
// mkey   : pointer to key (64 bytes are always absorbed)
// keylen : key length (unused in the visible body)
// msg    : message to authenticate
// msglen : message length in bytes (32-bit, so < 4GB)
// tag    : receives the 128-bit MAC
//
// NOTE(review): braces and the permute() invocations were lost in
// the blog formatting -- the assembly version runs the permutation
// after the parameter setup, after the end bit, and twice after the
// final bit. Confirm against the published sources.
void cube_macx(const void *mkey, uint32_t keylen,
    const void *msg, uint32_t msglen, void *tag)
    uint32_t   idx;  
    cube_state s;
    // 1. initialize state
    memset(&s, 0, sizeof(cube_state));

    // CubeHash-style parameters in the first three state words
    s.w[0] = 16; // 16-byte output
    s.w[1] = 32; // 32-byte block size
    s.w[2] = 16; // 16 rounds per block
    // 2. absorb key
    absorb (&s, mkey, 64);
    // 3. absorb message; idx = offset into final partial block
    idx = absorb(&s, msg, msglen);

    // 4. absorb end bit (0x80 after the last message byte)
    s.b[idx] ^= 0x80;
    // 5. absorb final bit (flip LSB of the last state word)
    s.w[31] ^= 1;
    // 6. return 128-bit tag (first 16 bytes of state)
    memcpy(tag, s.b, 16);  

So here’s the rest of assembly code with permutation function stripped out.

      ; ==================================
      ; cube_macx: MAC built on the permutation above
      ; NOTE(review): several lines appear lost in the blog
      ; formatting:
      ;  - the "ld_pm:" label (the call/pop pair fetches the address
      ;    of the permutation code into ebp),
      ;  - "stosd" stores after each "mov al, N" below (as written
      ;    the moves would simply overwrite al),
      ;  - label definitions (cm_l0, absorb, abs_l0, abs_l1),
      ;  - a preceding "pushad" (the [esp+32+4] offset implies one).
      ; Confirm against the published sources.
      ; ==================================
      call   ld_pm
      ; permutation function goes here
      pop    ebp           ; ebp = address of permute
      lea    esi, [esp+32+4] ; esi = first stack argument
      xor    eax, eax
      xor    ecx, ecx
      mov    cl, 128       ; sizeof(cube_state)
      sub    esp, ecx      ; allocate state on the stack
      ; 1. initialize local state
      ; memset(&s, 0, sizeof(cube_state));
      mov    edi, esp      ; edi = &s
      rep    stosb         ; zero all 128 bytes

      mov    edi, esp      ; edi = &s again
      push   edi           ; save &s
      ; s.w[0] = 16;  (stores via stosd presumably lost -- see NOTE)
      mov    al, 16
      ; s.w[1] = 32;
      mov    al, 32
      ; s.w[2] = 16;
      mov    al, 16
      pop    edi           ; edi = &s
      ; permute(&s);
      call   ebp
      ; 2. absorb key
      ; 3. absorb message
      xchg   ebx, eax        ; ebx = data
      xchg   ecx, eax        ; ecx = len
      ; ==================================
      ; absorb data into state (same code as documented earlier;
      ; labels lost -- see NOTE at top)
      ; ebx = data
      ; ecx = len
      ; edi = s
      xor    eax, eax      ; idx = 0
      jecxz  abs_l1        ; exit if len == 0
      mov    dl, [ebx]     ; dl = *data
      xor    [edi+eax], dl ; s->b[idx] ^= *data
      inc    eax           ; idx++
      inc    ebx           ; data++
      cmp    al, 32        ; absorbed a full 32-byte block?
      loopne abs_l0        ; while (al != 32 && --ecx != 0)
      jne    abs_l1        ; partial block, data exhausted: done
      call   ebp           ; permute(s)
      jmp    absorb        ; keep absorbing
      cmc                  ; CF = !CF
      jc     cm_l0         ; loop twice: key pass, then message pass
      ; 4. absorb end bit (eax = idx left by absorb)
      xor    byte[edi+eax], 0x80
      call   ebp           ; permute(&s)

      ; 5. absorb final bit (s.w[31] ^= 1)
      xor    byte[edi+31*4], 1
      call   ebp           ; permute(&s) twice for finalization
      call   ebp

      ; 6. return 128-bit tag
      xchg   eax, edi        ; edi = tag
      xchg   eax, esi        ; esi = s
      mov    cl, 16          ; copy 16 bytes
      rep    movsb

      ; release stack (124 + 4-byte pop = 128 allocated above)
      pop    eax
      add    esp, 124


The size of C code using /Os /O2 switches was approx. 380 bytes. The assembly code is currently 197 bytes which I doubt can be reduced much more.

I’ve documented Poly1305 here which is another MAC function designed by the same author.

If I had to choose a compact function for authentication of small packets, I’d probably pick CubeMAC instead although it hasn’t received as much scrutiny by the cryptographic community as Poly1305.

Posted in assembly, cryptography, encryption, programming, security | Tagged , , , , , | Leave a comment

Speck Block Cipher


Speck is a family of lightweight block ciphers publicly released by the National Security Agency (NSA) in June 2013. It’s an ARX (add-rotate-xor) design optimized for performance in software implementations and has been suggested for use on resource constrained devices.

Speck supports a variety of block and key sizes. A block is always two words, but the words may be 16, 24, 32, 48 or 64 bits in size. The corresponding key is 2, 3 or 4 words. The round function consists of two rotations: adding the right word to the left word, xoring the key into the left word, and then xoring the left word into the right word. The number of rounds depends on the parameters selected.

There’s some interesting speculation on the reasons for publishing Simon and Speck in a post to a crypto mailing list: Speculation on the origin of Speck and Simon.

There are two implementations provided here. The first uses a 64-bit block size, a 128-bit key and 27 rounds, since this fits easily into 32-bit x86 code. The other uses a 128-bit block size, a 256-bit key and 34 rounds, which suits the x86-64 architecture.

Key schedule

void speck_setkey(void *in, void *out)
  uint32_t i, t, k0, k1, k2, k3;

  // copy 128-bit key to local space
  k0 = ((uint32_t*)in)[0];
  k1 = ((uint32_t*)in)[1];
  k2 = ((uint32_t*)in)[2];
  k3 = ((uint32_t*)in)[3];
  // expand 128-bit key into round keys
  for (i=0; i<SPECK_RNDS; i++)
    ((uint32_t*)out)[i] = k0;
    k1 = (ROTR32(k1, 8) + k0) ^ i;
    k0 = ROTL32(k0, 3) ^ k1;
    // rotate left 32-bits
    XCHG(k3, k2, t);
    XCHG(k3, k1, t);
%define SPECK_RNDS 27
; register allocation for the key schedule
%define k0 eax    
%define k1 ebx    
%define k2 ebp    
%define k3 edx
    ; NOTE(review): lines appear lost in the blog formatting: a
    ; "pushad" (implied by the [esp+32+...] offsets), "lodsd" loads
    ; before each xchg below (the xchg chain alone moves nothing
    ; useful), the "spk_sk:" loop label, the "stosd" implementing
    ; the round-key store (k0 == eax), and the trailing
    ; "popad"/"ret". Confirm against the published sources.
    mov    esi, [esp+32+4]   ; esi = in (key)
    mov    edi, [esp+32+8]   ; edi = ks (round keys out)
    xchg   eax, k3           ; load k0..k3 from the key -- see NOTE
    xchg   eax, k1
    xchg   eax, k2
    xchg   eax, k3
    xor    ecx, ecx          ; i = 0
    ; ((uint32_t*)ks)[i] = k0;   (store instruction lost -- see NOTE)
    ; k1 = (ROTR32(k1, 8) + k0) ^ i;
    ror    k1, 8
    add    k1, k0
    xor    k1, ecx
    ; k0 = ROTL32(k0, 3) ^ k1;
    rol    k0, 3
    xor    k0, k1
    ; rotate the word queue: k1 <- k2 <- k3 <- old k1
    xchg   k3, k2
    xchg   k3, k1
    ; i++
    inc    ecx
    cmp    cl, SPECK_RNDS    
    jnz    spk_sk   


void speck_encrypt(void *in, void *keys, int enc)
  uint8_t i;
  uint32_t *ks=(uint32_t*)keys;
  // copy input to local space
  uint32_t x0=((uint32_t*)in)[0];
  uint32_t x1=((uint32_t*)in)[1];
  for (i=0; i<SPECK_RNDS; i++)
    if (enc==SPECK_DECRYPT)
      x1 = ROTR32(x0 ^ x1, 3);
      x0 = ROTL32((x0 ^ ks[SPECK_RNDS-1-i]) - x1, 8);        
    } else {
      x0 = (ROTR32(x0, 8) + x1) ^ ks[i];
      x1 = ROTL32(x1, 3) ^ x0;
  // save result
  ((uint32_t*)in)[0] = x0;
  ((uint32_t*)in)[1] = x1;
; register allocation for the block words
%define x0 eax    
%define x1 ebx
    ; NOTE(review): lines appear lost in the blog formatting:
    ; "lodsd" loads before the xchg instructions below (the
    ; end-of-line comments imply loads), the labels "spk_d0" (at
    ; "xor x1, x0"), "spk_e0" (at "ror x0, 8") and "spk_end" (at
    ; "pop edi"), a key-pointer advance in the encrypt loop, and the
    ; final "stosd" stores after "pop edi". Confirm against the
    ; published sources.
    lea    esi, [esp+32+4]   ; esi = stack args (pushad implied)
    xchg   edi, eax          ; edi = ks
    xchg   eax, ecx          ; ecx = enc
    xchg   eax, esi          ; esi = in
    push   esi               ; save in for the final store
    xchg   eax, x1           ; load x0/x1 from in -- see NOTE
    xchg   eax, x1
    test   ecx, ecx          ; enc == 0 ?
    mov    cl, SPECK_RNDS    ; ecx = round counter
    jz     spk_e0            ; zero selects the encrypt loop
    ; decrypt loop: x1 = ROTR32(x1 ^ x0, 3);
    xor    x1, x0
    ror    x1, 3
    ; x0 = ROTL32((x0 ^ ks[SPECK_RNDS-1-i]) - x1, 8);
    xor    x0, [edi+4*ecx-4] ; round keys in reverse order
    sub    x0, x1
    rol    x0, 8
    loop   spk_d0
    jmp    spk_end    
    ; encrypt loop: x0 = (ROTR32(x0, 8) + x1) ^ ks[i];
    ror    x0, 8
    add    x0, x1
    xor    x0, [edi]
    ; x1 = ROTL32(x1, 3) ^ x0;
    rol    x1, 3
    xor    x1, x0
    loop   spk_e0
    pop    edi               ; edi = in
    ; ((uint32_t*)in)[0] = x0;  (stores lost -- see NOTE)
    xchg   eax, x1
    ; ((uint32_t*)in)[1] = x1;
Just encryption

Most block ciphers are usually insecure by themselves. Counter Mode (CTR) is recommended for turning a block cipher into a stream cipher which then only requires encryption. Here’s the function with key scheduling and encryption integrated.

    ; speck64/128 encrypt-only, key schedule fused into the round loop
    ; NOTE(review): lines appear lost in the blog formatting:
    ; "lodsd" loads before each xchg (the end-of-line comments imply
    ; loads), the loop label "spk_el" (at "ror x0, 8"), and the final
    ; "stosd" stores after "pop edi". A surrounding pushad/popad is
    ; implied by the [esp+32+...] offsets. Confirm against the
    ; published sources.
    mov    esi, [esp+32+8]   ; esi = in
    push   esi               ; save
    xchg   eax, x0           ; x0 = in[0]
    xchg   eax, x1           ; x1 = in[1]
    mov    esi, [esp+32+8]   ; esi = key (offset shifted by the push)
    xchg   eax, k0           ; k0 = key[0] 
    xchg   eax, k1           ; k1 = key[1]
    xchg   eax, k2           ; k2 = key[2]
    xchg   eax, k3           ; k3 = key[3]    
    xor    eax, eax          ; i = 0
    ; x0 = (ROTR32(x0, 8) + x1) ^ k0;
    ror    x0, 8
    add    x0, x1
    xor    x0, k0
    ; x1 = ROTL32(x1, 3) ^ x0;
    rol    x1, 3
    xor    x1, x0
    ; key schedule advances alongside: k1 = (ROTR32(k1, 8) + k0) ^ i;
    ror    k1, 8
    add    k1, k0
    xor    k1, eax
    ; k0 = ROTL32(k0, 3) ^ k1;
    rol    k0, 3
    xor    k0, k1    
    xchg   k3, k2            ; rotate key-word queue
    xchg   k3, k1
    ; i++
    inc    eax
    cmp    al, SPECK_RNDS    
    jnz    spk_el
    pop    edi               ; edi = in; stores lost -- see NOTE
    xchg   eax, x0
    xchg   eax, x1

The x86-64 version to support 256-bit keys and 128-bit blocks is only 24 bytes more.

; speck128/256 encryption in 88 bytes
; 128-bit block, 256-bit key, 34 rounds; key schedule fused in.
; NOTE(review): lines appear lost in the blog formatting: the
; "%endif" matching "%ifndef BIN", the "speck128_encryptx:" entry
; label, the loop label "spk_el" (at "ror x1, 8"), and the trailing
; "ret". Register use: rbp/rbx/rdi/rsi are saved/restored, so the
; callee-saved set of neither SysV nor MS x64 is violated; the
; key pointer arrives in rcx and the block pointer in rdx.
%ifndef BIN
    global speck128_encryptx

; key words
%define k0 rdi    
%define k1 rbp    
%define k2 rsi    
%define k3 rcx

; block words
%define x0 rbx    
%define x1 rdx

    push   rbp               ; preserve callee-saved registers
    push   rbx
    push   rdi
    push   rsi   

    mov    k0, [rcx   ]      ; k0 = key[0]
    mov    k1, [rcx+ 8]      ; k1 = key[1]
    mov    k2, [rcx+16]      ; k2 = key[2]
    mov    k3, [rcx+24]      ; k3 = key[3]
    push   rdx               ; save block pointer for final store
    mov    x0, [rdx  ]       ; x0 = in[0]
    mov    x1, [rdx+8]       ; x1 = in[1] 
    xor    eax, eax          ; i = 0
    ; round: x1 = (ROTR64(x1, 8) + x0) ^ k0;
    ror    x1, 8
    add    x1, x0
    xor    x1, k0
    ; x0 =  ROTL64(x0, 3) ^ x1;
    rol    x0, 3
    xor    x0, x1
    ; key schedule: k1 = (ROTR64(k1, 8) + k0) ^ i;
    ror    k1, 8
    add    k1, k0
    xor    k1, rax
    ; k0 = ROTL64(k0, 3) ^ k1;
    rol    k0, 3
    xor    k0, k1
    xchg   k3, k2            ; rotate key-word queue
    xchg   k3, k1
    ; i++
    add    al, 1
    cmp    al, SPECK_RNDS    
    jnz    spk_el
    pop    rax               ; rax = block pointer
    mov    [rax  ], x0       ; write ciphertext back
    mov    [rax+8], x1
    pop    rsi               ; restore callee-saved registers
    pop    rdi
    pop    rbx
    pop    rbp


instruction set setkey + encrypt + decrypt (bytes) setkey + encrypt (bytes)
x86 105 64
x86-64 132 88

MSVC generated code is 139 bytes using /O2 /Os flags. x86 assembly is 105 bytes for enc+dec or 64 bytes for just enc. Nice algorithm that would be fun for those new to cryptography.

See sources here

Posted in assembly, cryptography, encryption, programming, security | Tagged , , , , | Leave a comment