Asmcodes: Kuznyechik “Grasshopper” Block cipher

Introduction

Kuznyechik (Grasshopper in Russian) is a 128-bit block cipher supporting 256-bit keys defined in the National Standard of the Russian Federation GOST R 34.12-2015 and RFC 7801 published in 2015 to supercede GOST 28147-89 cipher also referred to as Magma that was declassified in 1994.

Kuznyechik uses an SPN (Substitution-permutation network) design the same as AES (Rijndael) and like the 128-bit key version of Rijndael uses 10 rounds. There’s Key Whitening, Linear transformation and byte substitution, design properties all used by AES finalists.

About the assembly code

Initially, I didn’t intend to implement this cipher in x86 assembly due to the overall size of code generated by C compiler. However, after removing the inverse sbox and computing these values at runtime, it resulted in significantly smaller code than Serpent and Twofish implementations which I didn’t expect. The assembly version is a joint effort between Peter Ferrie and myself resulting in a binary similar to the size of Twofish-256.

Dr. Markku-Juhani O. Saarinen deserves credit for the C code here which I’ve shamelessly copied and modified to be more compact. Markku has a knack for simplifying cryptographic algorithms and making them easier to read.

While I have derived my own implementation from code originally written by Markku, he has not endorsed the changes I’ve made.

Sbox initialization

Although the designers of Kuznyechik asserted the sbox values were chosen “randomly”, Alex Biryukov, Leo Perrin, and Aleksei Udovenko show in their paper Reverse-Engineering the S-Box of Streebog, Kuznyechik and STRIBOBr1 that the S-Boxes of both Kuznyechik and Streebog were not created pseudo-randomly but using a hidden algorithm which they were able to reverse engineer.

The following code will generate the pi array based on their research.

#include <stdio.h>
#include <stdint.h>

// gcc -std=c11 genpi.c -o genpi && ./genpi

// multiplcation by matrix over GF(2)
uint8_t matmul(uint8_t a, uint8_t mat[8][8]) {
  uint8_t res = 0;
  int x, y;
  
  for (x = 0; x < 8; x++)
    if (a & (1 << (7 - x)))
      for (y = 0; y < 8; y++)
        res ^= mat[y][x] << (7 - y);
  return res;
}

// https://en.wikipedia.org/wiki/Finite_field_arithmetic
uint8_t fieldmul(uint8_t a, uint8_t b) {
  uint8_t p = 0;
  
  while (b) {
    if (b & 1) 
      p ^= a;
    if (a & 0x8)
      a = (a << 1) ^ 0x19;  // x^4 + x^3 + 1
    else
      a <<= 1;
    b >>= 1;
  }
  return p;
}

int main(int argc, char *argv[]) {
    int i, x, y, l, r, n, z;
    
uint8_t inv[16]   = 
{0x0,0x1,0xc,0x8,0x6,0xf,0x4,0xe,
 0x3,0xd,0xb,0xa,0x2,0x9,0x7,0x5};

uint8_t nu_0[16]  = 
{0x2,0x5,0x3,0xb,0x6,0x9,0xe,0xa,
 0x0,0x4,0xf,0x1,0x8,0xd,0xc,0x7};

uint8_t nu_1[16]  = 
{0x7,0x6,0xc,0x9,0x0,0xf,0x8,0x1,
 0x4,0x5,0xb,0xe,0xd,0x2,0x3,0xa};

uint8_t sigma[16] = 
{0xc,0xd,0x0,0x4,0x8,0xb,0xa,0xe,
 0x3,0x9,0x5,0x2,0xf,0x1,0x6,0x7};

uint8_t phi[16]   = 
{0xb,0x2,0xb,0x8,0xc,0x4,0x1,0xc,
 0x6,0x3,0x5,0x8,0xe,0x3,0x6,0xb};

uint8_t alpha[8][8] = {
    {0,0,0,0,1,0,0,0},
    {0,1,0,0,0,0,0,1},
    {0,1,0,0,0,0,1,1},
    {1,1,1,0,1,1,1,1},
    {1,0,0,0,1,0,1,0},
    {0,1,0,0,0,1,0,0},
    {0,0,0,1,1,0,1,0},
    {0,0,1,0,0,0,0,0},
};

uint8_t omega[8][8] = {
    {0,0,0,0,1,0,1,0},
    {0,0,0,0,0,1,0,0},
    {0,0,1,0,0,0,0,0},
    {1,0,0,1,1,0,1,0},
    {0,0,0,0,1,0,0,0},
    {0,1,0,0,0,1,0,0},
    {1,0,0,0,0,0,1,0},
    {0,0,0,0,0,0,0,1},
};
  
    for(x = 0; x < 256; x++) {
        y = matmul(x, alpha);
        l = (y >> 4) & 0xf;
        r = y & 0xf;
        l = (r == 0) ? nu_0[l] : nu_1[fieldmul(l, inv[r])];
        r = sigma[fieldmul(r, phi[l])];
        y = matmul((l << 4) | r, omega);
        if ((x & 7)==0) putchar('\n');
        printf("0x%02x, ", y);
    }
    return 0;
}

I have only assumed based on code generated by C compiler that Implementing this in assembly would not reduce size of code and so only the pi array is included in source and only the inverse sbox is created at runtime. However, I make no claim it can’t be implemented in less than 256 bytes which pi takes up, I just didn’t feel it was worth the effort.

kuz_init is called the same time as kuz_setkey so if you don’t call kuz_setkey first, encryption/decryption won’t work.

// The S-Box from section 5.1.1
const uint8_t kuz_pi[0x100] = {
  0xFC, 0xEE, 0xDD, 0x11, 0xCF, 0x6E, 0x31, 0x16,   // 00..07
  0xFB, 0xC4, 0xFA, 0xDA, 0x23, 0xC5, 0x04, 0x4D,   // 08..0F
  0xE9, 0x77, 0xF0, 0xDB, 0x93, 0x2E, 0x99, 0xBA,   // 10..17
  0x17, 0x36, 0xF1, 0xBB, 0x14, 0xCD, 0x5F, 0xC1,   // 18..1F
  0xF9, 0x18, 0x65, 0x5A, 0xE2, 0x5C, 0xEF, 0x21,   // 20..27
  0x81, 0x1C, 0x3C, 0x42, 0x8B, 0x01, 0x8E, 0x4F,   // 28..2F
  0x05, 0x84, 0x02, 0xAE, 0xE3, 0x6A, 0x8F, 0xA0,   // 30..37
  0x06, 0x0B, 0xED, 0x98, 0x7F, 0xD4, 0xD3, 0x1F,   // 38..3F
  0xEB, 0x34, 0x2C, 0x51, 0xEA, 0xC8, 0x48, 0xAB,   // 40..47
  0xF2, 0x2A, 0x68, 0xA2, 0xFD, 0x3A, 0xCE, 0xCC,   // 48..4F
  0xB5, 0x70, 0x0E, 0x56, 0x08, 0x0C, 0x76, 0x12,   // 50..57
  0xBF, 0x72, 0x13, 0x47, 0x9C, 0xB7, 0x5D, 0x87,   // 58..5F
  0x15, 0xA1, 0x96, 0x29, 0x10, 0x7B, 0x9A, 0xC7,   // 60..67
  0xF3, 0x91, 0x78, 0x6F, 0x9D, 0x9E, 0xB2, 0xB1,   // 68..6F
  0x32, 0x75, 0x19, 0x3D, 0xFF, 0x35, 0x8A, 0x7E,   // 70..77
  0x6D, 0x54, 0xC6, 0x80, 0xC3, 0xBD, 0x0D, 0x57,   // 78..7F
  0xDF, 0xF5, 0x24, 0xA9, 0x3E, 0xA8, 0x43, 0xC9,   // 80..87
  0xD7, 0x79, 0xD6, 0xF6, 0x7C, 0x22, 0xB9, 0x03,   // 88..8F
  0xE0, 0x0F, 0xEC, 0xDE, 0x7A, 0x94, 0xB0, 0xBC,   // 90..97
  0xDC, 0xE8, 0x28, 0x50, 0x4E, 0x33, 0x0A, 0x4A,   // 98..9F
  0xA7, 0x97, 0x60, 0x73, 0x1E, 0x00, 0x62, 0x44,   // A0..A7
  0x1A, 0xB8, 0x38, 0x82, 0x64, 0x9F, 0x26, 0x41,   // A8..AF
  0xAD, 0x45, 0x46, 0x92, 0x27, 0x5E, 0x55, 0x2F,   // B0..B7
  0x8C, 0xA3, 0xA5, 0x7D, 0x69, 0xD5, 0x95, 0x3B,   // B8..BF
  0x07, 0x58, 0xB3, 0x40, 0x86, 0xAC, 0x1D, 0xF7,   // C0..C7
  0x30, 0x37, 0x6B, 0xE4, 0x88, 0xD9, 0xE7, 0x89,   // C8..CF
  0xE1, 0x1B, 0x83, 0x49, 0x4C, 0x3F, 0xF8, 0xFE,   // D0..D7
  0x8D, 0x53, 0xAA, 0x90, 0xCA, 0xD8, 0x85, 0x61,   // D8..DF
  0x20, 0x71, 0x67, 0xA4, 0x2D, 0x2B, 0x09, 0x5B,   // E0..E7
  0xCB, 0x9B, 0x25, 0xD0, 0xBE, 0xE5, 0x6C, 0x52,   // E8..EF
  0x59, 0xA6, 0x74, 0xD2, 0xE6, 0xF4, 0xB4, 0xC0,   // F0..F7
  0xD1, 0x66, 0xAF, 0xC2, 0x39, 0x4B, 0x63, 0xB6,   // F8..FF
};

// initialize sbox tables
void kuz_init(kuz_key_t *key)
{
  int i;

  for (i=0; i<256; i++) {
    key->pi[i] = kuz_pi[i];
    key->pi_inv[kuz_pi[i]] = i;
  }
}

Creating the inverse sbox tables with code is smaller than including it as array.

; initialize sbox tables 
; edi = key context
load_sbox:
    pop    ebx               ; ebx = kuz_pi
sbox_loop:
    mov    al, dl            ; al = i
    xlatb                    ; al = kuz_pi[i]
    stosb                    ; key->pi[i] = al
    mov    [esi+eax], dl     ; key->pi_inv[kuz_pi[i]] = i
    inc    dl                ; i++
    jnz    sbox_loop         ; i<256
    popad
    ret

kuz_init:
    pushad
    inc    dh                ; edx=256
    add    edi, edx          ; edi = key->pi
    lea    esi, [edi+edx]    ; esi = key->pi_inv
    call   load_sbox
; The S-Box from section 5.1.1
kuz_pi:
  db   0xFC, 0xEE, 0xDD, 0x11, 0xCF, 0x6E, 0x31, 0x16,   ; 00..07
  db   0xFB, 0xC4, 0xFA, 0xDA, 0x23, 0xC5, 0x04, 0x4D,   ; 08..0F
  db   0xE9, 0x77, 0xF0, 0xDB, 0x93, 0x2E, 0x99, 0xBA,   ; 10..17
  db   0x17, 0x36, 0xF1, 0xBB, 0x14, 0xCD, 0x5F, 0xC1,   ; 18..1F
  db   0xF9, 0x18, 0x65, 0x5A, 0xE2, 0x5C, 0xEF, 0x21,   ; 20..27
  db   0x81, 0x1C, 0x3C, 0x42, 0x8B, 0x01, 0x8E, 0x4F,   ; 28..2F
  db   0x05, 0x84, 0x02, 0xAE, 0xE3, 0x6A, 0x8F, 0xA0,   ; 30..37
  db   0x06, 0x0B, 0xED, 0x98, 0x7F, 0xD4, 0xD3, 0x1F,   ; 38..3F
  db   0xEB, 0x34, 0x2C, 0x51, 0xEA, 0xC8, 0x48, 0xAB,   ; 40..47
  db   0xF2, 0x2A, 0x68, 0xA2, 0xFD, 0x3A, 0xCE, 0xCC,   ; 48..4F
  db   0xB5, 0x70, 0x0E, 0x56, 0x08, 0x0C, 0x76, 0x12,   ; 50..57
  db   0xBF, 0x72, 0x13, 0x47, 0x9C, 0xB7, 0x5D, 0x87,   ; 58..5F
  db   0x15, 0xA1, 0x96, 0x29, 0x10, 0x7B, 0x9A, 0xC7,   ; 60..67
  db   0xF3, 0x91, 0x78, 0x6F, 0x9D, 0x9E, 0xB2, 0xB1,   ; 68..6F
  db   0x32, 0x75, 0x19, 0x3D, 0xFF, 0x35, 0x8A, 0x7E,   ; 70..77
  db   0x6D, 0x54, 0xC6, 0x80, 0xC3, 0xBD, 0x0D, 0x57,   ; 78..7F
  db   0xDF, 0xF5, 0x24, 0xA9, 0x3E, 0xA8, 0x43, 0xC9,   ; 80..87
  db   0xD7, 0x79, 0xD6, 0xF6, 0x7C, 0x22, 0xB9, 0x03,   ; 88..8F
  db   0xE0, 0x0F, 0xEC, 0xDE, 0x7A, 0x94, 0xB0, 0xBC,   ; 90..97
  db   0xDC, 0xE8, 0x28, 0x50, 0x4E, 0x33, 0x0A, 0x4A,   ; 98..9F
  db   0xA7, 0x97, 0x60, 0x73, 0x1E, 0x00, 0x62, 0x44,   ; A0..A7
  db   0x1A, 0xB8, 0x38, 0x82, 0x64, 0x9F, 0x26, 0x41,   ; A8..AF
  db   0xAD, 0x45, 0x46, 0x92, 0x27, 0x5E, 0x55, 0x2F,   ; B0..B7
  db   0x8C, 0xA3, 0xA5, 0x7D, 0x69, 0xD5, 0x95, 0x3B,   ; B8..BF
  db   0x07, 0x58, 0xB3, 0x40, 0x86, 0xAC, 0x1D, 0xF7,   ; C0..C7
  db   0x30, 0x37, 0x6B, 0xE4, 0x88, 0xD9, 0xE7, 0x89,   ; C8..CF
  db   0xE1, 0x1B, 0x83, 0x49, 0x4C, 0x3F, 0xF8, 0xFE,   ; D0..D7
  db   0x8D, 0x53, 0xAA, 0x90, 0xCA, 0xD8, 0x85, 0x61,   ; D8..DF
  db   0x20, 0x71, 0x67, 0xA4, 0x2D, 0x2B, 0x09, 0x5B,   ; E0..E7
  db   0xCB, 0x9B, 0x25, 0xD0, 0xBE, 0xE5, 0x6C, 0x52,   ; E8..EF
  db   0x59, 0xA6, 0x74, 0xD2, 0xE6, 0xF4, 0xB4, 0xC0,   ; F0..F7
  db   0xD1, 0x66, 0xAF, 0xC2, 0x39, 0x4B, 0x63, 0xB6,   ; F8..FF

Key Whitening

Those of you familiar with previous asmcode entries should know by now the purpose of key whitening which essentially improves resistance to brute force attacks. It involves exclusive OR operation on the data being encrypted/decrypted using subkeys. It’s the exact same as AddRoundKey in AES and was first used by Ron Rivest in 1984 for DES-X.

static void kuz_whiten(w128_t *w, kuz_key_t *key, int idx) {
  int i;
  uint32_t *p=(uint32_t*)&key->k[idx].b[0];
  
  for (i=0; i<4; i++) {
    w->w[i] ^= p[i];
  }
}

Assembly is same as that used in AES, Serpent and Twofish.

; key whitening
; esi = w
; edi = key
; edx = round
kuz_whiten:
    pushad
    mov    cl, 4
    shl    edx, cl
w_l0:
    mov    eax, [edi+edx]     ; get 4 bytes of key
    xor    [esi], eax         ; xor state
    cmpsd
    loop   w_l0
    popad
    ret

Linear Transformation

Think of the linear transformation in Serpent, the mix columns operation in Rijndael or the Maximum Distance Seperable code in Twofish. They all serve the same purpose which is to create diffusion.

In Diffusion the statistical structure of the plaintext is dissipated into long range statistics of the cipher text. This is achieved by having each plaintext digit affect the value of many cipher text digits. Which is equivalent to saying that each cipher text digit is affected by many plaintext digits.

// poly multiplication mod p(x) = x^8 + x^7 + x^6 + x + 1
// totally not constant time
static uint8_t kuz_mul_gf256(uint8_t x, uint8_t y)
{
  uint8_t z;

  z = 0;

  while (y) {
    if (y & 1) {
      z ^= x;
    }
    x = (x << 1) ^ (x & 0x80 ? 0xC3 : 0x00);
    y >>= 1;
  }
  return z;
}

static void kuz_lt (w128_t *w, int enc) {
  int     i, j;
  uint8_t x;

  // Linear vector from sect 5.1.2
  const uint8_t kuz_lvec[16] = {
    0x94, 0x20, 0x85, 0x10, 0xC2, 0xC0, 0x01, 0xFB,
    0x01, 0xC0, 0xC2, 0x10, 0x85, 0x20, 0x94, 0x01 };

  // 16 rounds
  for (j=0; j<16; j++)
  {
    if (enc == KUZ_ENCRYPT)
    {
      // An LFSR with 16 elements from GF(2^8)
      x = w->b[15]; // since lvec[15] = 1

      for (i = 14; i >= 0; i--) {
        w->b[i + 1] = w->b[i];
        x ^= kuz_mul_gf256(w->b[i], kuz_lvec[i]);
      }
      w->b[0] = x;
    } else {
      x = w->b[0];

      for (i = 0; i < 15; i++) {
        w->b[i] = w->b[i + 1];
        x ^= kuz_mul_gf256(w->b[i], kuz_lvec[i]);
      }
      w->b[15] = x;
    }
  }
}

Peter was able to combine both operations in assembly using ECX (crypto flag).

; linear transformation
; esi = w
; ecx = enc
lt_l0:
    pop    edi
    push   16
    pop    eax               ; 16 rounds
lt_l1:
    pushad
    mov    dl, 15
    jecxz  lt_l2
    cdq

lt_l2:
    mov    ah, [esi+edx]     ; ah = w->b[00 or 15]

lt_l3:
    mov    bh, [edi+edx-1]     ; bh = kuz_lvec[i]
    mov    al, [esi+edx-1]     ; al = w->b[i]
    jecxz  lt_l4
    mov    bh, [edi+edx]     ; bh = kuz_lvec[i]
    mov    al, [esi+edx+1]   ; al = w->b[i+1]

lt_l4:
    mov    [esi+edx], al     ; w->b[i] = al
    call   kuz_mul_gf256
    dec    edx
    jecxz  lt_l5
    inc    edx
    inc    edx
    cmp    edx, 15

lt_l5:
    jnz    lt_l3
    mov    [esi+edx], ah     ; w->b[00 or 15] = x
    popad
    dec    eax
    jnz    lt_l1
    popad
    ret
kuz_lt:
    pushad
    call   lt_l0
    db 0x94, 0x20, 0x85, 0x10 
    db 0xC2, 0xC0, 0x01, 0xFB
    db 0x01, 0xC0, 0xC2, 0x10
    db 0x85, 0x20, 0x94, 0x01

; poly multiplication 
; mod p(x) = x^8 + x^7 + x^6 + x + 1 
mgf_l0:
    shr    bh, 1             ; if (y & 1), y >>= 1
    jnc    mgf_l1
    xor    ah, al            ; z ^= x
mgf_l1:
    add    al, al            ; if (x & 0x80), x <<= 1
    jnc    kuz_mul_gf256
    xor    al, 0xC3          ; x ^= 0xC3
kuz_mul_gf256:
    test   bh, bh            ; while (y)
    jnz    mgf_l0
    ret

Byte Substitution

The purpose of an sbox or substituion box is to obscure the relationship between key and ciphertext — Shannon’s property of confusion. Claude E. Shannon proposed ideas related to information security in his 1948 paper Communication Theory of Secrecy Systems

Confusion seeks to make a relationship between the statistics of the cipher text and the value of the encryption key as complex as possible. Thus even if the attacker can get some handle on the statistics of the cipher text, the way in which the key was used to produce that cipher text is so complex as to make it difficult to deduce the key.

// substitute bytes
static void kuz_subbytes(w128_t *w, kuz_key_t *key, int enc) {
  int           i;
  const uint8_t *sbox = (enc==KUZ_ENCRYPT) ? key->pi : key->pi_inv;

  for (i=0; i<16; i++) {
    w->b[i] = sbox[w->b[i]];
  }
}

Key scheduling

Based on Feistel Network design. It would certainly be possible to create subkeys during encryption or decryption but I wanted to keep it simple.

/ key setup
void kuz_setkey(kuz_key_t *kuz, const uint8_t key[32])
{
  uint32_t i, j;
  w128_t   c, z;
  w256_t   x;

  kuz_init(kuz);
  
  // copy key to context
  memcpy (&kuz->k[0].b[0], key, 32);

  // copy key to local buffer
  memcpy (&x.b[0], key, 32);
  
  for (i=1; i<=32; i++) {
    // C Value
    memset (&c.b[0], 0, 16);
    c.b[15] = i;    // load round in lsb

    kuz_lt(&c, KUZ_ENCRYPT);

    for (j=0; j<16; j++) {
      z.b[j] = x.b[j] ^ c.b[j];
    }

    kuz_subbytes(&z, kuz, KUZ_ENCRYPT);
    kuz_lt(&z, KUZ_ENCRYPT);

    for (j=0; j<16; j++) {
      z.b[j] ^= x.b[16+j];
    }
    
    memcpy (&x.b[16], &x.b[0], 16);
    memcpy (&x.b[0], &z.b[0], 16);
    
    if ((i & 7) == 0) {
      memcpy (&kuz->k[(i >> 2)].b[0], &x.b[0], 32);
    }
  }
}

Markku’s code uses 64-bit values here which I replaced with REP MOVSB operations. Using MMX code might reduce code further and for 64-bit implementations, it would make more sense to use 64-bit registers instead of stack memory.

; key setup
kuz_setkeyx:
_kuz_setkeyx:
    pushad
    mov    edi, [esp+32+4]   ; ebx=kuz context
    mov    esi, [esp+32+8]   ; esi=key
    mov    ebp, edi
    
    push   32                ; eax=32
    pop    eax
    cdq                      ; edx=0
    call   kuz_init
    
    xchg   eax, ecx
    ; copy key to context
    pushad
    rep    movsb
    popad
    ; copy 256-bit key to local buffer
    pushad                   ; allocate 32 bytes for x
    mov    edi, esp          ; edi=x
    push   edi
    rep    movsb
    pop    edi
    
    pushad                   ; allocate 32 bytes for c,z
    mov    ebx, esp          ; ebx=z
    lea    esi, [ebx+16]     ; esi=c
    ; do 32 rounds
ksk_l0:
    inc    edx
    cmp    dl, 32
    ja     ksk_l3
    
    mov    cl, 16
    pushad                   ; save x
    mov    edi, esi
    xor    eax, eax
    rep    stosb             ; memset (&c.b[0], 0, 16);
    mov    [esi+15], dl      ; c.b[15] = i;
    
    mov    edi, esi
    call   kuz_lt            ; kuz_lt(&c, KUZ_ENCRYPT);
    popad

ksk_l1:
    mov    al, [edi+ecx-1]   ; al=x.b[j]
    xor    al, [esi+ecx-1]   ; ^= c.b[j]
    mov    [ebx+ecx-1], al
    loop   ksk_l1
    
    xchg   esi, ebx     ; XCHG(c, z)
    xchg   edi, ebp     ; XCHG(x, kuz)
    call   kuz_subbytes ; kuz_subbytes(&z, kuz, KUZ_ENCRYPT);
    call   kuz_lt       ; kuz_lt(&z, KUZ_ENCRYPT);
    xchg   esi, ebx     ; XCHG(c, z)
    xchg   edi, ebp     ; XCHG(x, kuz)

ksk_l2:
    mov    al, [edi+ecx+16] ; z.b[j] ^= x.b[16+j];
    xor    [ebx+ecx], al
    inc    ecx
    cmp    cl, 16
    jnz    ksk_l2
    
    ; memcpy (&x.b[16], &x.b[0], 16);
    pushad
    mov    esi, edi
    add    edi, ecx
    rep    movsb
    popad
    
    ; memcpy (&x.b[0], &z.b[0], 16);
    pushad
    mov    esi, ebx
    rep    movsb
    popad
    
    test   dl, 7
    jnz    ksk_l0
    
    ; memcpy (&kuz->k[(i >> 2)].b[0], &x.b[0], 32);
    pushad
    add    ecx, ecx            ; ecx *= 2
    mov    esi, edi
    lea    edi, [ebp+edx*4]
    rep    movsb
    popad
    jmp    ksk_l0
ksk_l3:
    popad
    popad
    popad
    ret

Encryption and Decryption

// encrypt/decrypt a block - 8 bit way
void kuz_encrypt(kuz_key_t *key, void *blk, int enc)
{
  int    i, j;
  w128_t *x=(w128_t*)blk;

  if (enc==KUZ_ENCRYPT)
  {
    i=0;
    for (;;)
    {
      kuz_whiten(x, key, i);
      if (++i == KUZ_ROUNDS) break;
      kuz_subbytes(x, key, enc);
      kuz_lt(x, enc);
    }
  } else {
    i=KUZ_ROUNDS;
    for (;;)
    {
      i--;
      kuz_whiten(x, key, i);
      if (i == 0) break;
      kuz_lt(x, enc);
      kuz_subbytes(x, key, enc);
    }
  }
}

The assembly takes advantage of similarity between both encryption/decryption operations by storing pointer to functions in registers and swapping them depending on enc parameter stored in ecx.

; encrypt/decrypt a block
kuz_encryptx:
_kuz_encryptx:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   eax, edi          ; edi=key
    lodsd
    push   eax               ; save blk
    lodsd
    cdq                      ; edx=0
    xchg   eax, ecx          ; ecx=enc
    pop    esi               ; esi=blk
    call   load_func
    ; kuz_whiten, kuz_subbytes and kuz_lt functions
    ; are placed here.
load_func:
    pop    ebx
    lea    eax, [ebx + (kuz_lt - kuz_whiten)]
    lea    ebp, [eax + (kuz_subbytes - kuz_lt)]
    
    jecxz  kde_l2
    xchg   eax, ebp
    mov    dl, KUZ_ROUNDS
    jmp    kde_l1
kde_l0:
    call   ebp ; kuz_lt or kuz_subbytes
    call   eax ; kuz_subbytes or kuz_lt
kde_l1:
    dec    edx
kde_l2:
    call   ebx ; kuz_whiten
    inc    ecx
    test   edx, edx
    loop   kde_l3
    inc    edx
    inc    edx
    cmp    dl, KUZ_ROUNDS+1
kde_l3:
    jnz    kde_l0
    popad
    ret

Summary

Please refer to sources here and here for any future updates.

architecture size
x86 615
x64 n/a

I cannot say there’s any advantage to using this cipher compared to Rijndael (AES), Serpent or Twofish. The design properties of all these ciphers are very similar but Kuznyechik is slower than Rinjdael. They all use similar functions but the advantage of AES (Rijndael) is that it’s likely to be more scrutinized by the cryptographic community than Kuznyechik, Serpent or Twofish. Having said all that, it’s still nice to have another block cipher people can study and learn from which has been certified by organizations other than NIST.

Advertisements
This entry was posted in assembly, cryptography, encryption, programming, security and tagged , , , , , , . Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s