BLAKE2 cryptographic hash

Introduction

The BLAKE2 Cryptographic Hash and Message Authentication Code (MAC) was designed by Jean-Philippe Aumasson, Samuel Neves, Zooko Wilcox-O’Hearn, and Christian Winnerlein.

BLAKE2 comes in two basic flavors:

  • BLAKE2b (or just BLAKE2) is optimized for 64-bit platforms and produces digests of any size between 1 and 64 bytes.
  • BLAKE2s is optimized for 8- to 32-bit platforms and produces digests of any size between 1 and 32 bytes.

Only BLAKE2s has been implemented in x86 assembly for now; BLAKE2b may be written in x64 assembly at a later date.

The C version is derived from RFC 7693 by Markku-Juhani O. Saarinen but has only been tested on the x86 architecture. The x86 assembly was a joint effort between Peter Ferrie and myself.

Initialization Vector

The initialization vector is exactly the same as SHA-256's: 8 32-bit integers derived from the fractional parts of the square roots of the first 8 prime numbers (2, 3, 5, 7, 11, 13, 17, 19).

static const uint32_t b2s_iv[8] =
   {
       0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
       0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
   };

If you want to generate these on the fly, the following function will work (note some compilers require #pragma intrinsic (fmod,pow,sqrt))

static const uint32_t primes[8] = { 2, 3, 5, 7, 11, 13, 17, 19 };

uint32_t sqrt2int (uint32_t x) {
  uint32_t r;
  // keep only the fractional part of the square root, scaled up to 32 bits
  r = (uint32_t)(fmod(sqrt((double)primes[x]), 1.0) * pow(2, 32));
  return r;
}

Although the following isn't used in the final assembly code, it illustrates the same generation with the FPU. init_fpu must be called before sqrt2int.

init_fpu:
    ; round numbers down
    push   eax
    fstcw  [esp]            ; store control word
    pop    eax
    or     ax, 00400H       ; set RC bit 10
    and    ax, 0F7FFH       ; clear RC bit 11 (RC = 01b: round down)
    push   eax
    fldcw  [esp]            ; load control word
    pop    eax
    ret
    
sqrt2int:
    ; return the 32-bit fractional part of the square root
    ; of the prime indexed by eax as an integer
    push   1
    push   0
    fild   qword ptr[esp]   ; load 2^32
    movzx  eax, word ptr[primes+2*eax]
    push   eax
    fild   dword ptr[esp]   ; load prime as integer
    push   eax              ; make room for 64-bit result
    fsqrt
    fld1                    ; load 1 
    fsub                    ; subtract 1; only the low 32 bits are kept
    fmul                    ; multiply by 2^32
    frndint                 ; round down to integer
    fistp  qword ptr[esp]   ; store 64-bit result
    pop    eax              ; low 32 bits = fractional part
    add    esp, 3*4         ; release 2^32 and prime from stack
    ret

Initialization Function

To work as a MAC, you just supply a key, which cannot exceed 32 bytes (256 bits). The main difference between the implementation here and the standard specification is that you can also supply the number of rounds. The default is 10, and anything below this is raised to 10. Using a non-standard number of rounds might undermine the security of the algorithm. A short usage sketch follows the C function below.

void b2s_init (b2s_ctx *ctx, uint32_t outlen, 
  void *key, uint32_t keylen, uint32_t rnds)
{
  uint32_t i;

  // ensure keylen and outlen don't exceed 32 bytes
  keylen=(keylen>32) ? 32 : keylen;
  outlen=(outlen>32) ? 32 : outlen;

  // initialize state iv
  for (i=0; i<8; i++) {
    ctx->state.v32[i] = b2s_iv[i];
  }
  // update state with keylen and outlen
  ctx->state.v32[0] ^= 0x01010000 ^ 
                      (keylen << 8) ^ 
                      outlen;

  // set buffer to zero
  for (i=0; i<16; i++) {
    ctx->buffer.v32[i] = 0;
  }
  
  // set length to zero
  ctx->len.v64 = 0;
  // if key used, set index to 64
  ctx->index   = keylen ? 64 : 0;
  ctx->outlen  = outlen;
  // minimum rounds is 10
  ctx->rounds  = rnds < 10 ? 10 : rnds; 

  // copy optional key
  for (i=0; i<keylen; i++) {
    ctx->buffer.v8[i] = ((uint8_t*)key)[i];
  }
}
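
As a usage sketch (not part of the original sources), here is how b2s_init might be called to set up a 32-byte keyed MAC with the default number of rounds, assuming the b2s_ctx structure and the functions shown here are in scope. The key and its length are illustrative values.

// hypothetical example: keyed initialization for use as a MAC
uint8_t mac_key[16] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F };
b2s_ctx ctx;

// 32-byte tag, 16-byte key, 10 rounds (the minimum and the default)
b2s_init(&ctx, 32, mac_key, 16, 10);

The same function in x86 assembly:
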
_b2s_initx:
    pushad

    mov    ebp, esp    
    mov    edi, [ebp+32+ 4] ; ctx
    mov    ebx, [ebp+32+ 8] ; outlen
    mov    edx, [ebp+32+16] ; keylen
    
    ; initialize state
    mov    esi, b2s_iv
    lodsd
    ; eax ^= outlen
    xor    eax, ebx
    ; keylen << 8
    shl    edx, 8
    ; eax ^= keylen
    xor    eax, edx
    ; eax ^= 0x01010000
    xor    eax, 001010000h
    ; ctx->state.v32[0] = eax
    stosd
    push   7
    pop    ecx
    rep    movsd
    
    ; edi now points to ctx->buffer
    ; zero initialize
    ; also ctx->len.v64 = 0
    xor    eax, eax
    mov    cl, 19
    push   edi
    rep    stosd
    
    ; edi now points just past ctx->index (at outlen)
    mov    ecx, [ebp+32+16] ; keylen
    jecxz  b2_kl
    mov    byte [edi-4], 64
b2_kl:
    ; ctx->outlen = outlen
    xchg   eax, ebx
    stosd
    ; ctx->rounds = rnds
    mov    eax, [ebp+32+20]
    ; minimum of 10
    push   10
    pop    edx
    cmp    eax, edx
    cmovb  eax, edx
    stosd
    
    mov    esi, [ebp+32+12] ; key
    pop    edi
    rep    movsb

    popad
    ret

Compression Function F

Message Schedule SIGMA

Message word schedule permutations for each round of both BLAKE2b and BLAKE2s are defined by SIGMA. For BLAKE2b, the two extra permutations for rounds 10 and 11 are SIGMA[10..11] = SIGMA[0..1].

The reference code uses a 10×16 array which requires 160 bytes of ROM.

const uint8_t sigma[10][16] = {
           { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
           { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
           { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
           { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
           { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
           { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
           { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
           { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
           { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
           { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }
};

Because there are only 16 32-bit words in a message block, each index ranges from 0 to 15 and can be stored as a 4-bit nibble, thus only requiring 80 bytes of space.

uint64_t sigma64[10] = 
{ 0xfedcba9876543210, 0x357b20c16df984ae,
  0x491763eadf250c8b, 0x8f04a562ebcd1397,
  0xd386cb1efa427509, 0x91ef57d438b0a6c2,
  0xb8293670a4def15c, 0xa2684f05931ce7bd,
  0x5a417d2c803b9ef6, 0x0dc3e9bf5167482a };
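
To show how these values relate to the table above, here is a small sketch (not from the original sources) that packs each 16-entry row into a 64-bit integer, with element 0 stored in the least significant nibble; packing sigma[0] this way gives 0xfedcba9876543210, the first entry of sigma64.

#include <stdint.h>

// pack each SIGMA row into a 64-bit value, element 0 in the lowest nibble
void pack_sigma (const uint8_t s[10][16], uint64_t out[10])
{
  int i, j;
  for (i = 0; i < 10; i++) {
    out[i] = 0;
    // insert elements 15 down to 0 so element 0 ends up in the low nibble
    for (j = 15; j >= 0; j--) {
      out[i] = (out[i] << 4) | s[i][j];
    }
  }
}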

G Mixing Function

The reference implementation uses a macro B2S_G which is almost identical to the quarter-round macro in the ChaCha stream cipher. The main differences are that the bit-rotations are to the right instead of the left, and there are 2 additions of message words selected using the SIGMA permutations as indexes.

for (i = 0; i < 10; i++) {          // ten rounds
  B2S_G( 0, 4,  8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
  B2S_G( 1, 5,  9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
  B2S_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
  B2S_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
  B2S_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
  B2S_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
  B2S_G( 2, 7,  8, 13, m[sigma[i][12]], m[sigma[i][13]]);
  B2S_G( 3, 4,  9, 14, m[sigma[i][14]], m[sigma[i][15]]);
}

To reduce the amount of code, B2S_G is converted to a function. It's called 8 times, each time with 4 work vector indices packed into one 16-bit value. Like the SIGMA permutations, each index is stored as a 4-bit nibble, thus only requiring 16 instead of 32 bytes.

uint16_t idx16[8]=
{ 0xC840, 0xD951, 0xEA62, 0xFB73, 
  0xFA50, 0xCB61, 0xD872, 0xE943 };

The index parameter is one of the 16-bit values from the array above.

void b2s_g(blake_blk *x, uint16_t index, 
    uint32_t x0, uint32_t x1) 
{
  uint32_t a, b, c, d;
  
  a = (index         & 0xF);
  b = ((index >>  4) & 0xF);
  c = ((index >>  8) & 0xF);
  d = ((index >> 12) & 0xF);
  
  x->v32[a] += x->v32[b] + x0; 
  x->v32[d] = ROTR32(x->v32[d] ^ x->v32[a], 16);
  
  x->v32[c] += x->v32[d]; 
  x->v32[b] = ROTR32(x->v32[b] ^ x->v32[c], 12);
  
  x->v32[a] += x->v32[b] + x1; 
  x->v32[d] = ROTR32(x->v32[d] ^ x->v32[a],  8);
  
  x->v32[c] += x->v32[d]; 
  x->v32[b] = ROTR32(x->v32[b] ^ x->v32[c],  7);
}

The assembly version only requires 1 ROR, 1 ADD and 1 XOR instead of 4 of each. This is possible because the first update of a and d has exactly the same shape as the 2nd, 3rd and 4th pairs of statements, with only the registers and rotation counts swapped around. The rotation constants are loaded into ecx and swapped during the loop.

The trick of looping twice is achieved with JNP/JPO (Jump if No Parity / Jump if Parity Odd). ebx is set to 1; when it reaches 3 (even parity), the inner loop ends. When ecx reaches zero after the rotation constants are shifted out, the main mixing loop ends.
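
Before the assembly listing, here is a rough C sketch (not in the original sources) of that folding: the four add/xor/rotate pairs collapse into a loop that runs twice per message word, swapping the roles of (a, b) and (c, d) between passes.

// hedged sketch of the folded G used by the assembly; ROTR32 is the same
// rotate-right macro used by b2s_g above
#define ROTR32(w, n) (((w) >> (n)) | ((w) << (32 - (n))))

void b2s_g_folded (uint32_t *v, int a, int b, int c, int d,
                   uint32_t x0, uint32_t x1)
{
  const uint32_t r[4] = { 16, 12, 8, 7 };  // rotation constants
  const uint32_t x[2] = { x0, x1 };        // message words
  uint32_t t;
  int i, j, s;

  for (i = 0; i < 2; i++) {
    t = x[i] + v[b];                // start of v[a] += v[b] + x
    for (j = 0; j < 2; j++) {
      t += v[a]; v[a] = t;          // v[a] += v[b] + x   /  v[c] += v[d]
      t = ROTR32(t ^ v[d], r[2*i + j]);
      v[d] = t;                     // v[d] = ROTR(...)   /  v[b] = ROTR(...)
      s = a; a = c; c = s;          // swap roles so the same statements
      s = b; b = d; d = s;          // update c and b on the second pass
    }
  }
}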

%define a  eax
%define b  edx
%define c  esi
%define d  edi             ; d shares edi with x; the block pointer in x
%define x  edi             ; is only needed until d's address is computed
%define t0 ebp

; void b2s_g(blake2_blk *blk, uint16_t index, 
;    uint32_t x0, uint32_t x1)
b2s_g:
    pushad
    push   a
    xchg   ah, al
    aam    16
    
    movzx  ebp, ah
    movzx  c, al

    pop    a
    aam    16
    
    movzx  b, ah
    movzx  a, al
    
    lea    a, [x+a*4]
    lea    b, [x+b*4]
    lea    c, [x+c*4]
    lea    d, [x+ebp*4]
    ; load ecx with rotate values
    ; 16, 12, 8, 7
    mov    ecx, 07080C10h
    ; x[a] = PLUS(x[a],x[b]) + x0; 
    mov    t0, [esp+_ebx]    ; x0
    add    t0, [b]
q_l1:
    mov    bl, 1
q_l2:
    ; also x[c] = PLUS(x[c],x[d]);
    add    t0, [a]
    mov    [a], t0
    ; x[d] = ROTATE(XOR(x[d],x[a]),cl);
    ; also x[b] = ROTATE(XOR(x[b],x[c]),cl);
    xor    t0, [d]
    ror    t0, cl
    mov    [d], t0
    xchg   cl, ch
    xchg   c, a
    xchg   d, b
    inc    ebx
    jpo    q_l2
    ; x[a] = PLUS(x[a],x[b]) + x1; 
    add    t0, [esp+_ecx]     ; x1
    ; --------------------------------------------
    shr    ecx, 16
    jnz    q_l1

    popad
    ret

// F compression function
void b2s_compress (b2s_ctx *ctx, int last)
{
  uint32_t  i, j, x0, x1;
  uint64_t  x;
  blake_blk v, m;
  
  // initialize v work vector with iv and state
  for (i=0; i<BLAKE2s_LBLOCK; i++) {
    v.v32[i    ] = ctx->state.v32[i];
    v.v32[i + 8] = b2s_iv[i];
  }
  
  // copy message buffer into m
  for (i=0; i<BLAKE2s_CBLOCK/4; i++) {
    m.v32[i] = ctx->buffer.v32[i];
  }

  // xor v with current length
  v.v32[12] ^= ctx->len.v32[0];
  v.v32[13] ^= ctx->len.v32[1];
  
  // if this is last block, invert word 14
  if (last) {
    v.v32[14] = ~v.v32[14];
  }
  
  // the minimum is 10 rounds, but this can be increased
  // to deal with a situation where 10 rounds is no longer
  // adequate protection.
  for (i=0; i<ctx->rounds; i++) {
    x = sigma64[i%10];
    // 8 calls to the G mixing function per round
    for (j=0; j<8; j++) {
      x0 = (x & 15); x >>= 4;
      x1 = (x & 15); x >>= 4;
      b2s_g(&v, idx16[j], m.v32[x0], m.v32[x1]);
    }
  }
  // update state with v
  for (i=0; i<BLAKE2s_LBLOCK; i++) {
    ctx->state.v32[i] ^= v.v32[i] ^ v.v32[i + 8];
  }
}

Initialization of the v work vector with the state and b2s_iv is done using CMC (Complement Carry Flag). On the first pass CF is clear and the context state is copied into v; CMC then sets CF to 1 so the loop repeats, this time filling the remainder of v with the initialization vector, after which CMC clears CF again and the loop exits.

The inversion of the bits of word 14 for the last block is achieved by negating edx, which holds zero or one. If it is 1 it becomes -1 (all bits set) and the XOR inverts the word; if edx is zero, the XOR does nothing.
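
In C the same branchless form would look something like this sketch (not from the original sources):

// -(uint32_t)1 is 0xFFFFFFFF and -(uint32_t)0 is 0, so the XOR either
// inverts word 14 or leaves it unchanged
v.v32[14] ^= -(uint32_t)(last != 0);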

The SIGMA indices required for the message block are separated using AAM, which divides AL by the given base and leaves the quotient in AH and the remainder in AL.

_b2s_compressx:
    pushad
    mov    esi, edi
    mov    ebx, esi
    ; create space for v + m
    sub    esp, 124
    push   eax
    ; initialize v with state and chaining variables
    mov    edi, esp          ; edi = v
    mov    eax, b2s_iv 
    push   32                ; move 32 bytes
    pop    ecx
b2t_l1:                      ; first is ctx->state
                             ; then b2s_iv
    rep    movsb
    mov    cl, 32
    cmc                      ; complement
    xchg   esi, eax
    jc     b2t_l1            ; continue if carry
    
    ; esi should now be ctx->buffer
    ; edi will be m
    ; ecx will be 32
    ; copy buffer into m
    ; save the pointer to m on the stack
    push   edi
    rep    movsw
    pop    edi
    
    ; esi now points to ctx->len
    ; xor v with current length
    lodsd
    xor    [edi+12*4-64], eax
    lodsd
    xor    [edi+13*4-64], eax
    
    ; if this is last block, invert word 14
    ; 1 becomes -1, 0 remains 0
    neg    edx
    xor    [edi+14*4-64], edx
    
    ; do minimum of 10 rounds
    mov    ecx, [esi+8]
b2t_l2:
    xor    esi, esi
b2t_l3:
    cmp    esi, 10
    je     b2t_l2
    xor    ebp, ebp
    mov    edx, [b2s_sigma64+8*esi+4]
    mov    eax, [b2s_sigma64+8*esi]
b2t_l4:
    pushad
    aam    16
    mov    cl, ah
    movzx  eax, al
    mov    ebx, [edi+4*eax]              ; m.v32[x0]
    mov    ecx, [edi+4*ecx]              ; m.v32[x1]
    mov    eax, dword[ebp*2+b2s_idx16]   ; b2s_idx16[j]
    mov    edi, [edi+_esp-96]            ; edi = v (the esp value saved by pushad)
    call   b2s_g
    popad
    shrd   eax, edx, 8
    shr    edx, 8
    
    inc    ebp
    cmp    ebp, 8
    jnz    b2t_l4
    
    ; check rnds
    inc    esi
    loop   b2t_l3
    
    ; update state with work vector
b2t_l5:
    mov    eax, [edi+4*ebp-68]
    xor    eax, [edi+4*ebp+32-68]
    xor    [ebx+4*ebp-4], eax
    dec    ebp
    jnz    b2t_l5
    
    add    esp, 124
    pop    eax
    popad
    ret

Updating

void b2s_update (b2s_ctx *ctx, 
  void *input, uint32_t len)
{
  uint32_t i;
  uint8_t *p=(uint8_t*)input;
  
  if (len==0) return;
  
  // update buffer and state
  for (i=0; i<len; i++) {
    if (ctx->index==64) {
      ctx->len.v64 += ctx->index;
      b2s_compress (ctx, 0);
      ctx->index = 0;
    }
    ctx->buffer.v8[ctx->index++] = p[i];
  }
}

_b2s_updatex:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   edi, eax           ; ctx
    lodsd
    xchg   ecx, eax
    lodsd
    xchg   ecx, eax           ; len
    jecxz  ex_upd
    xchg   esi, eax           ; input
    mov    edx, [edi+index] ; index
b2_upd:
    cmp    edx, 64
    jnz    b2_ab
    ; ctx->len.v64 += index
    add    dword[edi+len+0], edx   
    mov    dl, 0              ; last=0
    adc    dword[edi+len+4], edx
    call   _b2s_compressx
b2_ab:
    lodsb
    mov    [edi+edx+buffer], al
    inc    edx
    loop   b2_upd
    mov    [edi+index], edx
ex_upd:
    popad
    ret

Finalizing

void b2s_final (void* out, b2s_ctx *ctx)
{
  uint32_t i;
  
  // update length with current index to buffer
  ctx->len.v64 += ctx->index;
  
  // zero remainder of buffer
  while (ctx->index < 64) {
    ctx->buffer.v8[ctx->index++] = 0;
  }

  // compress last block
  b2s_compress (ctx, 1);

  // copy state to output
  for (i=0; i<ctx->outlen; i++) {
    ((uint8_t*)out)[i] = ctx->state.v8[i];
  }
}

_b2s_finalx:
    pushad
    mov    esi, [esp+32+4]    ; out
    mov    edx, [esp+32+8]    ; ctx
    mov    ecx, [edx+index]   ; index
    xor    eax, eax
    add    dword[edx+len+0], ecx
    adc    dword[edx+len+4], eax
    lea    edi, [edx+ecx+buffer]
    neg    ecx
    add    ecx, 64
    rep    stosb
    xchg   edx, eax
    xchg   edi, eax
    inc    edx                 ; last=1
    call   _b2s_compressx
    mov    ecx, [edi+outlen] ; outlen
    xchg   esi, edi
    rep    movsb
    popad
    ret
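
To tie the pieces together, here is a minimal test driver for the C version (a sketch, not part of the original sources; it assumes the b2s_ctx structure and the functions above are compiled into the same program):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main (void)
{
  uint8_t  dgst[32];
  uint32_t i;
  b2s_ctx  ctx;
  char     msg[] = "abc";

  // unkeyed 256-bit hash using the default 10 rounds
  b2s_init (&ctx, 32, NULL, 0, 10);
  b2s_update (&ctx, msg, (uint32_t)strlen(msg));
  b2s_final (dgst, &ctx);

  // print the digest in hex
  for (i = 0; i < 32; i++) printf ("%02x", dgst[i]);
  putchar ('\n');
  return 0;
}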

Results

architecture    size in bytes
x86             510
x64             n/a

Future updates to the C and assembly code may not be shown here. Please refer to the sources here and here.
