Asmcodes: SHA-256 cryptographic hash

Introduction

SHA-2 (Secure Hash Algorithm) consists of 6 cryptographic hash functions designed by the NSA which were published in 2001 under royalty free patent. They are intended to be used for data integrity, message authentication and digital signatures.

To avoid confusion, what follows is a tiny implementation in x86 assembler, optimized for size, so if you require a version optimized for speed, consider using a library such as PolarSSL or OpenSSL

Init, Update, Final

Normally, the initialization, update and finalization processes are separate functions so data can be processed sequentially. To compact it as much as possible, the function presented here can only be called once and the input length should not exceed ~500MB.

The limitation is due to len variable defined as a 32-bit integer rather than 64-bit which it should be.
The following structure for holding state is used.

typedef struct _SHA256_CTX {
  uint64_t len;
  union {
    uint8_t  v8[SHA256_DIGEST_LENGTH];
    uint32_t v32[SHA256_DIGEST_LENGTH/4];
    uint64_t v64[SHA256_DIGEST_LENGTH/8];
  } s;
  union {
    uint8_t  v8[SHA256_CBLOCK];
    uint32_t v32[SHA256_CBLOCK/4];
    uint64_t v64[SHA256_CBLOCK/8];
  } buf;
} SHA256_CTX;

The function that includes init, update and final functions in one call.

uint16_t p[64] =
{  2,   3,   5,   7,  11,  13,  17,  19, 
  23,  29,  31,  37,  41,  43,  47,  53,
  59,  61,  67,  71,  73,  79,  83,  89, 
  97, 101, 103, 107, 109, 113, 127, 131,
 137, 139, 149, 151, 157, 163, 167, 173, 
 179, 181, 191, 193, 197, 199, 211, 223,
 227, 229, 233, 239, 241, 251, 257, 263, 
 269, 271, 277, 281, 283, 293, 307, 311 };

// cube root of integer
// return fractional part as integer
uint32_t cbr2int (uint32_t x) {
  uint32_t r;
  r = (uint32_t)(fabs(pow((double)p[x],1.0/3.0))*pow(2,32));
  return r;
}

void SHA256_Transform (SHA256X_CTX *ctx) 
{
  uint32_t t1, t2, i, j, t;
  uint32_t w[64], s[8];
  
  // load data in big endian format
  for (i=0; i<16; i++) {
    w[i] = SWAP32(ctx->buffer.v32[i]);
  }

  // expand data into 512-bit buffer
  for (i=16; i<64; i++) {
    w[i] = SIG1(w[i-2]) + w[i-7] + SIG0(w[i-15]) + w[i-16];
  }
  
  // load state into local buffer
  for (i=0; i<8; i++) {
    s[i] = ctx->state.v32[i];
  }
  
  // for 64 rounds
  for (i=0; i<64; i++)
  {
    t1 = s[7] + EP1(s[4]) + CH(s[4], s[5], s[6]) + w[i];
    t1 += cbr2int(i);
    t2 = EP0(s[0]) + MAJ(s[0], s[1], s[2]);
    s[7]  = t1 + t2;
    s[3] += t1;
    // rotate "right" 32-bits
    t1=s[0]; // load a
    for (j=1; j<8; j++) {
      t2=s[j];
      s[j]=t1;
      t1=t2;
    }
    s[0]=t1;
  }
  
  // save state
  for (i=0; i<8; i++) {
    ctx->state.v32[i] += s[i];
  }
}

uint32_t h[SHA256_LBLOCK]=
{ 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
  0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };

void SHA256 (uint32_t inlen, void *in, void *out)
{
  SHA256X_CTX ctx;
  uint32_t i, idx;
  uint8_t *p=(uint8_t*)in;
  
  // initialize context
  ctx.len = 0;
  memcpy (&ctx.state.v32[0], h, 8*4);
  
  // update state with input
  idx = 0;
  
  do {
    // last block?
    if (inlen==0) {
      // add the end bit
      ctx.buffer.v8[idx++] = 0x80;
      // zero the remainder of buffer
      memset (&ctx.buffer.v8[idx], 0, (64 - idx));
      // equal or exceed where to store total bits?
      if (idx >= 56) {
        // compress it
        SHA256_Transform ((SHA256_CTX*)&ctx);
        // zero buffer
        memset (ctx.buffer.v8, 0, 64);
      }
      // add total bits
      ctx.buffer.v32[15] = SWAP32 (ctx.len * 8);
      idx=64;
    } else
    {    
      ctx.buffer.v8[idx++] = *p++;
      ctx.len++;
    }
    if (idx==64) {
      // compress
      SHA256_Transform ((SHA256_CTX*)&ctx);
      idx=0;
    }
  } while (--inlen != -1);
  
  // swap byte order and return
  for (i=0; i<8; i++) {
    ((uint32_t*)out)[i] = SWAP32 (ctx.state.v32[i]);
  }
}

The x86 assembler version in 588 bytes.

; context
sha256_ctx struct
  state  dd 8 dup (?)
  len    dd ?
  buffer db 64 dup (?)
sha256_ctx ends

SHA256 proc
    pushad

    push   eax
    fstcw  [esp]            ; store control word
    pop    eax
    or     ax, 00400H       ; set round down bit
    and    ax, 0F7FFH       ; clear bit
    push   eax
    fldcw  [esp]            ; load control word
    pop    eax

    mov    ebx, esp          ; 
    sub    esp, 4+64+8*4     ; alloc space for ctx
    mov    edi, esp
    push   edi               ; save ptr to ctx
    
    push   8                 ; load 8 32-bit words
    pop    ecx
    call   load_iv
  dd 06a09e667h, 0bb67ae85h 
  dd 03c6ef372h, 0a54ff53ah
  dd 0510e527fh, 09b05688ch
  dd 01f83d9abh, 05be0cd19h
load_iv:
    pop    esi
    rep    movsd
    
    xor    eax, eax
    stosd                    ; ctx.len = 0
    cdq                      ; idx = 0
    
    mov    ebp, [ebx+32+ 4]  ; ecx = inlen
    mov    esi, [ebx+32+ 8]  ; esi = in
    mov    ebx, [ebx+32+12]  ; ebx = out
chk_len:
    test   ebp, ebp
    jnz    upd_buf
    
    push   edi
    add    edi, edx
    mov    ecx, edx
    mov    al, 80h           ; ctx.buffer[idx++] = 0x80;
    stosb
    xor    eax, eax
    neg    ecx
    add    ecx, 63
    rep    stosb
    pop    edi
    
    ; if (edx >= 56)
    cmp    edx, 56
    jb     add_bits
    
    call   SHA256_Transform
    
    push   edi               ; zero ctx.buffer
    mov    cl, 64
    rep    stosb
    pop    edi
    
add_bits:
    mov    eax, [edi-4]      ; eax = ctx.len
    shl    eax, 3            ; multiply by 8
    bswap  eax               ; convert to big endian
    mov    [edi+60], eax     ; save in buffer
    jmp    upd_ctx
upd_buf:
    lodsb                    ; al = *p++
    mov    [edi+edx], al
    inc    dword ptr[edi-4]  ; ctx.len++
    inc    edx               ; idx++
    cmp    edx, 64           ; buffer full?
    jnz    upd_len
upd_ctx:
    call   SHA256_Transform
    xor    edx, edx          ; idx=0
upd_len:
    dec    ebp               ; while (--inlen >= 0)
    jns    chk_len

    mov   edi, ebx           ; edi = out
    pop   esi                ; esi = ctx.state
    push  8                  ; save 256-bit state/hash
    pop   ecx
save_state:
    lodsd                    ; load word
    bswap  eax               ; convert to little endian
    stosd                    ; save in out buffer
    loop   save_state
    add    esp, 4+64+8*4     ; fix up stack
    popad
    ret
SHA256 endp

Because there are 8 32-bit values and only 8 32-bit general purpose registers on x86 in 32-bit mode, the stack is used much more than I would have liked but it can’t be helped.

s0 equ <eax>
s1 equ <ebx>
i  equ <ecx>
t1 equ <edx>
t2 equ <esi>
t3 equ <ebp>

_a textequ <dword ptr[esp][sha256_ws.h[0*4]]>
_b textequ <dword ptr[esp][sha256_ws.h[1*4]]>
_c textequ <dword ptr[esp][sha256_ws.h[2*4]]>
_d textequ <dword ptr[esp][sha256_ws.h[3*4]]>
_e textequ <dword ptr[esp][sha256_ws.h[4*4]]>
_f textequ <dword ptr[esp][sha256_ws.h[5*4]]>
_g textequ <dword ptr[esp][sha256_ws.h[6*4]]>
_h textequ <dword ptr[esp][sha256_ws.h[7*4]]>

; workspace for compression
sha256_ws struct
  h dd  8 dup (?)
  w dd 64 dup (?)
sha256_ws ends

SHA256_Transform proc
    pushad    
    
     ; save pointer to context
    mov    esi, edi
    sub    esi, 4 + 8*4
    push   esi
    
    ; allocate work space
    sub    esp, sizeof sha256_ws
    
    ; Initialize working variables 
    ; to current hash value:
    mov    edi, esp
    push   8
    pop    ecx
    rep    movsd
    ; skip len
    lodsd
    
    ; convert 16 words to big endian
    mov    cl, 16
swap_bytes:
    lodsd
    bswap  eax
    stosd
    loop   swap_bytes

    ; Extend the first 16 words into the 
    ; remaining 48 words w[16..63] of 
    ; the message schedule array:
    mov    cl, 48
expand:
    ; s0 := (w[i-15] rightrotate 7) 
    ;   xor (w[i-15] rightrotate 18) 
    ;   xor (w[i-15] rightshift 3)
    mov    s0, [edi - 15*4]
    mov    t1, s0
    mov    t2, s0
    ror    s0, 7
    ror    t1, 18
    shr    t2, 3
    xor    s0, t1
    xor    s0, t2
    ; s1 := (w[i-2] rightrotate 17) 
    ;   xor (w[i-2] rightrotate 19) 
    ;   xor (w[i-2] rightshift 10)
    mov    s1, [edi - 2*4]
    mov    t1, s1
    mov    t2, s1
    ror    s1, 17
    ror    t1, 19
    shr    t2, 10
    xor    s1, t1
    xor    s1, t2
    ; w[i] := w[i-16] + s0 + w[i-7] + s1
    add    s0, [edi - 16*4]
    add    s1, [edi -  7*4]
    add    s0, s1
    stosd
    loop   expand
    
    ; update context
update_loop:
    ; S1 := EP1(e)
    
    ; S1 := (e rightrotate 6) 
    ;   xor (e rightrotate 11) 
    ;   xor (e rightrotate 25)
    mov    s1, _e
    mov    t1, s1
    ror    s1, 25
    ror    t1, 6
    xor    s1, t1
    ror    t1, 11-6
    xor    s1, t1
    ; CH(e, f, g)
    ; ch := (e and f) xor ((not e) and g)
    mov    t1, _f
    xor    t1, _g
    and    t1, _e
    xor    t1, _g
    ; temp1 := h + S1 + ch + k[i] + w[i]
    add    t1, s1
    add    t1, _h
    call   cbr2int
    add    t1, eax
    add    t1, dword ptr [esp][sha256_ws.w[4*i]]
    ; S0 := EP0(a)
    ; S0 := (a rightrotate 2) 
    ;   xor (a rightrotate 13) 
    ;   xor (a rightrotate 22)
    mov    s0, _a
    mov    t2, s0
    ror    s0, 22
    ror    t2, 2
    xor    s0, t2
    ror    t2, 13-2
    xor    s0, t2
    ; MAJ(a, b, c)
    ; maj := (a and b) xor (a and c) xor (b and c)
    mov    t2, _a
    mov    t3, _a
    or     t2, _b
    and    t2, _c
    and    t3, _b
    or     t2, t3
    ; temp2 := S0 + maj
    add    t2, s0
    ; d = d + temp1
    add    _d, t1
    ; h = temp1 + temp2
    mov    _h, t1
    add    _h, t2
    ; shift state
    mov    esi, esp
    mov    edi, esp    
    push   i
    push   edi
    mov    cl, 7
    lodsd                   ; load a
shift_state:
    scasd                   ; skip a
    xchg   eax, [edi]       ; 
    loop   shift_state
    pop    edi
    stosd
    pop    i
    
    ; i++
    inc    i
    cmp    i, 64
    jne    update_loop
    
    mov    esi, esp
    add    esp, sizeof sha256_ws
    pop    edi
    mov    cl, 8
save_state:
    lodsd
    add    eax, [edi]
    stosd
    loop   save_state
    popad
    ret
SHA256_Transform endp

I discussed how to generate the K constants using FPU because it saves a number of bytes.

; get cubic root of number 
    ; return 32-bit fractional part as integer
cbr2int:
    push   1
    push   0
    fild   qword ptr[esp]   ; load 2^32
    push   1
    fild   dword ptr[esp]
    push   3
    fild   dword ptr[esp]
    fdivp                   ; 1.0/3.0
    movzx  eax, word ptr[primes+2*i]
    push   eax
    fild   dword ptr[esp]   ;
    fyl2x                   ; ->log2(Src1)*exponent
    fld    st(0)            ; copy the logarithm
    frndint                 ; keep only the characteristic
    fsub   st(1), st        ; keeps only the mantissa
    fxch                    ; get the mantissa on top
    f2xm1                   ; ->2^(mantissa)-1
    fscale
    fstp   st(1)            ; copy result over and "pop" it
    fmul                    ; * 2^32
    fistp  qword ptr[esp]   ; save integer
    pop    eax
    add    esp, 4*4         ; release stack
    ret

primes dw 2, 3, 5, 7, 11, 13, 17, 19
       dw 23, 29, 31, 37, 41, 43, 47, 53
       dw 59, 61, 67, 71, 73, 79, 83, 89
       dw 97, 101, 103, 107, 109, 113, 127, 131
       dw 137, 139, 149, 151, 157, 163, 167, 173
       dw 179, 181, 191, 193, 197, 199, 211, 223 
       dw 227, 229, 233, 239, 241, 251, 257, 263
       dw 269, 271, 277, 281, 283, 293, 307, 311
primes_len equ $-primes

Results

architecture algorithm size
x86 SHA-256 (integrated version with dynamic k) 588
x86 SHA-256 (dynamic k + h) 666
x86 SHA-256 (static k + h) 710

There are a few additional tweaks that might shave off a few bytes but nothing significant, at least not that I can see but leave some feedback if you manage to reduce it further.

sources here on github

Advertisements
This entry was posted in assembly, cryptography, encryption, security and tagged , , , , , , , , . Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s