MD4 cryptographic hash

Introduction

The MD4 Message-Digest Algorithm RFC 1320 has been considered broken by the security community for well over a decade. RFC 6150 published in 2011 outlines the various reasons why it should cease being used so there’s no need to discuss problems with it here.

It’s still used in many operating systems today for password security and protocols. Moreoever, the design of many other hashing algorithms such as MD5, SHA-1 and RIPEMD were derived from MD4 which leads one to reasonable conclusion such algorithms will be part of our computing world indefinitely.

Microsoft Windows uses NTLM algorithm which is essentially an MD4 hash of UTF-16 password.
Early implementations of Kerberos for Windows, MS-CHAPv1 and MS-CHAPv2 just to name a few all require MD4 to function so even though we know it’s unsuitable for security today, it will be around for many years to come, mostly for legacy purposes.

What I’ve done is optimize an implementation for size to determine the suitability of its overall design for applications that require small footprint.

Initialization

First thing we do is initialize our state structure using known constants.
The value of these constants are not derived on anything specific.

#define MD4_CBLOCK        64
#define MD4_DIGEST_LENGTH 16
#define MD4_LBLOCK        MD4_DIGEST_LENGTH/4

#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))

#pragma pack(push, 1)
typedef struct _MD4_CTX {
  union {
    uint8_t   v8[MD4_DIGEST_LENGTH];
    uint32_t v32[MD4_DIGEST_LENGTH/4];
  } state;
  union {
    uint8_t   v8[MD4_CBLOCK];
    uint32_t v32[MD4_CBLOCK/4];
    uint64_t v64[MD4_CBLOCK/8];
  } buffer;
  uint64_t len;
} MD4_CTX;
#pragma pack(pop)

In assembly, the structure and definitions.

MD4_CBLOCK        EQU 64
MD4_DIGEST_LENGTH EQU 16
MD4_LBLOCK        EQU MD4_DIGEST_LENGTH / 4

MD4_CTX struct 
  union state
    v8  byte  MD4_DIGEST_LENGTH dup (?)
    v32 dword MD4_LBLOCK dup (?)
  ends
  buffer byte MD4_CBLOCK dup (?)
  len    dword 2 dup (?)
MD4_CTX ends

I have not defined structure for assembler version like C because assembly code doesn’t require casting when accessing data of different types.
The pragma directive ensures the structure is exactly the same size in assembler version.

Initialization

void MD4_Init (MD4_CTX *c) {
  c->len  = 0;
  c->s.w[0] = 0x67452301;
  c->s.w[1] = 0xefcdab89;
  c->s.w[2] = 0x98badcfe;
  c->s.w[3] = 0x10325476;
}

In x86 assembly..

public MD4_Init
MD4_Init proc C
    pushad
    mov    edi, [esp+32+4]       ; context
    xor    eax, eax
    mov    [edi][MD4_CTX.len+0], eax
    mov    [edi][MD4_CTX.len+4], eax
    ; ctx->state.v32[0] = 0x67452301
    mov    eax, 067452301h
    stosd
    ; ctx->state.v32[1] = 0xefcdab89
    mov    eax, 0efcdab89h
    stosd
    ; ctx->state.v32[2] = 0x98badcfe
    mov    eax, 098badcfeh
    stosd
    ; ctx->state.v32[3] = 0x10325476
    mov    eax, 010325476h
    stosd
    popad
    ret
MD4_Init endp

Now we have our state initialized, we add data to the state buffer and once it reaches our block length, will update the 4 variables initialized. First, in C.

Updating state

void MD4_Update (MD4_CTX *c, void *in, uint32_t len) {
  uint8_t *p = (uint8_t*)in;
  uint32_t r, idx;
  
  if (len==0) return;
  
  // get buffer index
  idx = c->len & MD4_CBLOCK - 1;
  
  // update length
  c->len += len;
  
  while (len > 0) {
    r = MIN (len, MD4_CBLOCK - idx);
    memcpy ((void*)&c->buf.b[idx], p, r);
    if ((idx + r) < MD4_CBLOCK) break;
    
    MD4_Transform (c);
    len -= r;
    idx = 0;
    p += r;
  }
}

I had used size_t in initial version but this varies in size depending on architecture.

public MD4_Update
MD4_Update proc C
    pushad
    mov    ecx, [esp+32+12]
    jecxz  exit_update
    xchg   eax, ecx
    
    ; get buffer index
    mov    ebx, [esp+32+4]    ; context
    mov    esi, [esp+32+8]    ; input

    ; idx = ctx->len & MD4_CBLOCK - 1;
    mov    edx, [ebx][MD4_CTX.len]
    and    edx, MD4_CBLOCK - 1
    
    ; limit of (2^32)-1 bytes each update
    ; ctx->len += len;
    add    [ebx][MD4_CTX.len+0], eax
    adc    [ebx][MD4_CTX.len+4], 0
    
    .while 1
      ; r = MIN(len, MD4_CBLOCK - idx);
      push   MD4_CBLOCK
      pop    ecx
      sub    ecx, edx
      cmp    ecx, eax
      cmovae ecx, eax
      ; memcpy ((void*)&ctx->buffer[idx], p, r);
      lea    edi, [ebx][MD4_CTX.buffer][edx]
      ; idx += r
      add    edx, ecx
      ; len -= r
      sub    eax, ecx
      rep    movsb
      ; if ((idx + r) < MD4_CBLOCK) break;
      .break .if edx < MD4_CBLOCK
      push   ebx
      call   MD4_Transform
      cdq
    .endw
exit_update:
    popad
    ret
MD4_Update endp

Finalization

Our final result is based on calculating the length in bits of data and appending byte to end.

void MD4_Final (void* dgst, MD4_CTX *c)
{
  // see what length we have ere..
  uint32_t len=c->len & MD4_CBLOCK - 1;
  // fill with zeros
  memset (&c->buf.b[len], 0, MD4_CBLOCK - len);
  // add the end bit
  c->buf.b[len] = 0x80;
  // if exceeding 56 bytes, transform it
  if (len >= 56) {
    MD4_Transform (c);
    // clear
    memset (c->buf.b, 0, MD4_CBLOCK);
  }
  // add total bits
  c->buf.q[7] = c->len * 8;
  // compress
  MD4_Transform(c);
  // copy digest to buffer
  memcpy (dgst, c->s.b, MD4_DIGEST_LENGTH);
}
public MD4_Final
MD4_Final proc C
    pushad
    mov    esi, [esp+32+8]   ; context
    mov    edi, [esp+32+4]   ; dgst
    
    ; uint64_t len=ctx->len & MD4_CBLOCK - 1;
    mov    ecx, [esi][MD4_CTX.len+0]
    and    ecx, MD4_CBLOCK - 1
    
    ; fill with zeros
    ; memset (&ctx->buffer.v8[len], 0, MD4_CBLOCK - len);
    pushad
    lea    edi, [esi][MD4_CTX.buffer][ecx]
    neg    ecx
    add    ecx, MD4_CBLOCK
    xor    eax, eax
    rep    stosb
    popad 
    
    ; ctx->buffer.v8[len] = 0x80;
    mov    byte ptr[esi][MD4_CTX.buffer][ecx], 80h
    
    ; if (len >= 56)
    cmp    ecx, 56
    .if !carry?
      push esi
      call MD4_Transform
      
      ; memset (ctx->buffer.v8, 0, MD4_CBLOCK);
      push edi
      lea  edi, [esi][MD4_CTX.buffer]
      push MD4_CBLOCK/4
      pop  ecx
      xor  eax, eax
      rep  stosd
      pop  edi
    .endif
    
    ; add total bits
    ; ctx->buffer.v64[7] = ctx->len * 8;
    mov    eax, [esi][MD4_CTX.len+0]
    mov    edx, [esi][MD4_CTX.len+4]
    shld   edx, eax, 3
    shl    eax, 3
    mov    dword ptr[esi][MD4_CTX.buffer+56], eax
    mov    dword ptr[esi][MD4_CTX.buffer+60], edx  
    
    ; compress
    ; MD4_Transform(ctx);
    push   esi    
    call   MD4_Transform
    
    ; copy digest to buffer
    ; memcpy (dgst, ctx->state.v8, MD4_DIGEST_LENGTH);
    push   MD4_LBLOCK
    pop    ecx
    rep    movsd
    popad
    ret    
MD4_Final endp

The bulk of computation occurs in the transform/compression function which is optimized for size, not speed.

#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
#define G(x, y, z) (((x) & (y)) | ((z) & ((x) | (y))))
#define H(x, y, z) ((x) ^ (y) ^ (z))

/************************************************
 *
 * Transform block of data.
 *
 ************************************************/
void MD4_Transform (MD4_CTX *c) 
{
  uint32_t i, t;
  uint32_t s[4];
  
  // increment by 4
  uint8_t rotf[] = { 3, 7, 11, 19 };
  uint8_t rotg[] = { 3, 5,  9, 13 };
  uint8_t roth[] = { 3, 9, 11, 15 };

  // increment by 4 mod 15
  uint8_t idxg[] = 
  { 0, 4,  8, 12, 
    1, 5,  9, 13, 
    2, 6, 10, 14,  
    3, 7, 11, 15 };
    
  // increment by 8 mod 12, 18, 12, 21
  uint8_t idxh[] = 
  { 0,  8, 4, 12, 
    2, 10, 6, 14, 
    1,  9, 5, 13,  
    3, 11, 7, 15 };

  for (i=0; i<4; i++) {
    s[i]=c->s.w[i];
  }
  
  // for 48 rounds
  for (i=0; i<48; i++) {
    if (i < 16) {
      s[0] += F(s[1], s[2], s[3]) + 
        c->buf.w[i];
      t = rotf[i%4];
    } else if (i < 32) {
      s[0] += G(s[1], s[2], s[3]) + 
        c->buf.w[idxg[i%16]] + 0x5a827999L;
      t = rotg[i%4];
    } else {
      s[0] += H(s[1], s[2], s[3]) + 
        c->buf.w[idxh[i%16]] + 0x6ed9eba1L;
      t = roth[i%4];
    }
    t=ROTL32(s[0], t);
    s[0]=s[3];
    s[3]=s[2];
    s[2]=s[1];
    s[1]=t;
  }

  for (i=0; i<4; i++) {
    c->s.w[i] += s[i];
  }
}
_a equ <eax>  ; don't change
_b equ <ebx>
_c equ <ebp>
_d equ <edx>

_i equ <esi>
t1 equ <edi>
t2 equ <ecx>

; update context with input
MD4_Transform proc stdcall
    pushad
    
    ; load context
    mov    esi, [esp+32+4]  ; ctx
    
    ; load context
    ; a = ctx->state.v32[0]
    lodsd
    xchg   _d, eax
    ; b = ctx->state.v32[1]
    lodsd
    xchg   _b, eax
    ; c = ctx->state.v32[2]
    lodsd
    xchg   _c, eax
    ; d = ctx->state.v32[3]
    lodsd
    xchg   _d, eax
    
    push   esi
    
    xor    _i, _i
    
    .repeat
      mov    t1, [esp]   
      movzx  t2, byte ptr idx[_i]     ; idx[i]
      add    _a, [t1+4*t2]            ; a += x[i]
      mov    t2, _i
      and    t2, 3                    ; i %= 4
      mov    t1, _c
      .if     _i < 16
        mov   cl, byte ptr rotf[t2]    ; rotf[i%4]
        xor   t1, _d
        and   t1, _b
        xor   t1, _d
      .elseif _i < 32
        mov   cl, byte ptr rotg[t2]    ; rotg[i%4]
        push  ecx
        mov   t2, _c
        or    t1, _d
        and   t1, _b
        and   t2, _d
        or    t1, t2
        pop   ecx
        add   t1, 05a827999h
      .else
        mov   cl, byte ptr roth[t2]    ; roth[i%4]
        xor   t1, _b
        xor   t1, _d
        add   t1, 06ed9eba1h
      .endif
      
      add    _a, t1
      rol    _a, cl
      
      mov    t1, _a
      mov    _a, _d
      mov    _d, _c
      mov    _c, _b
      mov    _b, t1
      
      inc    _i
    .until _i == 48
    
    pop    esi
    mov    edi, [esp+32+4]
    
    ; save context
    ; ctx->state.v32[0] += a;
    add    [edi], _a
    scasd
    ; ctx->state.v32[1] += b;
    add    [edi], _b
    scasd
    ; ctx->state.v32[2] += c;
    add    [edi], _c
    scasd
    ; ctx->state.v32[3] += d;
    add    [edi], _d
    ; restore registers
    popad
    retn   4
MD4_Transform endp

Initial Results

cl /Fa /O2 /Os /GS- /c md4.c
jwasm -bin md4.asm

Not terribly impressive if size is a constraint.

architecture size
x86 393
x64 481

Conclusion

For its day, MD4 was a reasonably good hash algorithm but for systems with limited resources, it’s too bulky. There are many better algorithms that have surfaced since which we’ll look at later.

Advertisements
This entry was posted in assembly, cryptography, encryption, programming and tagged , , , , . Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s