Asmcodes: Serpent-256 Block cipher

Introduction

Serpent is a symmetric block cipher designed by Eli Biham, Ross Anderson and Lars Knudsen published in 1998. It was runner up to Rijndael which became the AES standard.

Like other AES finalists, it uses 128-bit block size, key sizes of 128, 192, 256-bits and 32 rounds of encryption.

The authors of Twofish (another AES finalist) in a paper The Twofish Team’s Final Comments on AES Selection regarded Serpent as having the highest security margin while also making recommendations for other ciphers in the selection.

algorithm safety factor
MARS 1.9
RC6 1.18
Rijndael 1.11/1.33/1.56
Serpent 3.56
Twofish 2.67
32-round RC6 2.00
18-round Rijndael 2.00
24-round Rijndael 2.67

Two of the finalists, MARS and RC6, simply do not work well in certain applications. Any one of the other three algorithms—Rijndael (with the extra rounds), Serpent, or Twofish would make an excellent standard.

Serpent has positioned itself as the conservative choice, while Rijndael (with its original 10 rounds) is clearly the fastest choice. We believe that the results of the straw poll at the Third AES Candidate Conference reflected this dichotomy: those that thought security was paramount chose Serpent, while those more concerned with performance chose Rijndael

Even though Serpent was considered the most secure, NIST finally selected Rijndael as the winner. Only time will tell which is the best.

The assembly implementation here, was a joint effort between myself and Peter Ferrie with Peter adding many improvements to original I initally wrote. It is optimized for size, NOT speed and derived from a modified C implementation originally written by Daniel Otte, the author of AVR-crypto-Lib.

Structures

Rather than do lots of casting for different integer sizes, a union serpent_blk is used.

#define GOLDEN_RATIO    0x9e3779b9l

#define SERPENT_ROUNDS  32
#define SERPENT_BLK_LEN 16
#define SERPENT_KEY256  32

#define SERPENT_ENCRYPT  0
#define SERPENT_DECRYPT  1

#define SERPENT_IP       0
#define SERPENT_FP       1

typedef union _serpent_blk_t {
  uint8_t v8[SERPENT_BLK_LEN];
  uint32_t v32[SERPENT_BLK_LEN/4];
  uint64_t v64[SERPENT_BLK_LEN/2];
} serpent_blk;

typedef uint32_t serpent_subkey_t[4];

typedef struct serpent_key_t {
  serpent_subkey_t x[SERPENT_ROUNDS+1];
} serpent_key;

Initial and Final Permutation

void permute (serpent_blk *out, 
  serpent_blk *in, int type) 
{
  uint8_t cy;   // carry 
  uint8_t n, m;
  
  for (n=0; n<SERPENT_BLK_LEN/4; n++) {
    out->v32[n]=0;
  }
  
  // initial
  if (type==SERPENT_IP)
  {
    for (n=0; n<16; n++) {
      for (m=0; m<8; m++) {
        cy = in->v32[m%4] & 1;
        in->v32[m%4] >>= 1;
        out->v8[n] = (cy << 7) | (out->v8[n] >> 1);
      }
    }
  } else {
    // final
    for (n=0; n<4; n++) {
      for (m=0; m<32; m++) {
        cy = in->v32[n] & 1;
        in->v32[n] >>= 1;
        out->v32[m%4] = (cy << 31) | (out->v32[m%4] >> 1);
      }
    }
  }
}

x86 assembly. The original code I wrote was purely C to ASM without realizing it could be optimized using RCR which was suggested by Peter Ferrie. Both functions were rewritten with suggestions you can see in the comments below. It may be possible to integrate both functions and save more space.

; void permute (out, in);
; edi = out
; esi = in
; CF = type
_permutex:
permute:
    pushad
    xchg   eax, ecx    ; ecx should be zero
    push   16
    pop    ecx
    pushad
    rep    stosb
    popad
    cdq            ; m=0
    jnc    do_fp
    ; initial permutation
ip_m_l:
    mov    eax, edx
    and    eax, 3
    shr    dword[esi+4*eax], 1
    rcr    byte[edi], 1
    inc    edx
    test   dl, 7
    jne    ip_m_l
    inc    edi
    loop   ip_m_l
xit_perm:
    popad
    ret
    ; final permutation
do_fp:
    mov    cl, 4    ; n=4
fp_m_l:
    mov    eax, edx
    and    eax, 3
    shr    dword[esi], 1
    rcr    dword[edi+4*eax], 1
    inc    edx
    test   dl, 31
    jne    fp_m_l
    lodsd
    loop   fp_m_l
    popad
    ret

Sbox function

#define HI_NIBBLE(b) (((b) >> 4) & 0x0F)
#define LO_NIBBLE(b) ((b) & 0x0F)

uint32_t serpent_gen_w (uint32_t *b, uint32_t i) {
  uint32_t ret;
  ret = b[0] ^ b[3] ^ b[5] ^ b[7] ^ GOLDEN_RATIO ^ i;
  return ROL32(ret, 11);
} 

// substitution box
void sbox128 (serpent_blk *blk, uint32_t box_idx, int type) 
{
  serpent_blk tmp_blk, sb;
  uint8_t *sbp;
  uint8_t i, t;
  
  box_idx &= 7;
  
  if (type==SERPENT_ENCRYPT) {
    sbp=(uint8_t*)&sbox[box_idx][0];
  } else {
    sbp=(uint8_t*)&sbox_inv[box_idx][0];
  }
  
  for (i=0; i<16; i+=2) {
    t = sbp[i/2];
    sb.v8[i+0] = LO_NIBBLE(t);
    sb.v8[i+1] = HI_NIBBLE(t);
  }
  
  permute (&tmp_blk, blk, SERPENT_IP);
  
  for (i=0; i<SERPENT_BLK_LEN; i++) {
    t = tmp_blk.v8[i];
    tmp_blk.v8[i] = (sb.v8[HI_NIBBLE(t)] << 4) | sb.v8[LO_NIBBLE(t)];
  }
  permute (blk, &tmp_blk, SERPENT_FP);
}
; CF=0 : encryption
; CF=1 : decryption
;
; edi = blk
; edx = i
_sbox128x:
sbox128:
    pushad
    call   init_sbox    
; sbox
  db 083h, 01Fh, 06Ah, 0B5h, 0DEh, 024h, 007h, 0C9h
  db 0CFh, 072h, 009h, 0A5h, 0B1h, 08Eh, 0D6h, 043h
  db 068h, 097h, 0C3h, 0FAh, 01Dh, 04Eh, 0B0h, 025h
  db 0F0h, 08Bh, 09Ch, 036h, 01Dh, 042h, 07Ah, 0E5h
  db 0F1h, 038h, 00Ch, 06Bh, 052h, 0A4h, 0E9h, 0D7h
  db 05Fh, 0B2h, 0A4h, 0C9h, 030h, 08Eh, 06Dh, 017h
  db 027h, 05Ch, 048h, 0B6h, 09Eh, 0F1h, 03Dh, 00Ah
  db 0D1h, 00Fh, 08Eh, 0B2h, 047h, 0ACh, 039h, 065h
; sbox_inv
  db 03Dh, 00Bh, 06Ah, 0C5h, 0E1h, 074h, 09Fh, 028h
  db 085h, 0E2h, 06Fh, 03Ch, 04Bh, 097h, 0D1h, 00Ah
  db 09Ch, 04Fh, 0EBh, 021h, 030h, 0D6h, 085h, 07Ah
  db 090h, 07Ah, 0EBh, 0D6h, 053h, 02Ch, 084h, 01Fh
  db 005h, 038h, 09Ah, 0E7h, 0C2h, 06Bh, 0F4h, 01Dh
  db 0F8h, 092h, 014h, 0EDh, 06Bh, 035h, 0C7h, 00Ah
  db 0AFh, 0D1h, 035h, 006h, 094h, 07Eh, 0C2h, 0B8h
  db 003h, 0D6h, 0E9h, 08Fh, 0C5h, 07Bh, 01Ah, 024h
  
init_sbox:
    pop    esi               ; esi=sbox
    jc     sb_and
    add    esi, 64           ; eax=sbox_inv
sb_and:
    and    edx, 7            ; %= 8
    lea    esi, [esi+8*edx]  ; esi = &sbox[i*8]
    mov    edx, edi
    call   ld_perm

; permute function here

ld_perm:
    pop    ebp
    
    sub    esp, 32           ; create 2 16-byte blocks
    mov    ebx, esp          ; ebx=sb
    mov    edi, esp          ; edi=sb
    mov    cl, 8             ; SERPENT_BLK_LEN/2
sb_l1:
    lodsb                    ; t = sbp[i/2];
    aam    16
    stosw
    loop   sb_l1
    
    ; permute (&tmp_blk, blk, SERPENT_IP);
    mov    esi, edx
    stc
    call   ebp ;permute
    mov    esi, edi
    mov    cl, SERPENT_BLK_LEN
    push   esi
sb_l2:
    lodsb
    aam    16
    xchg   ah, al
    xlatb
    xchg   ah, al
    xlatb
    aad    16 
    stosb
    loop   sb_l2
    
    pop    esi
    mov    edi, edx
    ; permute (blk, &tmp_blk, SERPENT_FP);
    call   ebp ;permute
    add    esp, 32
    popad
    ret

Key Expansion

// create serpent keys. only 256-bit is supported
void serpent_setkey (serpent_key *key, void *input) 
{
  union {
    uint8_t v8[32];
    uint32_t v32[8];
  } s_ws;
  
  uint32_t i, j;
  
  // copy key input to local buffer
  for (i=0; i<SERPENT_KEY256; i++) {
    s_ws.v8[i] = ((uint8_t*)input)[i];
  }

  // expand the key
  for (i=0; i<=SERPENT_ROUNDS; i++) {
    for (j=0; j<4; j++) {
      key->x[i][j] = serpent_gen_w (s_ws.v32, i*4+j);
      memmove (&s_ws.v8, &s_ws.v8[4], 7*4);
      s_ws.v32[7] = key->x[i][j];
    }
    sbox128 ((serpent_blk*)&key->x[i], 3 - i, SERPENT_ENCRYPT);
  }
}

The serpent_gen_w function was inlined since it’s only called once and only by setkey function.

_serpent_setkeyx:  
serpent_setkey:  
    pushad
    
    ; copy key into local memory
    push   32
    pop    ecx
    sub    esp, ecx
    mov    edi, esp
    mov    esi, [edi+64+8]  ; esi = input
    rep    movsb
    
    mov    esi, esp          ; esi = local key bytes
    mov    edi, [edi+32+4]   ; edi = key ctx
    ; ecx will be i which is now 0
skey_l1:
    xor    edx, edx           ; j=0
skey_l2:
    ; serpent_gen_w (s_ws.v32, j+4*i);
    mov    eax, [esi]
    xor    eax, [esi+3*4]
    xor    eax, [esi+5*4]
    xor    eax, [esi+7*4]
    xor    eax, GOLDEN_RATIO
    lea    ebx, [edx+4*ecx]
    xor    eax, ebx
    rol    eax, 11
    mov    [edi+4*edx], eax
    
    ; rotate workspace left 32 bits
    pushad
    push   esi
    cmpsd
    pop    edi
    mov    cl, 7
    rep    movsd
    stosd
    popad
    
    ; j++
    inc    edx
    cmp    edx, 4
    jne    skey_l2
    
    ; apply sbox
    dec    edx    
    sub    edx, ecx          ; 3 - i
    stc                      ; CF=1 for encrypt
    call   sbox128
    
    ; advance key
    add    edi, 16
    
    ; i++
    inc    ecx
    cmp    ecx, 32
    jbe    skey_l1
    
    add    esp, 32
    popad
    ret

Linear Transformation

// linear transformation
void serpent_lt (serpent_blk* x, int enc) 
{
  uint32_t x0, x1, x2, x3;
  
  // load
  x0=x->v32[0];
  x1=x->v32[1];
  x2=x->v32[2];
  x3=x->v32[3];
  
  if (enc==SERPENT_DECRYPT) {
    x2 = ROL32(x2, 10);
    x0 = ROR32(x0, 5);
    x2 ^= x3 ^ (x1 << 7);
    x0 ^= x1 ^ x3;
    x3 = ROR32(x3, 7);
    x1 = ROR32(x1, 1);
    x3 ^= x2 ^ (x0 << 3);
    x1 ^= x0 ^ x2;
    x2 = ROR32(x2,  3);
    x0 = ROR32(x0, 13);
  } else {
    x0 = ROL32(x0, 13);
    x2 = ROL32(x2,  3);
    x1 ^= x0 ^ x2;
    x3 ^= x2 ^ (x0 << 3);
    x1 = ROL32(x1, 1);
    x3 = ROL32(x3, 7);
    x0 ^= x1 ^ x3;
    x2 ^= x3 ^ (x1 << 7);
    x0 = ROL32(x0, 5);
    x2 = ROR32(x2, 10);
  }
  // save
  x->v32[0]=x0;
  x->v32[1]=x1;
  x->v32[2]=x2;
  x->v32[3]=x3;
}
%define x0 eax
%define x1 ebx
%define x2 edx
%define x3 ebp
%define x4 esi

; ecx=0 for encrypt
; ecx=1 for decrypt
;
; edi = input
_serpent_ltx:
serpent_lt:
    pushad
    mov    esi, edi
    
    lodsd
    xchg   x0, x3
    lodsd
    xchg   x0, x1
    lodsd
    xchg   x0, x2
    lodsd
    xchg   x0, x3
    
    jecxz  lt_enc   ; if ecx=0, encryption
    
    rol    x2, 10
    ror    x0, 5
    mov    x4, x1
    shl    x4, 7
    xor    x2, x4
    xor    x2, x3
    xor    x0, x1
    xor    x0, x3
    ror    x3, 7
    ror    x1, 1
    mov    x4, x0
    shl    x4, 3
    xor    x3, x4
    xor    x3, x2
    xor    x1, x0
    xor    x1, x2
    ror    x2, 3
    ror    x0, 13
    jmp    lt_end
lt_enc:
    rol    x0, 13
    rol    x2, 3
    xor    x1, x0
    xor    x1, x2
    xor    x3, x2
    mov    x4, x0
    shl    x4, 3
    xor    x3, x4
    rol    x1, 1
    rol    x3, 7
    xor    x0, x1
    xor    x0, x3
    mov    x4, x1
    shl    x4, 7
    xor    x2, x3
    xor    x2, x4
    rol    x0, 5
    ror    x2, 10
lt_end:
    stosd
    xchg   x0, x1
    stosd
    xchg   x0, x2
    stosd
    xchg   x0, x3
    stosd
    popad
    ret

Encryption and Decryption

void serpent_encrypt (void *in, serpent_key *key, int enc)
{
  int8_t i;
  serpent_blk *out=in;
  
  if (enc==SERPENT_DECRYPT)
  {
    i=SERPENT_ROUNDS;
    blkxor (out, key, i);
    for (;;) {
      --i;
      // apply sbox
      sbox128 (out, i, SERPENT_DECRYPT);
      // xor with subkey
      blkxor (out, key, i);
      if (i==0) break;
      // linear transformation
      serpent_lt (out, SERPENT_DECRYPT);
    }
  } else {
    i=0;
    for (;;) {
      // xor with subkey
      blkxor (out, key, i);
      // apply sbox
      sbox128 (out, i, SERPENT_ENCRYPT);
      if (++i == SERPENT_ROUNDS) break;
      // linear transformation
      serpent_lt (out, SERPENT_ENCRYPT);
    }
    blkxor (out, key, i);
  }
}
_serpent_encryptx:
serpent_encrypt:
    pushad
    lea    esi, [esp+32+ 4]
    lodsd
    xchg   edi, eax         ; out
    lodsd
    xchg   ecx, eax
    lodsd
    xchg   ecx, eax         ; enc
    xchg   esi, eax         ; key
    call   ld_fn
    ; ****************************
blkxor:
    pushad
    mov    cl, 4
    shl    edx, cl
blk_l:
    mov    eax, [esi+edx]
    xor    [edi], eax
    cmpsd
    loop   blk_l
    popad
    ret

; linear transformation function goes here
; sbox function here
; permute function here

ld_fn:
    ; *********************************
    pop    ebp
    lea    ebx, [ebp+(serpent_lt-blkxor)]
    lea    eax, [ebx+(sbox128-serpent_lt)]
    xor    edx, edx
    jecxz  se_e
    mov    dl, SERPENT_ROUNDS
    ; blkxor (out, key, i);
    call   ebp ; blkxor
    jmp    sd_e
sd_l:
    ; serpent_lt (out, SERPENT_DECRYPT);
    call   ebx ; serpent_lt
sd_e:
    dec    edx               ; --i
    ; sbox128 (out, i, SERPENT_DECRYPT);
    clc
    call   eax ; sbox128
    ; blkxor (out, key, i);
    call   ebp ; blkxor
    test   edx, edx
    jnz    sd_l
    popad
    ret
se_l:
    ; serpent_lt (out, SERPENT_ENCRYPT);
    call   ebx ; serpent_lt
se_e:
    ; blkxor (out, key, i);
    call   ebp ; blkxor
    ; sbox128 (out, i, SERPENT_ENCRYPT);
    stc
    call   eax ; sbox128
    inc    edx
    cmp    edx, SERPENT_ROUNDS
    jne    se_l
    call   ebp ; blkxor
    popad
    ret
    ; *********************************

Results

architecture size in bytes
x86 536
x64 n/a

Check out sources here and here for any future updates.

Advertisements
This entry was posted in assembly, cryptography, encryption, programming and tagged , , , , , . Bookmark the permalink.

8 Responses to Asmcodes: Serpent-256 Block cipher

  1. yanapermana says:

    Nice asm…
    How many hours you spent for writing this code, brother?

    Like

    • Odzhan says:

      Hello and Thank you. I think more time was spent optimizing the C code and analyzing output of different compilers to see what worked best before writing ASM.

      I’d spend time thinking of how to get everything smaller in C first so I honestly couldn’t say how long it took. Most of the work was done 6+ months ago and I only wrote/tested ASM yesterday in few hours maybe.

      It’s usually just a gradual process but writing the ASM didn’t take long once the C code was clean. It can still be improved but I wanted to draw a line under it.

      Like

  2. Peter Ferrie says:

    Some suggestions for shrinking the code further:

    mov ebx, [esi+4*eax]
    shr dword[esi+4*eax], 1
    and ebx, 1
    shl ebx, 7
    shr byte[edi+ecx], 1
    or byte[edi+ecx], bl

    can become

    shr dword[esi+4*eax], 1
    rcr byte[edi+ecx], 1

    mov ebx, [esi+4*ecx]
    shr dword[esi+4*ecx], 1
    and ebx, 1
    shl ebx, 31

    shr dword[edi+4*eax], 1
    or dword[edi+4*eax], ebx

    can become

    shr dword[esi+4*ecx], 1

    rcr dword[edi+4*eax], 1

    lodsb ; t = sbp[i/2];
    mov dl, al ; sb.v8[i+0] = HI_NIBBLE(t);
    shr al, 4
    stosb
    xchg eax, edx
    and al, 15

    can become

    lodsb
    aam 16
    xchg ah, al
    stosb
    xchg ah, al
    stosb

    lodsb ; t = tmp_blk.v8[i];
    mov dl, al
    shr al, 4 ; sb.v8[HI_NIBBLE(t)] << 4)
    xlatb
    shl al, 4
    xchg eax, edx
    and al, 15 ; sb.v8[LO_NIBBLE(t)]
    xlatb
    or al, dl

    can become

    lodsb
    aam 16
    xchg ah, al
    xlatb
    xchg ah, al
    xlatb
    aad 16

    pushfd
    rep stosb
    popfd

    can become
    rep stosb

    because flags are not affected here.
    I will investigate further when I have more time…

    Liked by 1 person

    • Odzhan says:

      Peter, those are awesome suggestions and I’ve updated to reflect. I also changed use of carry flag. STC is used for encryption. Just makes more sense CF=1 for that.

      Funnily enough, I was thinking of using RCR when writing those initial/final permutation functions but not to the extent you suggested. Impressed 🙂 Thanks a lot for those.

      Probably the main area of improvement could be in set key function. I still don’t like it but for now, it works okay. The other thing I thought about was possibility of updating the key which is stored in ESI during call to blkxor and i was going to use CLD/STD to move it up or down but I keep thinking it’ll eventually end up with more code than just ADD/SUB so I probably will just leave it for now 🙂

      Like

      • Peter Ferrie says:

        Here’s my attempt to improve it further:

        pushad
        rep stosb
        popad

        can become

        push edi
        rep stosb
        pop edi

        lea esi, [sbox] ; encryption sbox
        lea eax, [sbox_inv] ; decryption sbox

        should be

        mov esi, offset sbox ; encryption sbox
        mov eax, offset sbox_inv ; decryption sbox

        for smaller code

        xchg ah, al
        ; sb.v8[i+0] = HI_NIBBLE(t);
        stosb
        xchg ah, al
        ; sb.v8[i+1] = LO_NIBBLE(t);
        stosb

        can become

        xchg ah, al
        ; sb.v8[i+0] = HI_NIBBLE(t);
        ; sb.v8[i+1] = LO_NIBBLE(t);
        stosw

        mov eax, esp ; save in eax before saving blk in edi

        xchg eax, edi

        can become

        mov ebx, esp

        mov edi, ebx

        and no need to load ebx later

        xor eax, eax

        setb al

        test eax, eax
        jz skey_il ; skip padding
        or byte [esi+ebx], al

        can become

        setb al

        or byte [esi+ebx], al

        no need to zero eax, and the test isn’t needed, since the “or” is harmless if al is zero.

        lea ebx, [edx+4*ecx]

        should move above skey_jl for better performance

        mov edx, ecx ; ecx=i
        neg edx ;
        add edx, 3 ; i – 3

        can become

        push 3
        pop edx
        sub edx, ecx ; 3 – i

        push 16
        pop ecx
        pushad
        rep movsb
        popad
        shl ecx, 1 ; *= 2 for default SERPENT_ROUNDS
        mov esi, ecx ; esi = 16
        shl esi, 4

        can become

        push 16
        pop ecx
        rep movsb
        mov cl, 32 ; *= 2 for default SERPENT_ROUNDS
        imul esi, ecx, 16 ; esi = rounds*16

        Liked by 1 person

  3. Pingback: Asmcodes: Twofish-256 | Odzhan

  4. PELock says:

    1. Here:

    ; j++
    inc edx
    cmp edx, 4
    jne skey_l2

    The j counter is set to 0 and is checked against 4, use cmp dl,4 it will do the job and you’re it’s byte smaller, use the same strategy for any loops that starts with 0 and doesn’t exceed 0xFF 😉

    2. Also I’ve noticed this is used twice:


    add esp, 32
    popad
    ret

    put a global label on that and jump from the other code to this label

    common_return_epilog:
    add esp, 32
    popad
    ret

    Liked by 1 person

    • Odzhan says:

      Hello, perhaps it depends on assembler but the size of code remains same.

      /* 01FB */ “x42” /* inc edx */
      /* 01FC */ “x83xfax04” /* cmp edx, 0x4 */
      /* 01FF */ “x75xd5” /* jnz 0x1d6 */

      /* 01FB */ “x42” /* inc edx */
      /* 01FC */ “x80xfax04” /* cmp dl, 0x4 */
      /* 01FF */ “x75xd5” /* jnz 0x1d6 */

      I thought the short jmp could work but unfortunately the distance was over 2 bytes too long 🙂

      Having said that, If we can save a couple of bytes in between, which is no doubt possble, it’ll work then so I’ll keep it mind.

      Thanks!

      Like

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s