Serpent Block Cipher

Posted on February 2, 2016 by odzhan

Introduction

Serpent is a 128-bit block cipher with support for 128,192 and 256-bit key sizes. It was designed by Eli Biham, Ross Anderson, Lars Knudsen and published in 1998. It was runner up to Rijndael that was selected to become the Advanced Encryption Standard (AES). The authors of Twofish (another AES finalist) stated in a paper The Twofish Team’s Final Comments on AES Selection regard Serpent as having the highest security margin.

Algorithm	Safety factor
MARS	1.9
RC6	1.18
Rijndael	1.11/1.33/1.56
Serpent	3.56
Twofish	2.67
32-round RC6	2.00
18-round Rijndael	2.00
24-round Rijndael	2.67

Two of the finalists, MARS and RC6, simply do not work well in certain applications. Any one of the other three algorithms—Rijndael (with the extra rounds), Serpent, or Twofish would make an excellent standard.

Serpent has positioned itself as the conservative choice, while Rijndael (with its original 10 rounds) is clearly the fastest choice. We believe that the results of the straw poll at the Third AES Candidate Conference reflected this dichotomy: those that thought security was paramount chose Serpent, while those more concerned with performance chose Rijndael

The assembly implementation here, was a joint effort between myself and Peter Ferrie.

Compact code

#define R(v,n)(((v)>>(n))|((v)<<(32-(n))))
#define X(a,b)t=a,a=b,b=t
#define HI(b)(((b)>>4)&0x0F)
#define LO(b)((b)&0x0F)
#define F(a,b)for(a=0;a<b;a++)

typedef unsigned char B;
typedef unsigned int W;
void sbox(W*, W);

void serpent(void*mk,void*data) {
    W i,j,s,rk[4],k[8],*x=data;

    F(i,8)k[i]=((W*)mk)[i];
      
    for(i=0;;) {
      F(j,4) {
        rk[j]=R((k[0]^k[3]^k[5]^k[7]^0x9e3779b9UL^(i*4+j)),21);
        F(s,7)k[s]=k[s+1];
        k[7]=rk[j];
      }
      sbox(rk,3-i);
      
      x[0]^=rk[0];x[1]^=rk[1];
      x[2]^=rk[2];x[3]^=rk[3];
      
      if(i==32)break;
      sbox(x,i);
      
      if(++i!=32) {
        x[0]=R(x[0],19);x[2]=R(x[2],29);
        x[1]^=x[0]^x[2];x[3]^=x[2]^(x[0]<<3);
        x[1]=R(x[1],31);x[3]=R(x[3],25);
        x[0]^=x[1]^x[3];x[2]^=x[3]^(x[1]<<7);
        x[0]=R(x[0],27);x[2]=R(x[2],10);
      }
    }
}

void sbox(W *x, W idx) {
    B s[16],p[16],t,i,j,c;
    
    B sbox[8][8] = 
    { { 0x83, 0x1F, 0x6A, 0xB5, 0xDE, 0x24, 0x07, 0xC9 },
      { 0xCF, 0x72, 0x09, 0xA5, 0xB1, 0x8E, 0xD6, 0x43 },
      { 0x68, 0x97, 0xC3, 0xFA, 0x1D, 0x4E, 0xB0, 0x25 },
      { 0xF0, 0x8B, 0x9C, 0x36, 0x1D, 0x42, 0x7A, 0xE5 },
      { 0xF1, 0x38, 0x0C, 0x6B, 0x52, 0xA4, 0xE9, 0xD7 },
      { 0x5F, 0xB2, 0xA4, 0xC9, 0x30, 0x8E, 0x6D, 0x17 },
      { 0x27, 0x5C, 0x48, 0xB6, 0x9E, 0xF1, 0x3D, 0x0A },
      { 0xD1, 0x0F, 0x8E, 0xB2, 0x47, 0xAC, 0x39, 0x65 }};

    for(i=0;i<16;i+=2) {
      t=sbox[idx%8][i/2];
      s[i+0]=LO(t);
      s[i+1]=HI(t);
    }
    
    // initial permutation
    F(i,16) {
      F(j,8) {
        c=x[j%4]&1;
        x[j%4]>>=1;
        p[i]=(c<<7)|(p[i]>>1);
      }
    }
    
    F(i,16)p[i]=(s[HI(p[i])]<<4)|s[LO(p[i])];

    // final permutation
    F(i,4) {
      F(j,32) {
        c=((W*)p)[i]&1;
        ((W*)p)[i]>>=1;
        x[j%4]=(c<<31)|(x[j%4]>>1);
      }
    }
}

x86 assembly

; -----------------------------------------------
; Serpent-256 block cipher in x86 assembly (Encryption only)
;
; size: 364 bytes
;
; global calls use cdecl convention
;
; -----------------------------------------------
    bits 32
    
    %ifndef BIN
      global serpent
    %endif

    %define x  edi
    %define k  esi
    
    %define x0 eax
    %define x1 ebx
    %define x2 ecx
    %define x3 edx
    %define x4 ebp

serpent:
    pushad
    mov    esi, [esp+32+4]    ; esi = mk
    ; allocate 64 bytes of memory
    pushad
    pushad
    ; copy 256-bit key to local memory
    ; F(i,8)k[i]=((W*)mk)[i];
    mov    edi, esp
    push   8
    pop    ecx
    rep    movsd
sx_main:
    ; create a subkey for this round
    ; rk[j]=R((k[0]^k[3]^k[5]^k[7]^0x9e3779b9UL^(i*4+j)),21);
    xor    edx, edx           ; edx = j
    mov    esi, esp           ; esi = k
    lea    edi, [esi+32]      ; edi = rk
sx_subkey:
    mov    eax, [esi]         ; t =k[0]
    xor    eax, [esi+3*4]     ; t^=k[3]
    xor    eax, [esi+5*4]     ; t^=k[5]
    xor    eax, [esi+7*4]     ; t^=k[7]
    xor    eax, 0x9e3779b9    ; t^=0x9e3779b9UL
    lea    ebp, [edx+4*ecx]   ; ebp=(j+4*i)
    xor    eax, ebp           ; t^=ebp
    ror    eax, 21            ; t = R(t, 21)
    mov    [edi+4*edx], eax   ; rk[j]=t
    
    ; F(s,7)k[s]=k[s+1];
    pushad
    push   esi                ; save k
    cmpsd                     ; esi = &k[1]
    pop    edi                ; edi = &k[0]
    mov    cl, 7
    rep    movsd
    ; k[7]=rk[j];
    stosd
    popad
    
    ; j++
    inc    edx
    cmp    dl, 4
    jne    sx_subkey
    
    ; sbox(rk, 3-i);
    dec    edx    
    sub    edx, ecx           ; 3 - i
    call   sbox
    
    mov    esi, [esp+96+8]    ; esi = data
        
    ; linear layer  
    ; x[0]^=rk[0];x[1]^=rk[1];
    ; x[2]^=rk[2];x[3]^=rk[3];
    pushad
    mov    cl, 4
xor_key:
    mov    eax, [edi]         ; eax = rk[i]
    xor    [esi], eax         ; x[i]^= eax
    cmpsd                     ; advance esi and edi
    loop   xor_key
    popad
    
    ; if(i==32)break;
    cmp    cl, 32
    je     sx_end
    
    ; nonlinear layer
    ; sbox(x, i);
    mov    edx, ecx
    mov    edi, esi
    call   sbox
    
    ; if(++i != 32) {
    inc    ecx
    cmp    cl, 32
    je     sx_main
    
    ; linear layer
    pushad
    lodsd
    xchg   x3, eax
    lodsd
    xchg   x1, eax
    lodsd
    xchg   x2, eax
    lodsd
    xchg   x3, eax
    ; x[0]=R(x[0],19);x[2]=R(x[2],29);
    ror    x0, 19
    ror    x2, 29
    ; x[1]^=x[0]^x[2];x[3]^=x[2]^(x[0]<<3);
    xor    x1, x0
    xor    x1, x2
    xor    x3, x2
    mov    x4, x0
    shl    x4, 3
    xor    x3, x4
    ; x[1]=R(x[1],31);x[3]=R(x[3],25);
    ror    x1, 31
    ror    x3, 25
    ; x[0]^=x[1]^x[3];x[2]^=x[3]^(x[1]<<7);
    xor    x0, x1
    xor    x0, x3
    xor    x2, x3
    mov    x4, x1
    shl    x4, 7
    xor    x2, x4
    ; x[0]=R(x[0],27);x[2]=R(x[2],10);
    ror    x0, 27
    ror    x2, 10
    stosd
    xchg   x1, eax
    stosd
    xchg   x2, eax
    stosd
    xchg   x3, eax
    stosd
    popad
    jmp    sx_main
    
sx_end:
    popad
    popad
    popad
    ret

    ; edx = idx
    ; edi = x
sbox:
    pushad
    call   init_sbox    
; sbox
  db 083h, 01Fh, 06Ah, 0B5h, 0DEh, 024h, 007h, 0C9h
  db 0CFh, 072h, 009h, 0A5h, 0B1h, 08Eh, 0D6h, 043h
  db 068h, 097h, 0C3h, 0FAh, 01Dh, 04Eh, 0B0h, 025h
  db 0F0h, 08Bh, 09Ch, 036h, 01Dh, 042h, 07Ah, 0E5h
  db 0F1h, 038h, 00Ch, 06Bh, 052h, 0A4h, 0E9h, 0D7h
  db 05Fh, 0B2h, 0A4h, 0C9h, 030h, 08Eh, 06Dh, 017h
  db 027h, 05Ch, 048h, 0B6h, 09Eh, 0F1h, 03Dh, 00Ah
  db 0D1h, 00Fh, 08Eh, 0B2h, 047h, 0ACh, 039h, 065h  
init_sbox:
    pop    esi               ; esi=sbox
    and    edx, 7            ; %= 8
    lea    esi, [esi+8*edx]  ; esi = &sbox[i*8]
    mov    edx, edi
    call   ld_perm
    
; void permute (out, in);
; edi = out
; esi = in
; CF = type
permute:
    pushad
    xchg   eax, ecx    ; ecx should be zero
    push   16
    pop    ecx
    pushad
    rep    stosb
    popad
    cdq            ; m=0
    jnc    do_fp
    ; initial permutation
ip_m_l:
    mov    eax, edx
    and    eax, 3
    shr    dword[esi+4*eax], 1
    rcr    byte[edi], 1
    inc    edx
    test   dl, 7
    jne    ip_m_l
    inc    edi
    loop   ip_m_l
xit_perm:
    popad
    ret
    ; final permutation
do_fp:
    mov    cl, 4    ; n=4
fp_m_l:
    mov    eax, edx
    and    eax, 3
    shr    dword[esi], 1
    rcr    dword[edi+4*eax], 1
    inc    edx
    test   dl, 31
    jne    fp_m_l
    lodsd
    loop   fp_m_l
    popad
    ret
    
ld_perm:
    pop    ebp
    
    pushad          ; create 2 16-byte blocks
    mov    ebx, esp          ; ebx=sb
    mov    edi, esp          ; edi=sb
    mov    cl, 8             ; SERPENT_BLK_LEN/2
sb_l1:
    lodsb                    ; t = sbp[i/2];
    aam    16
    stosw
    loop   sb_l1
    
    ; permute (&tmp_blk, blk, SERPENT_IP);
    mov    esi, edx
    stc
    call   ebp ;permute
    mov    esi, edi
    mov    cl, 16
    push   esi
sb_l2:
    lodsb
    aam    16
    xchg   ah, al
    xlatb
    xchg   ah, al
    xlatb
    aad    16 
    stosb
    loop   sb_l2
    
    pop    esi
    mov    edi, edx
    ; permute (blk, &tmp_blk, SERPENT_FP);
    call   ebp
    popad
    popad
    ret

Sources here.

This entry was posted in assembly, cryptography, encryption, programming and tagged anderson, assembly, biham, knudsen, serpent, x86. Bookmark the permalink.

9 Responses to Serpent Block Cipher

yanapermana says:

November 23, 2015 at 3:35 pm

Nice asm…
How many hours you spent for writing this code, brother?

LikeLike

Reply
- Odzhan says:
  
  November 23, 2015 at 3:54 pm
  
  Hello and Thank you. I think more time was spent optimizing the C code and analyzing output of different compilers to see what worked best before writing ASM.
  
  I’d spend time thinking of how to get everything smaller in C first so I honestly couldn’t say how long it took. Most of the work was done 6+ months ago and I only wrote/tested ASM yesterday in few hours maybe.
  
  It’s usually just a gradual process but writing the ASM didn’t take long once the C code was clean. It can still be improved but I wanted to draw a line under it.
  
  LikeLike
  
  Reply
Peter Ferrie says:

November 24, 2015 at 1:32 am

Some suggestions for shrinking the code further:

mov ebx, [esi+4*eax]
shr dword[esi+4*eax], 1
and ebx, 1
shl ebx, 7
shr byte[edi+ecx], 1
or byte[edi+ecx], bl

can become

shr dword[esi+4*eax], 1
rcr byte[edi+ecx], 1
—

mov ebx, [esi+4*ecx]
shr dword[esi+4*ecx], 1
and ebx, 1
shl ebx, 31
…
shr dword[edi+4*eax], 1
or dword[edi+4*eax], ebx

can become

shr dword[esi+4*ecx], 1
…
rcr dword[edi+4*eax], 1
—

lodsb ; t = sbp[i/2];
mov dl, al ; sb.v8[i+0] = HI_NIBBLE(t);
shr al, 4
stosb
xchg eax, edx
and al, 15

can become

lodsb
aam 16
xchg ah, al
stosb
xchg ah, al
stosb
—

lodsb ; t = tmp_blk.v8[i];
mov dl, al
shr al, 4 ; sb.v8[HI_NIBBLE(t)] << 4)
xlatb
shl al, 4
xchg eax, edx
and al, 15 ; sb.v8[LO_NIBBLE(t)]
xlatb
or al, dl

can become

lodsb
aam 16
xchg ah, al
xlatb
xchg ah, al
xlatb
aad 16
—

pushfd
rep stosb
popfd

can become
rep stosb

because flags are not affected here.
I will investigate further when I have more time…

LikeLiked by 1 person

Reply
- Odzhan says:
  
  November 24, 2015 at 3:05 am
  
  Peter, those are awesome suggestions and I’ve updated to reflect. I also changed use of carry flag. STC is used for encryption. Just makes more sense CF=1 for that.
  
  Funnily enough, I was thinking of using RCR when writing those initial/final permutation functions but not to the extent you suggested. Impressed 🙂 Thanks a lot for those.
  
  Probably the main area of improvement could be in set key function. I still don’t like it but for now, it works okay. The other thing I thought about was possibility of updating the key which is stored in ESI during call to blkxor and i was going to use CLD/STD to move it up or down but I keep thinking it’ll eventually end up with more code than just ADD/SUB so I probably will just leave it for now 🙂
  
  LikeLike
  
  Reply
  - Peter Ferrie says:
    
    November 24, 2015 at 10:16 pm
    
    Here’s my attempt to improve it further:
    
    pushad
    rep stosb
    popad
    
    can become
    
    push edi
    rep stosb
    pop edi
    —
    
    lea esi, [sbox] ; encryption sbox
    lea eax, [sbox_inv] ; decryption sbox
    
    should be
    
    mov esi, offset sbox ; encryption sbox
    mov eax, offset sbox_inv ; decryption sbox
    
    for smaller code
    —
    
    xchg ah, al
    ; sb.v8[i+0] = HI_NIBBLE(t);
    stosb
    xchg ah, al
    ; sb.v8[i+1] = LO_NIBBLE(t);
    stosb
    
    can become
    
    xchg ah, al
    ; sb.v8[i+0] = HI_NIBBLE(t);
    ; sb.v8[i+1] = LO_NIBBLE(t);
    stosw
    —
    
    mov eax, esp ; save in eax before saving blk in edi
    …
    xchg eax, edi
    
    can become
    
    mov ebx, esp
    …
    mov edi, ebx
    
    and no need to load ebx later
    —
    
    xor eax, eax
    …
    setb al
    …
    test eax, eax
    jz skey_il ; skip padding
    or byte [esi+ebx], al
    
    can become
    
    setb al
    …
    or byte [esi+ebx], al
    
    no need to zero eax, and the test isn’t needed, since the “or” is harmless if al is zero.
    —
    
    lea ebx, [edx+4*ecx]
    
    should move above skey_jl for better performance
    —
    
    mov edx, ecx ; ecx=i
    neg edx ;
    add edx, 3 ; i – 3
    
    can become
    
    push 3
    pop edx
    sub edx, ecx ; 3 – i
    —
    
    push 16
    pop ecx
    pushad
    rep movsb
    popad
    shl ecx, 1 ; *= 2 for default SERPENT_ROUNDS
    mov esi, ecx ; esi = 16
    shl esi, 4
    
    can become
    
    push 16
    pop ecx
    rep movsb
    mov cl, 32 ; *= 2 for default SERPENT_ROUNDS
    imul esi, ecx, 16 ; esi = rounds*16
    
    LikeLiked by 1 person
Pingback: Asmcodes: Twofish-256 | Odzhan
PELock says:

February 2, 2016 at 8:27 pm

1. Here:

; j++
inc edx
cmp edx, 4
jne skey_l2

The j counter is set to 0 and is checked against 4, use cmp dl,4 it will do the job and you’re it’s byte smaller, use the same strategy for any loops that starts with 0 and doesn’t exceed 0xFF 😉

2. Also I’ve noticed this is used twice:

…
add esp, 32
popad
ret

put a global label on that and jump from the other code to this label

common_return_epilog:
add esp, 32
popad
ret

LikeLiked by 1 person

Reply
- Odzhan says:
  
  February 2, 2016 at 8:46 pm
  
  Hello, perhaps it depends on assembler but the size of code remains same.
  
  /* 01FB */ “x42” /* inc edx */
  /* 01FC */ “x83xfax04” /* cmp edx, 0x4 */
  /* 01FF */ “x75xd5” /* jnz 0x1d6 */
  
  /* 01FB */ “x42” /* inc edx */
  /* 01FC */ “x80xfax04” /* cmp dl, 0x4 */
  /* 01FF */ “x75xd5” /* jnz 0x1d6 */
  
  I thought the short jmp could work but unfortunately the distance was over 2 bytes too long 🙂
  
  Having said that, If we can save a couple of bytes in between, which is no doubt possble, it’ll work then so I’ll keep it mind.
  
  Thanks!
  
  LikeLike
  
  Reply
Pingback: Twofish-256 Block cipher | crypto on x86 and ARM