Serpent Block Cipher

Introduction

Serpent is a 128-bit block cipher with support for 128,192 and 256-bit key sizes. It was designed by Eli Biham, Ross Anderson, Lars Knudsen and published in 1998. It was runner up to Rijndael that was selected to become the Advanced Encryption Standard (AES). The authors of Twofish (another AES finalist) stated in a paper The Twofish Team’s Final Comments on AES Selection regard Serpent as having the highest security margin.

Algorithm Safety factor
MARS 1.9
RC6 1.18
Rijndael 1.11/1.33/1.56
Serpent 3.56
Twofish 2.67
32-round RC6 2.00
18-round Rijndael 2.00
24-round Rijndael 2.67

Two of the finalists, MARS and RC6, simply do not work well in certain applications. Any one of the other three algorithms—Rijndael (with the extra rounds), Serpent, or Twofish would make an excellent standard.

Serpent has positioned itself as the conservative choice, while Rijndael (with its original 10 rounds) is clearly the fastest choice. We believe that the results of the straw poll at the Third AES Candidate Conference reflected this dichotomy: those that thought security was paramount chose Serpent, while those more concerned with performance chose Rijndael

The assembly implementation here, was a joint effort between myself and Peter Ferrie.

Compact code

#define R(v,n)(((v)>>(n))|((v)<<(32-(n))))
#define X(a,b)t=a,a=b,b=t
#define HI(b)(((b)>>4)&0x0F)
#define LO(b)((b)&0x0F)
#define F(a,b)for(a=0;a<b;a++)

typedef unsigned char B;
typedef unsigned int W;
void sbox(W*, W);

void serpent(void*mk,void*data) {
    W i,j,s,rk[4],k[8],*x=data;

    F(i,8)k[i]=((W*)mk)[i];
      
    for(i=0;;) {
      F(j,4) {
        rk[j]=R((k[0]^k[3]^k[5]^k[7]^0x9e3779b9UL^(i*4+j)),21);
        F(s,7)k[s]=k[s+1];
        k[7]=rk[j];
      }
      sbox(rk,3-i);
      
      x[0]^=rk[0];x[1]^=rk[1];
      x[2]^=rk[2];x[3]^=rk[3];
      
      if(i==32)break;
      sbox(x,i);
      
      if(++i!=32) {
        x[0]=R(x[0],19);x[2]=R(x[2],29);
        x[1]^=x[0]^x[2];x[3]^=x[2]^(x[0]<<3);
        x[1]=R(x[1],31);x[3]=R(x[3],25);
        x[0]^=x[1]^x[3];x[2]^=x[3]^(x[1]<<7);
        x[0]=R(x[0],27);x[2]=R(x[2],10);
      }
    }
}

void sbox(W *x, W idx) {
    B s[16],p[16],t,i,j,c;
    
    B sbox[8][8] = 
    { { 0x83, 0x1F, 0x6A, 0xB5, 0xDE, 0x24, 0x07, 0xC9 },
      { 0xCF, 0x72, 0x09, 0xA5, 0xB1, 0x8E, 0xD6, 0x43 },
      { 0x68, 0x97, 0xC3, 0xFA, 0x1D, 0x4E, 0xB0, 0x25 },
      { 0xF0, 0x8B, 0x9C, 0x36, 0x1D, 0x42, 0x7A, 0xE5 },
      { 0xF1, 0x38, 0x0C, 0x6B, 0x52, 0xA4, 0xE9, 0xD7 },
      { 0x5F, 0xB2, 0xA4, 0xC9, 0x30, 0x8E, 0x6D, 0x17 },
      { 0x27, 0x5C, 0x48, 0xB6, 0x9E, 0xF1, 0x3D, 0x0A },
      { 0xD1, 0x0F, 0x8E, 0xB2, 0x47, 0xAC, 0x39, 0x65 }};

    for(i=0;i<16;i+=2) {
      t=sbox[idx%8][i/2];
      s[i+0]=LO(t);
      s[i+1]=HI(t);
    }
    
    // initial permutation
    F(i,16) {
      F(j,8) {
        c=x[j%4]&1;
        x[j%4]>>=1;
        p[i]=(c<<7)|(p[i]>>1);
      }
    }
    
    F(i,16)p[i]=(s[HI(p[i])]<<4)|s[LO(p[i])];

    // final permutation
    F(i,4) {
      F(j,32) {
        c=((W*)p)[i]&1;
        ((W*)p)[i]>>=1;
        x[j%4]=(c<<31)|(x[j%4]>>1);
      }
    }
}

x86 assembly

; -----------------------------------------------
; Serpent-256 block cipher in x86 assembly (Encryption only)
;
; size: 364 bytes
;
; global calls use cdecl convention
;
; -----------------------------------------------
    bits 32
    
    %ifndef BIN
      global serpent
    %endif

    %define x  edi
    %define k  esi
    
    %define x0 eax
    %define x1 ebx
    %define x2 ecx
    %define x3 edx
    %define x4 ebp

serpent:
    pushad
    mov    esi, [esp+32+4]    ; esi = mk
    ; allocate 64 bytes of memory
    pushad
    pushad
    ; copy 256-bit key to local memory
    ; F(i,8)k[i]=((W*)mk)[i];
    mov    edi, esp
    push   8
    pop    ecx
    rep    movsd
sx_main:
    ; create a subkey for this round
    ; rk[j]=R((k[0]^k[3]^k[5]^k[7]^0x9e3779b9UL^(i*4+j)),21);
    xor    edx, edx           ; edx = j
    mov    esi, esp           ; esi = k
    lea    edi, [esi+32]      ; edi = rk
sx_subkey:
    mov    eax, [esi]         ; t =k[0]
    xor    eax, [esi+3*4]     ; t^=k[3]
    xor    eax, [esi+5*4]     ; t^=k[5]
    xor    eax, [esi+7*4]     ; t^=k[7]
    xor    eax, 0x9e3779b9    ; t^=0x9e3779b9UL
    lea    ebp, [edx+4*ecx]   ; ebp=(j+4*i)
    xor    eax, ebp           ; t^=ebp
    ror    eax, 21            ; t = R(t, 21)
    mov    [edi+4*edx], eax   ; rk[j]=t
    
    ; F(s,7)k[s]=k[s+1];
    pushad
    push   esi                ; save k
    cmpsd                     ; esi = &k[1]
    pop    edi                ; edi = &k[0]
    mov    cl, 7
    rep    movsd
    ; k[7]=rk[j];
    stosd
    popad
    
    ; j++
    inc    edx
    cmp    dl, 4
    jne    sx_subkey
    
    ; sbox(rk, 3-i);
    dec    edx    
    sub    edx, ecx           ; 3 - i
    call   sbox
    
    mov    esi, [esp+96+8]    ; esi = data
        
    ; linear layer  
    ; x[0]^=rk[0];x[1]^=rk[1];
    ; x[2]^=rk[2];x[3]^=rk[3];
    pushad
    mov    cl, 4
xor_key:
    mov    eax, [edi]         ; eax = rk[i]
    xor    [esi], eax         ; x[i]^= eax
    cmpsd                     ; advance esi and edi
    loop   xor_key
    popad
    
    ; if(i==32)break;
    cmp    cl, 32
    je     sx_end
    
    ; nonlinear layer
    ; sbox(x, i);
    mov    edx, ecx
    mov    edi, esi
    call   sbox
    
    ; if(++i != 32) {
    inc    ecx
    cmp    cl, 32
    je     sx_main
    
    ; linear layer
    pushad
    lodsd
    xchg   x3, eax
    lodsd
    xchg   x1, eax
    lodsd
    xchg   x2, eax
    lodsd
    xchg   x3, eax
    ; x[0]=R(x[0],19);x[2]=R(x[2],29);
    ror    x0, 19
    ror    x2, 29
    ; x[1]^=x[0]^x[2];x[3]^=x[2]^(x[0]<<3);
    xor    x1, x0
    xor    x1, x2
    xor    x3, x2
    mov    x4, x0
    shl    x4, 3
    xor    x3, x4
    ; x[1]=R(x[1],31);x[3]=R(x[3],25);
    ror    x1, 31
    ror    x3, 25
    ; x[0]^=x[1]^x[3];x[2]^=x[3]^(x[1]<<7);
    xor    x0, x1
    xor    x0, x3
    xor    x2, x3
    mov    x4, x1
    shl    x4, 7
    xor    x2, x4
    ; x[0]=R(x[0],27);x[2]=R(x[2],10);
    ror    x0, 27
    ror    x2, 10
    stosd
    xchg   x1, eax
    stosd
    xchg   x2, eax
    stosd
    xchg   x3, eax
    stosd
    popad
    jmp    sx_main
    
sx_end:
    popad
    popad
    popad
    ret

    ; edx = idx
    ; edi = x
sbox:
    pushad
    call   init_sbox    
; sbox
  db 083h, 01Fh, 06Ah, 0B5h, 0DEh, 024h, 007h, 0C9h
  db 0CFh, 072h, 009h, 0A5h, 0B1h, 08Eh, 0D6h, 043h
  db 068h, 097h, 0C3h, 0FAh, 01Dh, 04Eh, 0B0h, 025h
  db 0F0h, 08Bh, 09Ch, 036h, 01Dh, 042h, 07Ah, 0E5h
  db 0F1h, 038h, 00Ch, 06Bh, 052h, 0A4h, 0E9h, 0D7h
  db 05Fh, 0B2h, 0A4h, 0C9h, 030h, 08Eh, 06Dh, 017h
  db 027h, 05Ch, 048h, 0B6h, 09Eh, 0F1h, 03Dh, 00Ah
  db 0D1h, 00Fh, 08Eh, 0B2h, 047h, 0ACh, 039h, 065h  
init_sbox:
    pop    esi               ; esi=sbox
    and    edx, 7            ; %= 8
    lea    esi, [esi+8*edx]  ; esi = &sbox[i*8]
    mov    edx, edi
    call   ld_perm
    
; void permute (out, in);
; edi = out
; esi = in
; CF = type
permute:
    pushad
    xchg   eax, ecx    ; ecx should be zero
    push   16
    pop    ecx
    pushad
    rep    stosb
    popad
    cdq            ; m=0
    jnc    do_fp
    ; initial permutation
ip_m_l:
    mov    eax, edx
    and    eax, 3
    shr    dword[esi+4*eax], 1
    rcr    byte[edi], 1
    inc    edx
    test   dl, 7
    jne    ip_m_l
    inc    edi
    loop   ip_m_l
xit_perm:
    popad
    ret
    ; final permutation
do_fp:
    mov    cl, 4    ; n=4
fp_m_l:
    mov    eax, edx
    and    eax, 3
    shr    dword[esi], 1
    rcr    dword[edi+4*eax], 1
    inc    edx
    test   dl, 31
    jne    fp_m_l
    lodsd
    loop   fp_m_l
    popad
    ret
    
ld_perm:
    pop    ebp
    
    pushad          ; create 2 16-byte blocks
    mov    ebx, esp          ; ebx=sb
    mov    edi, esp          ; edi=sb
    mov    cl, 8             ; SERPENT_BLK_LEN/2
sb_l1:
    lodsb                    ; t = sbp[i/2];
    aam    16
    stosw
    loop   sb_l1
    
    ; permute (&tmp_blk, blk, SERPENT_IP);
    mov    esi, edx
    stc
    call   ebp ;permute
    mov    esi, edi
    mov    cl, 16
    push   esi
sb_l2:
    lodsb
    aam    16
    xchg   ah, al
    xlatb
    xchg   ah, al
    xlatb
    aad    16 
    stosb
    loop   sb_l2
    
    pop    esi
    mov    edi, edx
    ; permute (blk, &tmp_blk, SERPENT_FP);
    call   ebp
    popad
    popad
    ret

Sources here.

This entry was posted in assembly, cryptography, encryption, programming and tagged , , , , , . Bookmark the permalink.

9 Responses to Serpent Block Cipher

  1. yanapermana says:

    Nice asm…
    How many hours you spent for writing this code, brother?

    Like

    • Odzhan says:

      Hello and Thank you. I think more time was spent optimizing the C code and analyzing output of different compilers to see what worked best before writing ASM.

      I’d spend time thinking of how to get everything smaller in C first so I honestly couldn’t say how long it took. Most of the work was done 6+ months ago and I only wrote/tested ASM yesterday in few hours maybe.

      It’s usually just a gradual process but writing the ASM didn’t take long once the C code was clean. It can still be improved but I wanted to draw a line under it.

      Like

  2. Peter Ferrie says:

    Some suggestions for shrinking the code further:

    mov ebx, [esi+4*eax]
    shr dword[esi+4*eax], 1
    and ebx, 1
    shl ebx, 7
    shr byte[edi+ecx], 1
    or byte[edi+ecx], bl

    can become

    shr dword[esi+4*eax], 1
    rcr byte[edi+ecx], 1

    mov ebx, [esi+4*ecx]
    shr dword[esi+4*ecx], 1
    and ebx, 1
    shl ebx, 31

    shr dword[edi+4*eax], 1
    or dword[edi+4*eax], ebx

    can become

    shr dword[esi+4*ecx], 1

    rcr dword[edi+4*eax], 1

    lodsb ; t = sbp[i/2];
    mov dl, al ; sb.v8[i+0] = HI_NIBBLE(t);
    shr al, 4
    stosb
    xchg eax, edx
    and al, 15

    can become

    lodsb
    aam 16
    xchg ah, al
    stosb
    xchg ah, al
    stosb

    lodsb ; t = tmp_blk.v8[i];
    mov dl, al
    shr al, 4 ; sb.v8[HI_NIBBLE(t)] << 4)
    xlatb
    shl al, 4
    xchg eax, edx
    and al, 15 ; sb.v8[LO_NIBBLE(t)]
    xlatb
    or al, dl

    can become

    lodsb
    aam 16
    xchg ah, al
    xlatb
    xchg ah, al
    xlatb
    aad 16

    pushfd
    rep stosb
    popfd

    can become
    rep stosb

    because flags are not affected here.
    I will investigate further when I have more time…

    Liked by 1 person

    • Odzhan says:

      Peter, those are awesome suggestions and I’ve updated to reflect. I also changed use of carry flag. STC is used for encryption. Just makes more sense CF=1 for that.

      Funnily enough, I was thinking of using RCR when writing those initial/final permutation functions but not to the extent you suggested. Impressed 🙂 Thanks a lot for those.

      Probably the main area of improvement could be in set key function. I still don’t like it but for now, it works okay. The other thing I thought about was possibility of updating the key which is stored in ESI during call to blkxor and i was going to use CLD/STD to move it up or down but I keep thinking it’ll eventually end up with more code than just ADD/SUB so I probably will just leave it for now 🙂

      Like

      • Peter Ferrie says:

        Here’s my attempt to improve it further:

        pushad
        rep stosb
        popad

        can become

        push edi
        rep stosb
        pop edi

        lea esi, [sbox] ; encryption sbox
        lea eax, [sbox_inv] ; decryption sbox

        should be

        mov esi, offset sbox ; encryption sbox
        mov eax, offset sbox_inv ; decryption sbox

        for smaller code

        xchg ah, al
        ; sb.v8[i+0] = HI_NIBBLE(t);
        stosb
        xchg ah, al
        ; sb.v8[i+1] = LO_NIBBLE(t);
        stosb

        can become

        xchg ah, al
        ; sb.v8[i+0] = HI_NIBBLE(t);
        ; sb.v8[i+1] = LO_NIBBLE(t);
        stosw

        mov eax, esp ; save in eax before saving blk in edi

        xchg eax, edi

        can become

        mov ebx, esp

        mov edi, ebx

        and no need to load ebx later

        xor eax, eax

        setb al

        test eax, eax
        jz skey_il ; skip padding
        or byte [esi+ebx], al

        can become

        setb al

        or byte [esi+ebx], al

        no need to zero eax, and the test isn’t needed, since the “or” is harmless if al is zero.

        lea ebx, [edx+4*ecx]

        should move above skey_jl for better performance

        mov edx, ecx ; ecx=i
        neg edx ;
        add edx, 3 ; i – 3

        can become

        push 3
        pop edx
        sub edx, ecx ; 3 – i

        push 16
        pop ecx
        pushad
        rep movsb
        popad
        shl ecx, 1 ; *= 2 for default SERPENT_ROUNDS
        mov esi, ecx ; esi = 16
        shl esi, 4

        can become

        push 16
        pop ecx
        rep movsb
        mov cl, 32 ; *= 2 for default SERPENT_ROUNDS
        imul esi, ecx, 16 ; esi = rounds*16

        Liked by 1 person

  3. Pingback: Asmcodes: Twofish-256 | Odzhan

  4. PELock says:

    1. Here:

    ; j++
    inc edx
    cmp edx, 4
    jne skey_l2

    The j counter is set to 0 and is checked against 4, use cmp dl,4 it will do the job and you’re it’s byte smaller, use the same strategy for any loops that starts with 0 and doesn’t exceed 0xFF 😉

    2. Also I’ve noticed this is used twice:


    add esp, 32
    popad
    ret

    put a global label on that and jump from the other code to this label

    common_return_epilog:
    add esp, 32
    popad
    ret

    Liked by 1 person

    • Odzhan says:

      Hello, perhaps it depends on assembler but the size of code remains same.

      /* 01FB */ “x42” /* inc edx */
      /* 01FC */ “x83xfax04” /* cmp edx, 0x4 */
      /* 01FF */ “x75xd5” /* jnz 0x1d6 */

      /* 01FB */ “x42” /* inc edx */
      /* 01FC */ “x80xfax04” /* cmp dl, 0x4 */
      /* 01FF */ “x75xd5” /* jnz 0x1d6 */

      I thought the short jmp could work but unfortunately the distance was over 2 bytes too long 🙂

      Having said that, If we can save a couple of bytes in between, which is no doubt possble, it’ll work then so I’ll keep it mind.

      Thanks!

      Like

  5. Pingback: Twofish-256 Block cipher | crypto on x86 and ARM

Leave a comment