Introduction
Serpent is a 128-bit block cipher with support for 128,192 and 256-bit key sizes. It was designed by Eli Biham, Ross Anderson, Lars Knudsen and published in 1998. It was runner up to Rijndael that was selected to become the Advanced Encryption Standard (AES). The authors of Twofish (another AES finalist) stated in a paper The Twofish Team’s Final Comments on AES Selection regard Serpent as having the highest security margin.
Algorithm | Safety factor |
---|---|
MARS | 1.9 |
RC6 | 1.18 |
Rijndael | 1.11/1.33/1.56 |
Serpent | 3.56 |
Twofish | 2.67 |
32-round RC6 | 2.00 |
18-round Rijndael | 2.00 |
24-round Rijndael | 2.67 |
Two of the finalists, MARS and RC6, simply do not work well in certain applications. Any one of the other three algorithms—Rijndael (with the extra rounds), Serpent, or Twofish would make an excellent standard.
Serpent has positioned itself as the conservative choice, while Rijndael (with its original 10 rounds) is clearly the fastest choice. We believe that the results of the straw poll at the Third AES Candidate Conference reflected this dichotomy: those that thought security was paramount chose Serpent, while those more concerned with performance chose Rijndael
The assembly implementation here, was a joint effort between myself and Peter Ferrie.
Compact code
#define R(v,n)(((v)>>(n))|((v)<<(32-(n)))) #define X(a,b)t=a,a=b,b=t #define HI(b)(((b)>>4)&0x0F) #define LO(b)((b)&0x0F) #define F(a,b)for(a=0;a<b;a++) typedef unsigned char B; typedef unsigned int W; void sbox(W*, W); void serpent(void*mk,void*data) { W i,j,s,rk[4],k[8],*x=data; F(i,8)k[i]=((W*)mk)[i]; for(i=0;;) { F(j,4) { rk[j]=R((k[0]^k[3]^k[5]^k[7]^0x9e3779b9UL^(i*4+j)),21); F(s,7)k[s]=k[s+1]; k[7]=rk[j]; } sbox(rk,3-i); x[0]^=rk[0];x[1]^=rk[1]; x[2]^=rk[2];x[3]^=rk[3]; if(i==32)break; sbox(x,i); if(++i!=32) { x[0]=R(x[0],19);x[2]=R(x[2],29); x[1]^=x[0]^x[2];x[3]^=x[2]^(x[0]<<3); x[1]=R(x[1],31);x[3]=R(x[3],25); x[0]^=x[1]^x[3];x[2]^=x[3]^(x[1]<<7); x[0]=R(x[0],27);x[2]=R(x[2],10); } } } void sbox(W *x, W idx) { B s[16],p[16],t,i,j,c; B sbox[8][8] = { { 0x83, 0x1F, 0x6A, 0xB5, 0xDE, 0x24, 0x07, 0xC9 }, { 0xCF, 0x72, 0x09, 0xA5, 0xB1, 0x8E, 0xD6, 0x43 }, { 0x68, 0x97, 0xC3, 0xFA, 0x1D, 0x4E, 0xB0, 0x25 }, { 0xF0, 0x8B, 0x9C, 0x36, 0x1D, 0x42, 0x7A, 0xE5 }, { 0xF1, 0x38, 0x0C, 0x6B, 0x52, 0xA4, 0xE9, 0xD7 }, { 0x5F, 0xB2, 0xA4, 0xC9, 0x30, 0x8E, 0x6D, 0x17 }, { 0x27, 0x5C, 0x48, 0xB6, 0x9E, 0xF1, 0x3D, 0x0A }, { 0xD1, 0x0F, 0x8E, 0xB2, 0x47, 0xAC, 0x39, 0x65 }}; for(i=0;i<16;i+=2) { t=sbox[idx%8][i/2]; s[i+0]=LO(t); s[i+1]=HI(t); } // initial permutation F(i,16) { F(j,8) { c=x[j%4]&1; x[j%4]>>=1; p[i]=(c<<7)|(p[i]>>1); } } F(i,16)p[i]=(s[HI(p[i])]<<4)|s[LO(p[i])]; // final permutation F(i,4) { F(j,32) { c=((W*)p)[i]&1; ((W*)p)[i]>>=1; x[j%4]=(c<<31)|(x[j%4]>>1); } } }
x86 assembly
; ----------------------------------------------- ; Serpent-256 block cipher in x86 assembly (Encryption only) ; ; size: 364 bytes ; ; global calls use cdecl convention ; ; ----------------------------------------------- bits 32 %ifndef BIN global serpent %endif %define x edi %define k esi %define x0 eax %define x1 ebx %define x2 ecx %define x3 edx %define x4 ebp serpent: pushad mov esi, [esp+32+4] ; esi = mk ; allocate 64 bytes of memory pushad pushad ; copy 256-bit key to local memory ; F(i,8)k[i]=((W*)mk)[i]; mov edi, esp push 8 pop ecx rep movsd sx_main: ; create a subkey for this round ; rk[j]=R((k[0]^k[3]^k[5]^k[7]^0x9e3779b9UL^(i*4+j)),21); xor edx, edx ; edx = j mov esi, esp ; esi = k lea edi, [esi+32] ; edi = rk sx_subkey: mov eax, [esi] ; t =k[0] xor eax, [esi+3*4] ; t^=k[3] xor eax, [esi+5*4] ; t^=k[5] xor eax, [esi+7*4] ; t^=k[7] xor eax, 0x9e3779b9 ; t^=0x9e3779b9UL lea ebp, [edx+4*ecx] ; ebp=(j+4*i) xor eax, ebp ; t^=ebp ror eax, 21 ; t = R(t, 21) mov [edi+4*edx], eax ; rk[j]=t ; F(s,7)k[s]=k[s+1]; pushad push esi ; save k cmpsd ; esi = &k[1] pop edi ; edi = &k[0] mov cl, 7 rep movsd ; k[7]=rk[j]; stosd popad ; j++ inc edx cmp dl, 4 jne sx_subkey ; sbox(rk, 3-i); dec edx sub edx, ecx ; 3 - i call sbox mov esi, [esp+96+8] ; esi = data ; linear layer ; x[0]^=rk[0];x[1]^=rk[1]; ; x[2]^=rk[2];x[3]^=rk[3]; pushad mov cl, 4 xor_key: mov eax, [edi] ; eax = rk[i] xor [esi], eax ; x[i]^= eax cmpsd ; advance esi and edi loop xor_key popad ; if(i==32)break; cmp cl, 32 je sx_end ; nonlinear layer ; sbox(x, i); mov edx, ecx mov edi, esi call sbox ; if(++i != 32) { inc ecx cmp cl, 32 je sx_main ; linear layer pushad lodsd xchg x3, eax lodsd xchg x1, eax lodsd xchg x2, eax lodsd xchg x3, eax ; x[0]=R(x[0],19);x[2]=R(x[2],29); ror x0, 19 ror x2, 29 ; x[1]^=x[0]^x[2];x[3]^=x[2]^(x[0]<<3); xor x1, x0 xor x1, x2 xor x3, x2 mov x4, x0 shl x4, 3 xor x3, x4 ; x[1]=R(x[1],31);x[3]=R(x[3],25); ror x1, 31 ror x3, 25 ; x[0]^=x[1]^x[3];x[2]^=x[3]^(x[1]<<7); xor x0, x1 xor x0, x3 xor x2, x3 mov x4, x1 shl x4, 7 xor x2, x4 ; x[0]=R(x[0],27);x[2]=R(x[2],10); ror x0, 27 ror x2, 10 stosd xchg x1, eax stosd xchg x2, eax stosd xchg x3, eax stosd popad jmp sx_main sx_end: popad popad popad ret ; edx = idx ; edi = x sbox: pushad call init_sbox ; sbox db 083h, 01Fh, 06Ah, 0B5h, 0DEh, 024h, 007h, 0C9h db 0CFh, 072h, 009h, 0A5h, 0B1h, 08Eh, 0D6h, 043h db 068h, 097h, 0C3h, 0FAh, 01Dh, 04Eh, 0B0h, 025h db 0F0h, 08Bh, 09Ch, 036h, 01Dh, 042h, 07Ah, 0E5h db 0F1h, 038h, 00Ch, 06Bh, 052h, 0A4h, 0E9h, 0D7h db 05Fh, 0B2h, 0A4h, 0C9h, 030h, 08Eh, 06Dh, 017h db 027h, 05Ch, 048h, 0B6h, 09Eh, 0F1h, 03Dh, 00Ah db 0D1h, 00Fh, 08Eh, 0B2h, 047h, 0ACh, 039h, 065h init_sbox: pop esi ; esi=sbox and edx, 7 ; %= 8 lea esi, [esi+8*edx] ; esi = &sbox[i*8] mov edx, edi call ld_perm ; void permute (out, in); ; edi = out ; esi = in ; CF = type permute: pushad xchg eax, ecx ; ecx should be zero push 16 pop ecx pushad rep stosb popad cdq ; m=0 jnc do_fp ; initial permutation ip_m_l: mov eax, edx and eax, 3 shr dword[esi+4*eax], 1 rcr byte[edi], 1 inc edx test dl, 7 jne ip_m_l inc edi loop ip_m_l xit_perm: popad ret ; final permutation do_fp: mov cl, 4 ; n=4 fp_m_l: mov eax, edx and eax, 3 shr dword[esi], 1 rcr dword[edi+4*eax], 1 inc edx test dl, 31 jne fp_m_l lodsd loop fp_m_l popad ret ld_perm: pop ebp pushad ; create 2 16-byte blocks mov ebx, esp ; ebx=sb mov edi, esp ; edi=sb mov cl, 8 ; SERPENT_BLK_LEN/2 sb_l1: lodsb ; t = sbp[i/2]; aam 16 stosw loop sb_l1 ; permute (&tmp_blk, blk, SERPENT_IP); mov esi, edx stc call ebp ;permute mov esi, edi mov cl, 16 push esi sb_l2: lodsb aam 16 xchg ah, al xlatb xchg ah, al xlatb aad 16 stosb loop sb_l2 pop esi mov edi, edx ; permute (blk, &tmp_blk, SERPENT_FP); call ebp popad popad ret
Nice asm…
How many hours you spent for writing this code, brother?
LikeLike
Hello and Thank you. I think more time was spent optimizing the C code and analyzing output of different compilers to see what worked best before writing ASM.
I’d spend time thinking of how to get everything smaller in C first so I honestly couldn’t say how long it took. Most of the work was done 6+ months ago and I only wrote/tested ASM yesterday in few hours maybe.
It’s usually just a gradual process but writing the ASM didn’t take long once the C code was clean. It can still be improved but I wanted to draw a line under it.
LikeLike
Some suggestions for shrinking the code further:
mov ebx, [esi+4*eax]
shr dword[esi+4*eax], 1
and ebx, 1
shl ebx, 7
shr byte[edi+ecx], 1
or byte[edi+ecx], bl
can become
shr dword[esi+4*eax], 1
rcr byte[edi+ecx], 1
—
mov ebx, [esi+4*ecx]
shr dword[esi+4*ecx], 1
and ebx, 1
shl ebx, 31
…
shr dword[edi+4*eax], 1
or dword[edi+4*eax], ebx
can become
shr dword[esi+4*ecx], 1
…
rcr dword[edi+4*eax], 1
—
lodsb ; t = sbp[i/2];
mov dl, al ; sb.v8[i+0] = HI_NIBBLE(t);
shr al, 4
stosb
xchg eax, edx
and al, 15
can become
lodsb
aam 16
xchg ah, al
stosb
xchg ah, al
stosb
—
lodsb ; t = tmp_blk.v8[i];
mov dl, al
shr al, 4 ; sb.v8[HI_NIBBLE(t)] << 4)
xlatb
shl al, 4
xchg eax, edx
and al, 15 ; sb.v8[LO_NIBBLE(t)]
xlatb
or al, dl
can become
lodsb
aam 16
xchg ah, al
xlatb
xchg ah, al
xlatb
aad 16
—
pushfd
rep stosb
popfd
can become
rep stosb
because flags are not affected here.
I will investigate further when I have more time…
LikeLiked by 1 person
Peter, those are awesome suggestions and I’ve updated to reflect. I also changed use of carry flag. STC is used for encryption. Just makes more sense CF=1 for that.
Funnily enough, I was thinking of using RCR when writing those initial/final permutation functions but not to the extent you suggested. Impressed 🙂 Thanks a lot for those.
Probably the main area of improvement could be in set key function. I still don’t like it but for now, it works okay. The other thing I thought about was possibility of updating the key which is stored in ESI during call to blkxor and i was going to use CLD/STD to move it up or down but I keep thinking it’ll eventually end up with more code than just ADD/SUB so I probably will just leave it for now 🙂
LikeLike
Here’s my attempt to improve it further:
pushad
rep stosb
popad
can become
push edi
rep stosb
pop edi
—
lea esi, [sbox] ; encryption sbox
lea eax, [sbox_inv] ; decryption sbox
should be
mov esi, offset sbox ; encryption sbox
mov eax, offset sbox_inv ; decryption sbox
for smaller code
—
xchg ah, al
; sb.v8[i+0] = HI_NIBBLE(t);
stosb
xchg ah, al
; sb.v8[i+1] = LO_NIBBLE(t);
stosb
can become
xchg ah, al
; sb.v8[i+0] = HI_NIBBLE(t);
; sb.v8[i+1] = LO_NIBBLE(t);
stosw
—
mov eax, esp ; save in eax before saving blk in edi
…
xchg eax, edi
can become
mov ebx, esp
…
mov edi, ebx
and no need to load ebx later
—
xor eax, eax
…
setb al
…
test eax, eax
jz skey_il ; skip padding
or byte [esi+ebx], al
can become
setb al
…
or byte [esi+ebx], al
no need to zero eax, and the test isn’t needed, since the “or” is harmless if al is zero.
—
lea ebx, [edx+4*ecx]
should move above skey_jl for better performance
—
mov edx, ecx ; ecx=i
neg edx ;
add edx, 3 ; i – 3
can become
push 3
pop edx
sub edx, ecx ; 3 – i
—
push 16
pop ecx
pushad
rep movsb
popad
shl ecx, 1 ; *= 2 for default SERPENT_ROUNDS
mov esi, ecx ; esi = 16
shl esi, 4
can become
push 16
pop ecx
rep movsb
mov cl, 32 ; *= 2 for default SERPENT_ROUNDS
imul esi, ecx, 16 ; esi = rounds*16
LikeLiked by 1 person
Pingback: Asmcodes: Twofish-256 | Odzhan
1. Here:
; j++
inc edx
cmp edx, 4
jne skey_l2
The j counter is set to 0 and is checked against 4, use cmp dl,4 it will do the job and you’re it’s byte smaller, use the same strategy for any loops that starts with 0 and doesn’t exceed 0xFF 😉
2. Also I’ve noticed this is used twice:
…
add esp, 32
popad
ret
put a global label on that and jump from the other code to this label
common_return_epilog:
add esp, 32
popad
ret
LikeLiked by 1 person
Hello, perhaps it depends on assembler but the size of code remains same.
/* 01FB */ “x42” /* inc edx */
/* 01FC */ “x83xfax04” /* cmp edx, 0x4 */
/* 01FF */ “x75xd5” /* jnz 0x1d6 */
/* 01FB */ “x42” /* inc edx */
/* 01FC */ “x80xfax04” /* cmp dl, 0x4 */
/* 01FF */ “x75xd5” /* jnz 0x1d6 */
I thought the short jmp could work but unfortunately the distance was over 2 bytes too long 🙂
Having said that, If we can save a couple of bytes in between, which is no doubt possble, it’ll work then so I’ll keep it mind.
Thanks!
LikeLike
Pingback: Twofish-256 Block cipher | crypto on x86 and ARM