## Ascon Permutation Function

### Introduction

Ascon is an Authenticated Encryption with Associated Data (AEAD) algorithm submitted to the Competition for Authenticated Encryption: Security, Applicability, and Robustness (CAESAR). It was designed by Christoph Dobraunig, Maria Eichlseder, Florian Mendel and Martin Schläffer. Some of these authors are also behind other designs, such as the Gimli permutation function and the Grøstl cryptographic hash algorithm.

### Compact code.

This is taken directly from the implementation in SUPERCOP.

```c
#define R(x,n)(((x)>>(n))|((x)<<(64-(n))))
typedef unsigned long long W;

/*
 * ascon - apply 12 rounds of the Ascon permutation to a 320-bit state.
 *
 * p is read and written as five consecutive W words (p[0]..p[4]); the
 * result replaces the input in place. Rotations go through the R(x,n)
 * macro, whose (64-(n)) term treats W as a 64-bit word.
 */
void ascon(void *p) {
    W *state = (W *)p;
    W a, b, c, d, e;          /* working copies of state words 0..4 */
    W ka, kb, kc, kd, ke;     /* and-not terms of the non-linear layer */
    int round;

    /* pull the state into locals */
    a = state[0];
    b = state[1];
    c = state[2];
    d = state[3];
    e = state[4];

    for (round = 0; round != 12; ++round) {
        /* round constant into word 2: high nibble 0xF-round, low nibble round */
        c ^= ((0xFULL - round) << 4) | round;

        /* non-linear layer: pre-mix ... */
        a ^= e;
        e ^= d;
        c ^= b;

        /* ... and-not terms, all taken from the pre-mixed words ... */
        ka = ~a & b;
        kb = ~b & c;
        kc = ~c & d;
        kd = ~d & e;
        ke = ~e & a;

        /* ... fold each term into the word two positions before its source ... */
        a ^= kb;
        b ^= kc;
        c ^= kd;
        d ^= ke;
        e ^= ka;

        /* ... post-mix, with word 2 complemented */
        b ^= a;
        a ^= e;
        d ^= c;
        c = ~c;

        /* linear diffusion: each word is XORed with two rotations of itself */
        a = a ^ R(a, 19) ^ R(a, 28);
        b = b ^ R(b, 61) ^ R(b, 39);
        c = c ^ R(c, 1)  ^ R(c, 6);
        d = d ^ R(d, 10) ^ R(d, 17);
        e = e ^ R(e, 7)  ^ R(e, 41);
    }

    /* write the permuted state back */
    state[0] = a;
    state[1] = b;
    state[2] = c;
    state[3] = d;
    state[4] = e;
}
```

### AMD64 assembly

```nasm
; -----------------------------------------------
; Ascon Permutation function in AMD64 assembly
;
; size: 254 bytes
;
; global calls use Microsoft x64 fastcall convention
;
; -----------------------------------------------

; Assemble everything below as 64-bit code.
bits 64

; Export the entry point unless BIN is defined.
; NOTE(review): BIN presumably selects a flat-binary build where no
; symbol export is wanted -- confirm against the build scripts.
%ifndef BIN
global ascon
%endif

; The five 64-bit words of the 320-bit state are kept in registers for
; the whole permutation.
; NOTE: x1 shares rdx with `r` below; the routine pushes `r` before it
; loads x1, so the two uses do not overlap in time.
%define x0 rbx
%define x1 rdx
%define x2 rdi
%define x3 rsi
%define x4 rbp

; Scratch registers for the substitution layer. t0 is also the rotation
; temporary in the linear diffusion layer.
%define t0 r8
%define t1 r9
%define t2 r10
%define t3 r11
%define t4 r12

; x = base address used to load/store the state (first argument, rcx)
; r = value the round counter is compared against (second argument, rdx)
; i = round counter
; CAUTION: these are single-token macros. Any bare identifier spelled
; `x`, `r` or `i` later in the file is rewritten to the register name,
; so do not use those spellings for labels or symbols.
%define x rcx
%define r rdx
%define i rax

; -----------------------------------------------------------------
; ascon -- Ascon permutation over a 320-bit state, in place
;
; ABI:    Microsoft x64
; In:     rcx (x) = pointer to the state: five 64-bit words
;         rdx (r) = round count; rounds are numbered from 0 and the loop
;                   ends when the incremented counter equals this value
;                   (a count of 0 is not handled specially)
; Out:    the five words at [rcx] .. [rcx+4*8] are overwritten
; Saved:  rsi, rbx, rdi, rbp, r12 are pushed/popped; rdx is restored to
;         the round count; rcx is never written
; Clobb:  rax, r8-r11, flags
; Stack:  six saved qwords, plus the loop counter while a round runs
;
; With a count of 12 the round constants are those of the C reference
; (i = 0..11).
; NOTE(review): the C reference and the ARM64 routine in this document
; take only the state pointer and hard-code 12 rounds; confirm that
; every caller of this version loads the round count into rdx.
; -----------------------------------------------------------------

; DIFFUSE lane, n1, n2  ->  lane ^= ROTR(lane, n1) ^ ROTR(lane, n2)
; Requires n1 < n2: t0 is rotated by n1, then by a further n2-n1.
%macro  DIFFUSE 3
        mov     t0, %1
        ror     t0, %2
        xor     %1, t0
        ror     t0, %3-%2
        xor     %1, t0
%endmacro

ascon:
        ; preserve the callee-saved registers this routine uses
        push    rsi
        push    rbx
        push    rdi
        push    rbp
        push    r12

        ; park the round count at [rsp]: rdx is about to be reused as x1
        push    r

        ; load the 320-bit state
        mov     x0, [x+0*8]
        mov     x1, [x+1*8]
        mov     x2, [x+2*8]
        mov     x3, [x+3*8]
        mov     x4, [x+4*8]

        xor     i, i                    ; round counter starts at 0
.next_round:
        push    i                       ; counter lives at [rsp] for this round

        ; ---- round constant: x2 ^= ((0xF - i) << 4) | i ----
        ; rax is free as scratch here because i was just pushed
        push    15
        pop     rax
        sub     rax, [rsp]
        shl     rax, 4
        or      rax, [rsp]
        xor     x2, rax

        ; ---- substitution layer ----
        ; x0 ^= x4;  x4 ^= x3;  x2 ^= x1
        xor     x0, x4
        xor     x4, x3
        xor     x2, x1

        ; tN = ~xN & x(N+1 mod 5), every term built from the words above
        mov     t0, x0
        not     t0
        and     t0, x1

        mov     t1, x1
        not     t1
        and     t1, x2

        mov     t2, x2
        not     t2
        and     t2, x3

        mov     t3, x3
        not     t3
        and     t3, x4

        mov     t4, x4
        not     t4
        and     t4, x0

        ; x0 ^= t1;  x1 ^= t2;  x2 ^= t3;  x3 ^= t4;  x4 ^= t0
        xor     x0, t1
        xor     x1, t2
        xor     x2, t3
        xor     x3, t4
        xor     x4, t0

        ; x1 ^= x0;  x0 ^= x4;  x3 ^= x2;  x2 = ~x2
        xor     x1, x0
        xor     x0, x4
        xor     x3, x2
        not     x2

        ; ---- linear diffusion layer ----
        DIFFUSE x0, 19, 28              ; x0 ^= ROTR(x0,19) ^ ROTR(x0,28)
        DIFFUSE x1, 39, 61              ; x1 ^= ROTR(x1,61) ^ ROTR(x1,39)
        DIFFUSE x2, 1, 6                ; x2 ^= ROTR(x2, 1) ^ ROTR(x2, 6)
        DIFFUSE x3, 10, 17              ; x3 ^= ROTR(x3,10) ^ ROTR(x3,17)
        DIFFUSE x4, 7, 41               ; x4 ^= ROTR(x4, 7) ^ ROTR(x4,41)

        ; ---- next round ----
        pop     i
        inc     i
        cmp     i, [rsp]                ; [rsp] is the round count again
        jnz     .next_round

        ; store the 320-bit state
        mov     [x+0*8], x0
        mov     [x+1*8], x1
        mov     [x+2*8], x2
        mov     [x+3*8], x3
        mov     [x+4*8], x4

        pop     r                       ; drop the parked round count

        ; restore callee-saved registers in reverse push order
        pop     r12
        pop     rbp
        pop     rdi
        pop     rbx
        pop     rsi
        ret
```

### ARM64 / AArch64 assembly

```asm
// ASCON in ARM64 assembly
// 192 bytes

.arch armv8-a
.text

.global ascon

// -----------------------------------------------------------------
// ascon -- 12-round Ascon permutation over a 320-bit state, in place
//
// In:     x0 = pointer to the state: five 64-bit words
// Out:    the five words at [x0] .. [x0+32] are overwritten
// Clobb:  x0-x12 and the condition flags; x0 does NOT hold the
//         pointer on return (it holds state word 0)
// Stack:  none used; leaf routine
//
// Register roles inside the routine:
//   x0-x4  state words          x10  state pointer
//   x5-x9  and-not terms t0-t4  x11  round counter i
//   x5     rotation temporary   x12  round constant
// -----------------------------------------------------------------

// diffuse lane, n1, n2  ->  lane ^= ROTR(lane, n1) ^ ROTR(lane, n2)
.macro  diffuse lane, n1, n2
        ror     x5, \lane, #\n1
        eor     x5, x5, \lane, ror #\n2
        eor     \lane, \lane, x5
.endm

ascon:
        mov     x10, x0                 // free x0..x4 for the state words

        // load 320-bit state
        ldp     x0, x1, [x10]
        ldp     x2, x3, [x10, #16]
        ldr     x4, [x10, #32]

        mov     x11, xzr                // i = 0
.Lascon_round:
        // round constant: x2 ^= ((0xF - i) << 4) | i
        mov     x12, #0xF
        sub     x12, x12, x11
        orr     x12, x11, x12, lsl #4
        eor     x2, x2, x12

        // non-linear layer
        // x0 ^= x4;  x4 ^= x3;  x2 ^= x1
        eor     x0, x0, x4
        eor     x4, x4, x3
        eor     x2, x2, x1

        // and-not terms (bic d, n, m computes n & ~m)
        bic     x9, x0, x4              // t4 = x0 & ~x4
        bic     x8, x4, x3              // t3 = x4 & ~x3
        bic     x7, x3, x2              // t2 = x3 & ~x2
        bic     x6, x2, x1              // t1 = x2 & ~x1
        bic     x5, x1, x0              // t0 = x1 & ~x0

        // x0 ^= t1;  x1 ^= t2;  x2 ^= t3;  x3 ^= t4;  x4 ^= t0
        eor     x0, x0, x6
        eor     x1, x1, x7
        eor     x2, x2, x8
        eor     x3, x3, x9
        eor     x4, x4, x5

        // x1 ^= x0;  x0 ^= x4;  x3 ^= x2;  x2 = ~x2
        eor     x1, x1, x0
        eor     x0, x0, x4
        eor     x3, x3, x2
        mvn     x2, x2

        // linear diffusion layer
        diffuse x0, 19, 28              // x0 ^= R(x0,19) ^ R(x0,28)
        diffuse x1, 61, 39              // x1 ^= R(x1,61) ^ R(x1,39)
        diffuse x2, 1, 6                // x2 ^= R(x2,1)  ^ R(x2,6)
        diffuse x3, 10, 17              // x3 ^= R(x3,10) ^ R(x3,17)
        diffuse x4, 7, 41               // x4 ^= R(x4,7)  ^ R(x4,41)

        // i++ and loop until i == 12
        add     x11, x11, #1
        cmp     x11, #12
        b.ne    .Lascon_round

        // save 320-bit state
        stp     x0, x1, [x10]
        stp     x2, x3, [x10, #16]
        str     x4, [x10, #32]
        ret
```

Sources here.

This entry was posted in cryptography, encryption, programming, security. Bookmark the permalink.