Ascon Permutation Function

Introduction

Ascon is an Authenticated Encryption with Associated Data (AEAD) algorithm submitted to the Competition for Authenticated Encryption: Security, Applicability, and Robustness (CAESAR). It was designed by Christoph Dobraunig, Maria Eichlseder, Florian Mendel and Martin Schläffer. Some of these authors are also behind other designs, such as the Gimli permutation function and the Grøstl cryptographic hash algorithm.

Compact code.

This is taken directly from the implementation in SUPERCOP.

#define R(x,n)(((x)>>(n))|((x)<<(64-(n))))
typedef unsigned long long W;

// Ascon permutation: runs 12 rounds over the 320-bit state that p points
// to (five consecutive 64-bit words) and writes the result back in place.
void ascon(void*p) {
    W  *state = (W*)p;
    W   a = state[0], b = state[1], c = state[2], d = state[3], e = state[4];
    int round = 0;

    while (round < 12) {
      W na, nb, nc, nd, ne;

      // round constant is mixed into the middle word
      c ^= ((0xFULL - round) << 4) | round;

      // substitution layer: linear pre-mix ...
      a ^= e;
      e ^= d;
      c ^= b;
      // ... chi step, every output computed from the pre-mixed inputs ...
      na = a ^ (c & ~b);
      nb = b ^ (d & ~c);
      nc = c ^ (e & ~d);
      nd = d ^ (a & ~e);
      ne = e ^ (b & ~a);
      // ... linear post-mix (order matters: nb uses na before na changes)
      nb ^= na;
      na ^= ne;
      nd ^= nc;
      nc  = ~nc;

      // linear diffusion layer: each word is XORed with two rotations of itself
      a = na ^ R(na, 19) ^ R(na, 28);
      b = nb ^ R(nb, 61) ^ R(nb, 39);
      c = nc ^ R(nc,  1) ^ R(nc,  6);
      d = nd ^ R(nd, 10) ^ R(nd, 17);
      e = ne ^ R(ne,  7) ^ R(ne, 41);

      round++;
    }

    // write the permuted state back
    state[0] = a;
    state[1] = b;
    state[2] = c;
    state[3] = d;
    state[4] = e;
}

AMD64 assembly

; -----------------------------------------------
; Ascon permutation function (12 rounds) in AMD64 assembly
;
; C prototype: void ascon(void *p);
;
; ABI:   Microsoft x64 calling convention
; In:    rcx = p, pointer to the 320-bit state (five 64-bit words)
; Out:   state at p is permuted in place; nothing is returned
; Saved: rbx, rbp, rdi, rsi, r12 are pushed on entry and restored on exit
; Clobb: rax, rdx, r8-r11, flags
;
; The round count is the constant ASCON_ROUNDS. It is deliberately NOT
; read from rdx (a second argument): the one-argument prototype above
; leaves rdx undefined at entry. Callers that still pass (p, 12) keep
; working because rdx is simply ignored.
;
; -----------------------------------------------

    bits 64

    %ifndef BIN
      global ascon
    %endif

%define ASCON_ROUNDS 12

; state words
%define x0 rbx
%define x1 rdx
%define x2 rdi
%define x3 rsi
%define x4 rbp

; temporaries
%define t0 r8
%define t1 r9
%define t2 r10
%define t3 r11
%define t4 r12

%define x rcx                       ; state pointer (first argument)
%define i rax                       ; round index, 0 .. ASCON_ROUNDS-1

ascon:
    ; preserve the callee-saved registers used below
    push   rsi
    push   rbx
    push   rdi
    push   rbp
    push   r12

    ; load 320-bit state
    mov    x0, [x+0*8]
    mov    x1, [x+1*8]
    mov    x2, [x+2*8]
    mov    x3, [x+3*8]
    mov    x4, [x+4*8]

    xor    i, i                     ; i = 0
.round:
    ; **************************
    ; addition of round constant
    ; **************************
    ; x2 ^= ((0xfull - i) << 4) | i;
    ; t0 is free here (it is reloaded in the substitution layer), so the
    ; round index can stay in its register instead of being spilled.
    mov    t0, 15
    sub    t0, i
    shl    t0, 4
    or     t0, i
    xor    x2, t0
    ; **********************
    ; substitution layer
    ; **********************
    ; x0 ^= x4;    x4 ^= x3;    x2 ^= x1;
    xor    x0, x4
    xor    x4, x3
    xor    x2, x1
    ; t0  = x0;    t1  = x1;    t2  = x2;    t3  =  x3;    t4  = x4;
    mov    t0, x0
    mov    t1, x1
    mov    t2, x2
    mov    t3, x3
    mov    t4, x4
    ; t0  = ~t0;   t1  = ~t1;   t2  = ~t2;   t3  = ~t3;    t4  = ~t4;
    not    t0
    not    t1
    not    t2
    not    t3
    not    t4
    ; t0 &= x1;    t1 &= x2;    t2 &= x3;    t3 &=  x4;    t4 &= x0;
    and    t0, x1
    and    t1, x2
    and    t2, x3
    and    t3, x4
    and    t4, x0
    ; x0 ^= t1;    x1 ^= t2;    x2 ^= t3;    x3 ^=  t4;    x4 ^= t0;
    xor    x0, t1
    xor    x1, t2
    xor    x2, t3
    xor    x3, t4
    xor    x4, t0
    ; x1 ^= x0;    x0 ^= x4;    x3 ^= x2;    x2  = ~x2;
    xor    x1, x0
    xor    x0, x4
    xor    x3, x2
    not    x2
    ; **********************
    ; linear diffusion layer
    ; **********************
    ; Each word is XORed with two rotations of its pre-update value.
    ; t0 holds the first rotation and is then rotated by the difference
    ; to reach the second one, so the original word is copied only once.

    ; x0 ^= ROTR(x0, 19) ^ ROTR(x0, 28);
    mov    t0, x0
    ror    t0, 19
    xor    x0, t0
    ror    t0, 28-19
    xor    x0, t0

    ; x1 ^= ROTR(x1, 61) ^ ROTR(x1, 39);
    mov    t0, x1
    ror    t0, 39
    xor    x1, t0
    ror    t0, 61-39
    xor    x1, t0

    ; x2 ^= ROTR(x2,  1) ^ ROTR(x2,  6);
    mov    t0, x2
    ror    t0, 1
    xor    x2, t0
    ror    t0, 6-1
    xor    x2, t0

    ; x3 ^= ROTR(x3, 10) ^ ROTR(x3, 17);
    mov    t0, x3
    ror    t0, 10
    xor    x3, t0
    ror    t0, 17-10
    xor    x3, t0

    ; x4 ^= ROTR(x4,  7) ^ ROTR(x4, 41);
    mov    t0, x4
    ror    t0, 7
    xor    x4, t0
    ror    t0, 41-7
    xor    x4, t0

    ; next round
    inc    i
    cmp    i, ASCON_ROUNDS
    jne    .round

    ; store 320-bit state
    mov    [x+0*8], x0
    mov    [x+1*8], x1
    mov    [x+2*8], x2
    mov    [x+3*8], x3
    mov    [x+4*8], x4

    ; restore callee-saved registers in reverse push order
    pop    r12
    pop    rbp
    pop    rdi
    pop    rbx
    pop    rsi
    ret

ARM64 / AArch64 assembly

// Ascon permutation (12 rounds) in ARM64 / AArch64 assembly
// 172 bytes (43 instructions)
//
// C prototype: void ascon(void *p);
// In:   x0 = pointer to the 320-bit state (five 64-bit words)
// Out:  state permuted in place
// Regs: x0-x4  state words (x0 is reused after the pointer is copied)
//       x5-x9  temporaries t0-t4
//       x10    round index i
//       x11    round constant
//       x12    state pointer

    .arch armv8-a
    .text

    .global ascon

ascon:
    // keep the pointer out of the way so x0-x4 can hold the state
    mov    x12, x0
    ldp    x0, x1, [x12]
    ldp    x2, x3, [x12, #16]
    ldr    x4, [x12, #32]

    mov    x10, xzr                     // i = 0
.Lascon_round:
    // round constant: x2 ^= ((0xF - i) << 4) | i
    mov    x11, #0xF
    sub    x11, x11, x10
    orr    x11, x10, x11, lsl #4
    eor    x2, x2, x11

    // substitution layer, linear pre-mix:
    // x0 ^= x4; x4 ^= x3; x2 ^= x1
    eor    x0, x0, x4
    eor    x4, x4, x3
    eor    x2, x2, x1

    // chi terms, all taken before any state word is updated
    bic    x5, x1, x0                   // t0 = x1 & ~x0
    bic    x6, x2, x1                   // t1 = x2 & ~x1
    bic    x7, x3, x2                   // t2 = x3 & ~x2
    bic    x8, x4, x3                   // t3 = x4 & ~x3
    bic    x9, x0, x4                   // t4 = x0 & ~x4

    // x0 ^= t1; x1 ^= t2; x2 ^= t3; x3 ^= t4; x4 ^= t0
    eor    x0, x0, x6
    eor    x1, x1, x7
    eor    x2, x2, x8
    eor    x3, x3, x9
    eor    x4, x4, x5

    // linear post-mix: x1 ^= x0; x0 ^= x4; x3 ^= x2; x2 = ~x2
    eor    x1, x1, x0
    eor    x0, x0, x4
    eor    x3, x3, x2
    mvn    x2, x2

    // linear diffusion layer: xN ^= R(xN, a) ^ R(xN, b)
    // The first eor builds xN ^ R(xN, a) in a scratch register; the
    // second folds in R(xN, b), still reading the unmodified xN.
    eor    x5, x0, x0, ror #19
    eor    x0, x5, x0, ror #28

    eor    x5, x1, x1, ror #61
    eor    x1, x5, x1, ror #39

    eor    x5, x2, x2, ror #1
    eor    x2, x5, x2, ror #6

    eor    x5, x3, x3, ror #10
    eor    x3, x5, x3, ror #17

    eor    x5, x4, x4, ror #7
    eor    x4, x5, x4, ror #41

    // i++ ; loop while i != 12
    add    x10, x10, #1
    cmp    x10, #12
    b.ne   .Lascon_round

    // store 320-bit state
    stp    x0, x1, [x12]
    stp    x2, x3, [x12, #16]
    str    x4, [x12, #32]
    ret

Sources here.

This entry was posted in cryptography, encryption, programming, security. Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s