; --------------------------------------------------------------------------- ; Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved. ; ; The redistribution and use of this software (with or without changes) ; is allowed without the payment of fees or royalties provided that: ; ; source code distributions include the above copyright notice, this ; list of conditions and the following disclaimer; ; ; binary distributions include the above copyright notice, this list ; of conditions and the following disclaimer in their documentation. ; ; This software is provided 'as is' with no explicit or implied warranties ; in respect of its operation, including, but not limited to, correctness ; and fitness for purpose. ; --------------------------------------------------------------------------- ; Issue Date: 20/12/2007 ; ; I am grateful to Dag Arne Osvik for many discussions of the techniques that ; can be used to optimise AES assembler code on AMD64/EM64T architectures. ; Some of the techniques used in this implementation are the result of ; suggestions made by him for which I am most grateful. ; An AES implementation for AMD64 processors using the YASM assembler. This ; implemetation provides only encryption, decryption and hence requires key ; scheduling support in C. It uses 8k bytes of tables but its encryption and ; decryption performance is very close to that obtained using large tables. ; It can use either Windows or Gnu/Linux calling conventions, which are as ; follows: ; windows gnu/linux ; ; in_blk rcx rdi ; out_blk rdx rsi ; context (cx) r8 rdx ; ; preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15 ; registers rdi - on both ; ; destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11 ; registers - rdi on both ; ; The default convention is that for windows, the gnu/linux convention being ; used if __GNUC__ is defined. ; ; Define _SEH_ to include support for Win64 structured exception handling ; (this requires YASM version 0.6 or later). ; ; This code provides the standard AES block size (128 bits, 16 bytes) and the ; three standard AES key sizes (128, 192 and 256 bits). It has the same call ; interface as my C implementation. It uses the Microsoft C AMD64 calling ; conventions in which the three parameters are placed in rcx, rdx and r8 ; respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved. ; ; AES_RETURN aes_encrypt(const unsigned char in_blk[], ; unsigned char out_blk[], const aes_encrypt_ctx cx[1]); ; ; AES_RETURN aes_decrypt(const unsigned char in_blk[], ; unsigned char out_blk[], const aes_decrypt_ctx cx[1]); ; ; AES_RETURN aes_encrypt_key(const unsigned char key[], ; const aes_encrypt_ctx cx[1]); ; ; AES_RETURN aes_decrypt_key(const unsigned char key[], ; const aes_decrypt_ctx cx[1]); ; ; AES_RETURN aes_encrypt_key(const unsigned char key[], ; unsigned int len, const aes_decrypt_ctx cx[1]); ; ; AES_RETURN aes_decrypt_key(const unsigned char key[], ; unsigned int len, const aes_decrypt_ctx cx[1]); ; ; where is 128, 102 or 256. In the last two calls the length can be in ; either bits or bytes. ; ; Comment in/out the following lines to obtain the desired subroutines. These ; selections MUST match those in the C header files aes.h and aesopt.h %define USE_INTEL_AES_IF_AVAILABLE %define AES_128 ; define if AES with 128 bit keys is needed %define AES_192 ; define if AES with 192 bit keys is needed %define AES_256 ; define if AES with 256 bit keys is needed %define AES_VAR ; define if a variable key size is needed %define ENCRYPTION ; define if encryption is needed %define DECRYPTION ; define if decryption is needed %ifdef USE_INTEL_AES_IF_AVAILABLE %define aes_ni(x) aes_ %+ x %+ _i %undef AES_REV_DKS %else %define aes_ni(x) aes_ %+ x %define AES_REV_DKS %endif %define LAST_ROUND_TABLES ; define for the faster version using extra tables ; The encryption key schedule has the following in memory layout where N is the ; number of rounds (10, 12 or 14): ; ; lo: | input key (round 0) | ; each round is four 32-bit words ; | encryption round 1 | ; | encryption round 2 | ; .... ; | encryption round N-1 | ; hi: | encryption round N | ; ; The decryption key schedule is normally set up so that it has the same ; layout as above by actually reversing the order of the encryption key ; schedule in memory (this happens when AES_REV_DKS is set): ; ; lo: | decryption round 0 | = | encryption round N | ; | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] ; | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] ; .... .... ; | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] ; hi: | decryption round N | = | input key (round 0) | ; ; with rounds except the first and last modified using inv_mix_column() ; But if AES_REV_DKS is NOT set the order of keys is left as it is for ; encryption so that it has to be accessed in reverse when used for ; decryption (although the inverse mix column modifications are done) ; ; lo: | decryption round 0 | = | input key (round 0) | ; | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] ; | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] ; .... .... ; | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] ; hi: | decryption round N | = | encryption round N | ; ; This layout is faster when the assembler key scheduling provided here ; is used. ; ; The DLL interface must use the _stdcall convention in which the number ; of bytes of parameter space is added after an @ to the sutine's name. ; We must also remove our parameters from the stack before return (see ; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version. ;%define DLL_EXPORT ; End of user defines %ifdef AES_VAR %ifndef AES_128 %define AES_128 %endif %ifndef AES_192 %define AES_192 %endif %ifndef AES_256 %define AES_256 %endif %endif %ifdef AES_VAR %define KS_LENGTH 60 %elifdef AES_256 %define KS_LENGTH 60 %elifdef AES_192 %define KS_LENGTH 52 %else %define KS_LENGTH 44 %endif %define r0 rax %define r1 rdx %define r2 rcx %define r3 rbx %define r4 rsi %define r5 rdi %define r6 rbp %define r7 rsp %define raxd eax %define rdxd edx %define rcxd ecx %define rbxd ebx %define rsid esi %define rdid edi %define rbpd ebp %define rspd esp %define raxb al %define rdxb dl %define rcxb cl %define rbxb bl %define rsib sil %define rdib dil %define rbpb bpl %define rspb spl %define r0h ah %define r1h dh %define r2h ch %define r3h bh %define r0d eax %define r1d edx %define r2d ecx %define r3d ebx ; finite field multiplies by {02}, {04} and {08} %define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) %define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) %define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) ; finite field multiplies required in table generation %define f3(x) (f2(x) ^ x) %define f9(x) (f8(x) ^ x) %define fb(x) (f8(x) ^ f2(x) ^ x) %define fd(x) (f8(x) ^ f4(x) ^ x) %define fe(x) (f8(x) ^ f4(x) ^ f2(x)) ; macro for expanding S-box data %macro enc_vals 1 db %1(0x63),%1(0x7c),%1(0x77),%1(0x7b),%1(0xf2),%1(0x6b),%1(0x6f),%1(0xc5) db %1(0x30),%1(0x01),%1(0x67),%1(0x2b),%1(0xfe),%1(0xd7),%1(0xab),%1(0x76) db %1(0xca),%1(0x82),%1(0xc9),%1(0x7d),%1(0xfa),%1(0x59),%1(0x47),%1(0xf0) db %1(0xad),%1(0xd4),%1(0xa2),%1(0xaf),%1(0x9c),%1(0xa4),%1(0x72),%1(0xc0) db %1(0xb7),%1(0xfd),%1(0x93),%1(0x26),%1(0x36),%1(0x3f),%1(0xf7),%1(0xcc) db %1(0x34),%1(0xa5),%1(0xe5),%1(0xf1),%1(0x71),%1(0xd8),%1(0x31),%1(0x15) db %1(0x04),%1(0xc7),%1(0x23),%1(0xc3),%1(0x18),%1(0x96),%1(0x05),%1(0x9a) db %1(0x07),%1(0x12),%1(0x80),%1(0xe2),%1(0xeb),%1(0x27),%1(0xb2),%1(0x75) db %1(0x09),%1(0x83),%1(0x2c),%1(0x1a),%1(0x1b),%1(0x6e),%1(0x5a),%1(0xa0) db %1(0x52),%1(0x3b),%1(0xd6),%1(0xb3),%1(0x29),%1(0xe3),%1(0x2f),%1(0x84) db %1(0x53),%1(0xd1),%1(0x00),%1(0xed),%1(0x20),%1(0xfc),%1(0xb1),%1(0x5b) db %1(0x6a),%1(0xcb),%1(0xbe),%1(0x39),%1(0x4a),%1(0x4c),%1(0x58),%1(0xcf) db %1(0xd0),%1(0xef),%1(0xaa),%1(0xfb),%1(0x43),%1(0x4d),%1(0x33),%1(0x85) db %1(0x45),%1(0xf9),%1(0x02),%1(0x7f),%1(0x50),%1(0x3c),%1(0x9f),%1(0xa8) db %1(0x51),%1(0xa3),%1(0x40),%1(0x8f),%1(0x92),%1(0x9d),%1(0x38),%1(0xf5) db %1(0xbc),%1(0xb6),%1(0xda),%1(0x21),%1(0x10),%1(0xff),%1(0xf3),%1(0xd2) db %1(0xcd),%1(0x0c),%1(0x13),%1(0xec),%1(0x5f),%1(0x97),%1(0x44),%1(0x17) db %1(0xc4),%1(0xa7),%1(0x7e),%1(0x3d),%1(0x64),%1(0x5d),%1(0x19),%1(0x73) db %1(0x60),%1(0x81),%1(0x4f),%1(0xdc),%1(0x22),%1(0x2a),%1(0x90),%1(0x88) db %1(0x46),%1(0xee),%1(0xb8),%1(0x14),%1(0xde),%1(0x5e),%1(0x0b),%1(0xdb) db %1(0xe0),%1(0x32),%1(0x3a),%1(0x0a),%1(0x49),%1(0x06),%1(0x24),%1(0x5c) db %1(0xc2),%1(0xd3),%1(0xac),%1(0x62),%1(0x91),%1(0x95),%1(0xe4),%1(0x79) db %1(0xe7),%1(0xc8),%1(0x37),%1(0x6d),%1(0x8d),%1(0xd5),%1(0x4e),%1(0xa9) db %1(0x6c),%1(0x56),%1(0xf4),%1(0xea),%1(0x65),%1(0x7a),%1(0xae),%1(0x08) db %1(0xba),%1(0x78),%1(0x25),%1(0x2e),%1(0x1c),%1(0xa6),%1(0xb4),%1(0xc6) db %1(0xe8),%1(0xdd),%1(0x74),%1(0x1f),%1(0x4b),%1(0xbd),%1(0x8b),%1(0x8a) db %1(0x70),%1(0x3e),%1(0xb5),%1(0x66),%1(0x48),%1(0x03),%1(0xf6),%1(0x0e) db %1(0x61),%1(0x35),%1(0x57),%1(0xb9),%1(0x86),%1(0xc1),%1(0x1d),%1(0x9e) db %1(0xe1),%1(0xf8),%1(0x98),%1(0x11),%1(0x69),%1(0xd9),%1(0x8e),%1(0x94) db %1(0x9b),%1(0x1e),%1(0x87),%1(0xe9),%1(0xce),%1(0x55),%1(0x28),%1(0xdf) db %1(0x8c),%1(0xa1),%1(0x89),%1(0x0d),%1(0xbf),%1(0xe6),%1(0x42),%1(0x68) db %1(0x41),%1(0x99),%1(0x2d),%1(0x0f),%1(0xb0),%1(0x54),%1(0xbb),%1(0x16) %endmacro %macro dec_vals 1 db %1(0x52),%1(0x09),%1(0x6a),%1(0xd5),%1(0x30),%1(0x36),%1(0xa5),%1(0x38) db %1(0xbf),%1(0x40),%1(0xa3),%1(0x9e),%1(0x81),%1(0xf3),%1(0xd7),%1(0xfb) db %1(0x7c),%1(0xe3),%1(0x39),%1(0x82),%1(0x9b),%1(0x2f),%1(0xff),%1(0x87) db %1(0x34),%1(0x8e),%1(0x43),%1(0x44),%1(0xc4),%1(0xde),%1(0xe9),%1(0xcb) db %1(0x54),%1(0x7b),%1(0x94),%1(0x32),%1(0xa6),%1(0xc2),%1(0x23),%1(0x3d) db %1(0xee),%1(0x4c),%1(0x95),%1(0x0b),%1(0x42),%1(0xfa),%1(0xc3),%1(0x4e) db %1(0x08),%1(0x2e),%1(0xa1),%1(0x66),%1(0x28),%1(0xd9),%1(0x24),%1(0xb2) db %1(0x76),%1(0x5b),%1(0xa2),%1(0x49),%1(0x6d),%1(0x8b),%1(0xd1),%1(0x25) db %1(0x72),%1(0xf8),%1(0xf6),%1(0x64),%1(0x86),%1(0x68),%1(0x98),%1(0x16) db %1(0xd4),%1(0xa4),%1(0x5c),%1(0xcc),%1(0x5d),%1(0x65),%1(0xb6),%1(0x92) db %1(0x6c),%1(0x70),%1(0x48),%1(0x50),%1(0xfd),%1(0xed),%1(0xb9),%1(0xda) db %1(0x5e),%1(0x15),%1(0x46),%1(0x57),%1(0xa7),%1(0x8d),%1(0x9d),%1(0x84) db %1(0x90),%1(0xd8),%1(0xab),%1(0x00),%1(0x8c),%1(0xbc),%1(0xd3),%1(0x0a) db %1(0xf7),%1(0xe4),%1(0x58),%1(0x05),%1(0xb8),%1(0xb3),%1(0x45),%1(0x06) db %1(0xd0),%1(0x2c),%1(0x1e),%1(0x8f),%1(0xca),%1(0x3f),%1(0x0f),%1(0x02) db %1(0xc1),%1(0xaf),%1(0xbd),%1(0x03),%1(0x01),%1(0x13),%1(0x8a),%1(0x6b) db %1(0x3a),%1(0x91),%1(0x11),%1(0x41),%1(0x4f),%1(0x67),%1(0xdc),%1(0xea) db %1(0x97),%1(0xf2),%1(0xcf),%1(0xce),%1(0xf0),%1(0xb4),%1(0xe6),%1(0x73) db %1(0x96),%1(0xac),%1(0x74),%1(0x22),%1(0xe7),%1(0xad),%1(0x35),%1(0x85) db %1(0xe2),%1(0xf9),%1(0x37),%1(0xe8),%1(0x1c),%1(0x75),%1(0xdf),%1(0x6e) db %1(0x47),%1(0xf1),%1(0x1a),%1(0x71),%1(0x1d),%1(0x29),%1(0xc5),%1(0x89) db %1(0x6f),%1(0xb7),%1(0x62),%1(0x0e),%1(0xaa),%1(0x18),%1(0xbe),%1(0x1b) db %1(0xfc),%1(0x56),%1(0x3e),%1(0x4b),%1(0xc6),%1(0xd2),%1(0x79),%1(0x20) db %1(0x9a),%1(0xdb),%1(0xc0),%1(0xfe),%1(0x78),%1(0xcd),%1(0x5a),%1(0xf4) db %1(0x1f),%1(0xdd),%1(0xa8),%1(0x33),%1(0x88),%1(0x07),%1(0xc7),%1(0x31) db %1(0xb1),%1(0x12),%1(0x10),%1(0x59),%1(0x27),%1(0x80),%1(0xec),%1(0x5f) db %1(0x60),%1(0x51),%1(0x7f),%1(0xa9),%1(0x19),%1(0xb5),%1(0x4a),%1(0x0d) db %1(0x2d),%1(0xe5),%1(0x7a),%1(0x9f),%1(0x93),%1(0xc9),%1(0x9c),%1(0xef) db %1(0xa0),%1(0xe0),%1(0x3b),%1(0x4d),%1(0xae),%1(0x2a),%1(0xf5),%1(0xb0) db %1(0xc8),%1(0xeb),%1(0xbb),%1(0x3c),%1(0x83),%1(0x53),%1(0x99),%1(0x61) db %1(0x17),%1(0x2b),%1(0x04),%1(0x7e),%1(0xba),%1(0x77),%1(0xd6),%1(0x26) db %1(0xe1),%1(0x69),%1(0x14),%1(0x63),%1(0x55),%1(0x21),%1(0x0c),%1(0x7d) %endmacro %define u8(x) f2(x), x, x, f3(x), f2(x), x, x, f3(x) %define v8(x) fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x %define w8(x) x, 0, 0, 0, x, 0, 0, 0 %define tptr rbp ; table pointer %define kptr r8 ; key schedule pointer %define fofs 128 ; adjust offset in key schedule to keep |disp| < 128 %define fk_ref(x,y) [kptr-16*x+fofs+4*y] %ifdef AES_REV_DKS %define rofs 128 %define ik_ref(x,y) [kptr-16*x+rofs+4*y] %else %define rofs -128 %define ik_ref(x,y) [kptr+16*x+rofs+4*y] %endif %define tab_0(x) [tptr+8*x] %define tab_1(x) [tptr+8*x+3] %define tab_2(x) [tptr+8*x+2] %define tab_3(x) [tptr+8*x+1] %define tab_f(x) byte [tptr+8*x+1] %define tab_i(x) byte [tptr+8*x+7] %define t_ref(x,r) tab_ %+ x(r) %macro ff_rnd 5 ; normal forward round mov %1d, fk_ref(%5,0) mov %2d, fk_ref(%5,1) mov %3d, fk_ref(%5,2) mov %4d, fk_ref(%5,3) movzx esi, al movzx edi, ah shr eax, 16 xor %1d, t_ref(0,rsi) xor %4d, t_ref(1,rdi) movzx esi, al movzx edi, ah xor %3d, t_ref(2,rsi) xor %2d, t_ref(3,rdi) movzx esi, bl movzx edi, bh shr ebx, 16 xor %2d, t_ref(0,rsi) xor %1d, t_ref(1,rdi) movzx esi, bl movzx edi, bh xor %4d, t_ref(2,rsi) xor %3d, t_ref(3,rdi) movzx esi, cl movzx edi, ch shr ecx, 16 xor %3d, t_ref(0,rsi) xor %2d, t_ref(1,rdi) movzx esi, cl movzx edi, ch xor %1d, t_ref(2,rsi) xor %4d, t_ref(3,rdi) movzx esi, dl movzx edi, dh shr edx, 16 xor %4d, t_ref(0,rsi) xor %3d, t_ref(1,rdi) movzx esi, dl movzx edi, dh xor %2d, t_ref(2,rsi) xor %1d, t_ref(3,rdi) mov eax,%1d mov ebx,%2d mov ecx,%3d mov edx,%4d %endmacro %ifdef LAST_ROUND_TABLES %macro fl_rnd 5 ; last forward round add tptr, 2048 mov %1d, fk_ref(%5,0) mov %2d, fk_ref(%5,1) mov %3d, fk_ref(%5,2) mov %4d, fk_ref(%5,3) movzx esi, al movzx edi, ah shr eax, 16 xor %1d, t_ref(0,rsi) xor %4d, t_ref(1,rdi) movzx esi, al movzx edi, ah xor %3d, t_ref(2,rsi) xor %2d, t_ref(3,rdi) movzx esi, bl movzx edi, bh shr ebx, 16 xor %2d, t_ref(0,rsi) xor %1d, t_ref(1,rdi) movzx esi, bl movzx edi, bh xor %4d, t_ref(2,rsi) xor %3d, t_ref(3,rdi) movzx esi, cl movzx edi, ch shr ecx, 16 xor %3d, t_ref(0,rsi) xor %2d, t_ref(1,rdi) movzx esi, cl movzx edi, ch xor %1d, t_ref(2,rsi) xor %4d, t_ref(3,rdi) movzx esi, dl movzx edi, dh shr edx, 16 xor %4d, t_ref(0,rsi) xor %3d, t_ref(1,rdi) movzx esi, dl movzx edi, dh xor %2d, t_ref(2,rsi) xor %1d, t_ref(3,rdi) %endmacro %else %macro fl_rnd 5 ; last forward round mov %1d, fk_ref(%5,0) mov %2d, fk_ref(%5,1) mov %3d, fk_ref(%5,2) mov %4d, fk_ref(%5,3) movzx esi, al movzx edi, ah shr eax, 16 movzx esi, t_ref(f,rsi) movzx edi, t_ref(f,rdi) xor %1d, esi rol edi, 8 xor %4d, edi movzx esi, al movzx edi, ah movzx esi, t_ref(f,rsi) movzx edi, t_ref(f,rdi) rol esi, 16 rol edi, 24 xor %3d, esi xor %2d, edi movzx esi, bl movzx edi, bh shr ebx, 16 movzx esi, t_ref(f,rsi) movzx edi, t_ref(f,rdi) xor %2d, esi rol edi, 8 xor %1d, edi movzx esi, bl movzx edi, bh movzx esi, t_ref(f,rsi) movzx edi, t_ref(f,rdi) rol esi, 16 rol edi, 24 xor %4d, esi xor %3d, edi movzx esi, cl movzx edi, ch movzx esi, t_ref(f,rsi) movzx edi, t_ref(f,rdi) shr ecx, 16 xor %3d, esi rol edi, 8 xor %2d, edi movzx esi, cl movzx edi, ch movzx esi, t_ref(f,rsi) movzx edi, t_ref(f,rdi) rol esi, 16 rol edi, 24 xor %1d, esi xor %4d, edi movzx esi, dl movzx edi, dh movzx esi, t_ref(f,rsi) movzx edi, t_ref(f,rdi) shr edx, 16 xor %4d, esi rol edi, 8 xor %3d, edi movzx esi, dl movzx edi, dh movzx esi, t_ref(f,rsi) movzx edi, t_ref(f,rdi) rol esi, 16 rol edi, 24 xor %2d, esi xor %1d, edi %endmacro %endif %macro ii_rnd 5 ; normal inverse round mov %1d, ik_ref(%5,0) mov %2d, ik_ref(%5,1) mov %3d, ik_ref(%5,2) mov %4d, ik_ref(%5,3) movzx esi, al movzx edi, ah shr eax, 16 xor %1d, t_ref(0,rsi) xor %2d, t_ref(1,rdi) movzx esi, al movzx edi, ah xor %3d, t_ref(2,rsi) xor %4d, t_ref(3,rdi) movzx esi, bl movzx edi, bh shr ebx, 16 xor %2d, t_ref(0,rsi) xor %3d, t_ref(1,rdi) movzx esi, bl movzx edi, bh xor %4d, t_ref(2,rsi) xor %1d, t_ref(3,rdi) movzx esi, cl movzx edi, ch shr ecx, 16 xor %3d, t_ref(0,rsi) xor %4d, t_ref(1,rdi) movzx esi, cl movzx edi, ch xor %1d, t_ref(2,rsi) xor %2d, t_ref(3,rdi) movzx esi, dl movzx edi, dh shr edx, 16 xor %4d, t_ref(0,rsi) xor %1d, t_ref(1,rdi) movzx esi, dl movzx edi, dh xor %2d, t_ref(2,rsi) xor %3d, t_ref(3,rdi) mov eax,%1d mov ebx,%2d mov ecx,%3d mov edx,%4d %endmacro %ifdef LAST_ROUND_TABLES %macro il_rnd 5 ; last inverse round add tptr, 2048 mov %1d, ik_ref(%5,0) mov %2d, ik_ref(%5,1) mov %3d, ik_ref(%5,2) mov %4d, ik_ref(%5,3) movzx esi, al movzx edi, ah shr eax, 16 xor %1d, t_ref(0,rsi) xor %2d, t_ref(1,rdi) movzx esi, al movzx edi, ah xor %3d, t_ref(2,rsi) xor %4d, t_ref(3,rdi) movzx esi, bl movzx edi, bh shr ebx, 16 xor %2d, t_ref(0,rsi) xor %3d, t_ref(1,rdi) movzx esi, bl movzx edi, bh xor %4d, t_ref(2,rsi) xor %1d, t_ref(3,rdi) movzx esi, cl movzx edi, ch shr ecx, 16 xor %3d, t_ref(0,rsi) xor %4d, t_ref(1,rdi) movzx esi, cl movzx edi, ch xor %1d, t_ref(2,rsi) xor %2d, t_ref(3,rdi) movzx esi, dl movzx edi, dh shr edx, 16 xor %4d, t_ref(0,rsi) xor %1d, t_ref(1,rdi) movzx esi, dl movzx edi, dh xor %2d, t_ref(2,rsi) xor %3d, t_ref(3,rdi) %endmacro %else %macro il_rnd 5 ; last inverse round mov %1d, ik_ref(%5,0) mov %2d, ik_ref(%5,1) mov %3d, ik_ref(%5,2) mov %4d, ik_ref(%5,3) movzx esi, al movzx edi, ah movzx esi, t_ref(i,rsi) movzx edi, t_ref(i,rdi) shr eax, 16 xor %1d, esi rol edi, 8 xor %2d, edi movzx esi, al movzx edi, ah movzx esi, t_ref(i,rsi) movzx edi, t_ref(i,rdi) rol esi, 16 rol edi, 24 xor %3d, esi xor %4d, edi movzx esi, bl movzx edi, bh movzx esi, t_ref(i,rsi) movzx edi, t_ref(i,rdi) shr ebx, 16 xor %2d, esi rol edi, 8 xor %3d, edi movzx esi, bl movzx edi, bh movzx esi, t_ref(i,rsi) movzx edi, t_ref(i,rdi) rol esi, 16 rol edi, 24 xor %4d, esi xor %1d, edi movzx esi, cl movzx edi, ch movzx esi, t_ref(i,rsi) movzx edi, t_ref(i,rdi) shr ecx, 16 xor %3d, esi rol edi, 8 xor %4d, edi movzx esi, cl movzx edi, ch movzx esi, t_ref(i,rsi) movzx edi, t_ref(i,rdi) rol esi, 16 rol edi, 24 xor %1d, esi xor %2d, edi movzx esi, dl movzx edi, dh movzx esi, t_ref(i,rsi) movzx edi, t_ref(i,rdi) shr edx, 16 xor %4d, esi rol edi, 8 xor %1d, edi movzx esi, dl movzx edi, dh movzx esi, t_ref(i,rsi) movzx edi, t_ref(i,rdi) rol esi, 16 rol edi, 24 xor %2d, esi xor %3d, edi %endmacro %endif %ifdef ENCRYPTION global aes_ni(encrypt) %ifdef DLL_EXPORT export aes_ni(encrypt) %endif section .data align=64 align 64 enc_tab: enc_vals u8 %ifdef LAST_ROUND_TABLES enc_vals w8 %endif section .text align=16 align 16 %ifdef _SEH_ proc_frame aes_ni(encrypt) alloc_stack 7*8 ; 7 to align stack to 16 bytes save_reg rsi,4*8 save_reg rdi,5*8 save_reg rbx,1*8 save_reg rbp,2*8 save_reg r12,3*8 end_prologue mov rdi, rcx ; input pointer mov [rsp+0*8], rdx ; output pointer %else aes_ni(encrypt): %ifdef __GNUC__ sub rsp, 4*8 ; gnu/linux binary interface mov [rsp+0*8], rsi ; output pointer mov r8, rdx ; context %else sub rsp, 6*8 ; windows binary interface mov [rsp+4*8], rsi mov [rsp+5*8], rdi mov rdi, rcx ; input pointer mov [rsp+0*8], rdx ; output pointer %endif mov [rsp+1*8], rbx ; input pointer in rdi mov [rsp+2*8], rbp ; output pointer in [rsp] mov [rsp+3*8], r12 ; context in r8 %endif movzx esi, byte [kptr+4*KS_LENGTH] lea tptr, [rel enc_tab] sub kptr, fofs mov eax, [rdi+0*4] mov ebx, [rdi+1*4] mov ecx, [rdi+2*4] mov edx, [rdi+3*4] xor eax, [kptr+fofs] xor ebx, [kptr+fofs+4] xor ecx, [kptr+fofs+8] xor edx, [kptr+fofs+12] lea kptr,[kptr+rsi] cmp esi, 10*16 je .3 cmp esi, 12*16 je .2 cmp esi, 14*16 je .1 mov rax, -1 jmp .4 .1: ff_rnd r9, r10, r11, r12, 13 ff_rnd r9, r10, r11, r12, 12 .2: ff_rnd r9, r10, r11, r12, 11 ff_rnd r9, r10, r11, r12, 10 .3: ff_rnd r9, r10, r11, r12, 9 ff_rnd r9, r10, r11, r12, 8 ff_rnd r9, r10, r11, r12, 7 ff_rnd r9, r10, r11, r12, 6 ff_rnd r9, r10, r11, r12, 5 ff_rnd r9, r10, r11, r12, 4 ff_rnd r9, r10, r11, r12, 3 ff_rnd r9, r10, r11, r12, 2 ff_rnd r9, r10, r11, r12, 1 fl_rnd r9, r10, r11, r12, 0 mov rbx, [rsp] mov [rbx], r9d mov [rbx+4], r10d mov [rbx+8], r11d mov [rbx+12], r12d xor rax, rax .4: mov rbx, [rsp+1*8] mov rbp, [rsp+2*8] mov r12, [rsp+3*8] %ifdef __GNUC__ add rsp, 4*8 ret %else mov rsi, [rsp+4*8] mov rdi, [rsp+5*8] %ifdef _SEH_ add rsp, 7*8 ret endproc_frame %else add rsp, 6*8 ret %endif %endif %endif %ifdef DECRYPTION global aes_ni(decrypt) %ifdef DLL_EXPORT export aes_ni(decrypt) %endif section .data align 64 dec_tab: dec_vals v8 %ifdef LAST_ROUND_TABLES dec_vals w8 %endif section .text align 16 %ifdef _SEH_ proc_frame aes_ni(decrypt) alloc_stack 7*8 ; 7 to align stack to 16 bytes save_reg rsi,4*8 save_reg rdi,5*8 save_reg rbx,1*8 save_reg rbp,2*8 save_reg r12,3*8 end_prologue mov rdi, rcx ; input pointer mov [rsp+0*8], rdx ; output pointer %else aes_ni(decrypt): %ifdef __GNUC__ sub rsp, 4*8 ; gnu/linux binary interface mov [rsp+0*8], rsi ; output pointer mov r8, rdx ; context %else sub rsp, 6*8 ; windows binary interface mov [rsp+4*8], rsi mov [rsp+5*8], rdi mov rdi, rcx ; input pointer mov [rsp+0*8], rdx ; output pointer %endif mov [rsp+1*8], rbx ; input pointer in rdi mov [rsp+2*8], rbp ; output pointer in [rsp] mov [rsp+3*8], r12 ; context in r8 %endif movzx esi, byte[kptr+4*KS_LENGTH] lea tptr, [rel dec_tab] sub kptr, rofs mov eax, [rdi+0*4] mov ebx, [rdi+1*4] mov ecx, [rdi+2*4] mov edx, [rdi+3*4] %ifdef AES_REV_DKS mov rdi, kptr lea kptr,[kptr+rsi] %else lea rdi,[kptr+rsi] %endif xor eax, [rdi+rofs] xor ebx, [rdi+rofs+4] xor ecx, [rdi+rofs+8] xor edx, [rdi+rofs+12] cmp esi, 10*16 je .3 cmp esi, 12*16 je .2 cmp esi, 14*16 je .1 mov rax, -1 jmp .4 .1: ii_rnd r9, r10, r11, r12, 13 ii_rnd r9, r10, r11, r12, 12 .2: ii_rnd r9, r10, r11, r12, 11 ii_rnd r9, r10, r11, r12, 10 .3: ii_rnd r9, r10, r11, r12, 9 ii_rnd r9, r10, r11, r12, 8 ii_rnd r9, r10, r11, r12, 7 ii_rnd r9, r10, r11, r12, 6 ii_rnd r9, r10, r11, r12, 5 ii_rnd r9, r10, r11, r12, 4 ii_rnd r9, r10, r11, r12, 3 ii_rnd r9, r10, r11, r12, 2 ii_rnd r9, r10, r11, r12, 1 il_rnd r9, r10, r11, r12, 0 mov rbx, [rsp] mov [rbx], r9d mov [rbx+4], r10d mov [rbx+8], r11d mov [rbx+12], r12d xor rax, rax .4: mov rbx, [rsp+1*8] mov rbp, [rsp+2*8] mov r12, [rsp+3*8] %ifdef __GNUC__ add rsp, 4*8 ret %else mov rsi, [rsp+4*8] mov rdi, [rsp+5*8] %ifdef _SEH_ add rsp, 7*8 ret endproc_frame %else add rsp, 6*8 ret %endif %endif %endif end