From 8a7565113ab937cc99f8f4c929bde2ee08fc498c Mon Sep 17 00:00:00 2001 From: Nikos Mavrogiannopoulos Date: Tue, 26 Nov 2013 23:19:45 +0100 Subject: [PATCH 1/2] updated auto-generated asm files. This fixes a valgrind complaint when AES-NI is in use. --- .../x86/coff/appro-aes-gcm-x86-64-coff.s | 574 ++++-- lib/accelerated/x86/coff/appro-aes-x86-64-coff.s | 1826 ++++++++++++-------- lib/accelerated/x86/coff/padlock-x86-64-coff.s | 495 ++++++ lib/accelerated/x86/coff/padlock-x86-coff.s | 352 +++- lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s | 515 ++++-- lib/accelerated/x86/elf/appro-aes-x86-64.s | 1609 ++++++++++------- lib/accelerated/x86/elf/padlock-x86-64.s | 462 +++++ lib/accelerated/x86/elf/padlock-x86.s | 575 +++++- .../x86/macosx/appro-aes-gcm-x86-64-macosx.s | 515 ++++-- .../x86/macosx/appro-aes-x86-64-macosx.s | 1609 ++++++++++------- lib/accelerated/x86/macosx/padlock-x86-64-macosx.s | 462 +++++ lib/accelerated/x86/macosx/padlock-x86-macosx.s | 349 +++- 12 files changed, 6978 insertions(+), 2365 deletions(-) diff --git a/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s b/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s index fa449d6..ceb9108 100644 --- a/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s +++ b/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s @@ -717,6 +717,11 @@ gcm_ghash_4bit: .def gcm_init_clmul; .scl 2; .type 32; .endef .p2align 4 gcm_init_clmul: +.L_init_clmul: +.LSEH_begin_gcm_init_clmul: + +.byte 0x48,0x83,0xec,0x18 +.byte 0x0f,0x29,0x34,0x24 movdqu (%rdx),%xmm2 pshufd $78,%xmm2,%xmm2 @@ -735,15 +740,15 @@ gcm_init_clmul: pxor %xmm5,%xmm2 + pshufd $78,%xmm2,%xmm6 movdqa %xmm2,%xmm0 + pxor %xmm2,%xmm6 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 +.byte 102,15,58,68,222,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 @@ -753,44 +758,137 @@ gcm_init_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,0(%rcx) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%rcx) +.byte 102,15,58,15,227,8 + movdqu %xmm4,32(%rcx) + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - movdqu %xmm2,(%rcx) - movdqu %xmm0,16(%rcx) + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm5,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm5,%xmm3 + movdqu %xmm5,48(%rcx) + pxor %xmm0,%xmm4 + movdqu %xmm0,64(%rcx) +.byte 102,15,58,15,227,8 + movdqu %xmm4,80(%rcx) + movaps (%rsp),%xmm6 + leaq 24(%rsp),%rsp +.LSEH_end_gcm_init_clmul: .byte 0xf3,0xc3 .globl gcm_gmult_clmul .def gcm_gmult_clmul; .scl 2; .type 32; .endef .p2align 4 gcm_gmult_clmul: +.L_gmult_clmul: movdqu (%rcx),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 movdqu (%rdx),%xmm2 + movdqu 32(%rdx),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,220,0 @@ -803,194 +901,372 @@ gcm_gmult_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 .byte 102,15,56,0,197 movdqu %xmm0,(%rcx) .byte 0xf3,0xc3 .globl gcm_ghash_clmul .def gcm_ghash_clmul; .scl 2; .type 32; .endef -.p2align 4 +.p2align 5 gcm_ghash_clmul: +.L_ghash_clmul: + leaq -136(%rsp),%rax .LSEH_begin_gcm_ghash_clmul: -.byte 0x48,0x83,0xec,0x58 -.byte 0x0f,0x29,0x34,0x24 -.byte 0x0f,0x29,0x7c,0x24,0x10 -.byte 0x44,0x0f,0x29,0x44,0x24,0x20 -.byte 0x44,0x0f,0x29,0x4c,0x24,0x30 -.byte 0x44,0x0f,0x29,0x54,0x24,0x40 +.byte 0x48,0x8d,0x60,0xe0 +.byte 0x0f,0x29,0x70,0xe0 +.byte 0x0f,0x29,0x78,0xf0 +.byte 0x44,0x0f,0x29,0x00 +.byte 0x44,0x0f,0x29,0x48,0x10 +.byte 0x44,0x0f,0x29,0x50,0x20 +.byte 0x44,0x0f,0x29,0x58,0x30 +.byte 0x44,0x0f,0x29,0x60,0x40 +.byte 0x44,0x0f,0x29,0x68,0x50 +.byte 0x44,0x0f,0x29,0x70,0x60 +.byte 0x44,0x0f,0x29,0x78,0x70 movdqa .Lbswap_mask(%rip),%xmm5 + movq $11547335547999543296,%rax movdqu (%rcx),%xmm0 movdqu (%rdx),%xmm2 + movdqu 32(%rdx),%xmm10 .byte 102,15,56,0,197 subq $16,%r9 jz .Lodd_tail - movdqu 16(%rdx),%xmm8 + movdqu 16(%rdx),%xmm9 + cmpq $48,%r9 + jb .Lskip4x + subq $48,%r9 + movdqu 48(%rdx),%xmm14 + movdqu 64(%rdx),%xmm15 - movdqu (%r8),%xmm3 - movdqu 16(%r8),%xmm6 -.byte 102,15,56,0,221 + movdqu 48(%r8),%xmm6 + movdqu 32(%r8),%xmm11 .byte 102,15,56,0,245 - pxor %xmm3,%xmm0 - movdqa %xmm6,%xmm7 - pshufd $78,%xmm6,%xmm3 - pshufd $78,%xmm2,%xmm4 - pxor %xmm6,%xmm3 - pxor %xmm2,%xmm4 +.byte 102,68,15,56,0,221 + movdqa %xmm6,%xmm8 + pshufd $78,%xmm6,%xmm7 + pxor %xmm6,%xmm7 .byte 102,15,58,68,242,0 -.byte 102,15,58,68,250,17 -.byte 102,15,58,68,220,0 - pxor %xmm6,%xmm3 - pxor %xmm7,%xmm3 +.byte 102,68,15,58,68,194,17 +.byte 102,65,15,58,68,250,0 + + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,217,0 +.byte 102,69,15,58,68,233,17 + xorps %xmm11,%xmm6 +.byte 102,69,15,58,68,226,16 + xorps %xmm13,%xmm8 + movups 80(%rdx),%xmm10 + xorps %xmm12,%xmm7 + + movdqu 16(%r8),%xmm11 + movdqu 0(%r8),%xmm3 +.byte 102,68,15,56,0,221 +.byte 102,15,56,0,221 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm3,%xmm0 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm6 +.byte 102,69,15,58,68,226,0 + xorps %xmm13,%xmm8 + + leaq 64(%r8),%r8 + subq $64,%r9 + jc .Ltail4x + + jmp .Lmod4_loop +.p2align 5 +.Lmod4_loop: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm7 + movdqu 48(%r8),%xmm11 +.byte 102,68,15,56,0,221 +.byte 102,65,15,58,68,207,17 + xorps %xmm6,%xmm0 + movdqu 32(%r8),%xmm6 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 +.byte 102,65,15,58,68,218,16 + xorps %xmm8,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,15,56,0,245 + movups 32(%rdx),%xmm10 +.byte 102,68,15,58,68,218,0 + xorps %xmm7,%xmm3 + movdqa %xmm6,%xmm8 + pshufd $78,%xmm6,%xmm7 + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm7 + pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 + pslldq $8,%xmm3 +.byte 102,68,15,58,68,234,17 + psrldq $8,%xmm4 + pxor %xmm3,%xmm0 + movdqa .L7_mask(%rip),%xmm3 + pxor %xmm4,%xmm1 +.byte 102,72,15,110,224 + + pand %xmm0,%xmm3 +.byte 102,15,56,0,227 +.byte 102,69,15,58,68,226,0 + pxor %xmm0,%xmm4 + psllq $57,%xmm4 + movdqa %xmm4,%xmm3 pslldq $8,%xmm4 - pxor %xmm3,%xmm7 - pxor %xmm4,%xmm6 +.byte 102,65,15,58,68,241,0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqu 0(%r8),%xmm3 + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 +.byte 102,69,15,58,68,193,17 + xorps %xmm11,%xmm6 + movdqu 16(%r8),%xmm11 +.byte 102,68,15,56,0,221 +.byte 102,65,15,58,68,250,16 + xorps %xmm13,%xmm8 + movups 80(%rdx),%xmm10 +.byte 102,15,56,0,221 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + + movdqa %xmm11,%xmm13 + pxor %xmm12,%xmm7 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + psrlq $1,%xmm0 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm6 + pxor %xmm1,%xmm0 + +.byte 102,69,15,58,68,226,0 + xorps %xmm13,%xmm8 + movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm8,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm8,%xmm4 - leaq 32(%r8),%r8 - subq $32,%r9 - jbe .Leven_tail + leaq 64(%r8),%r8 + subq $64,%r9 + jnc .Lmod4_loop + +.Ltail4x: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm7 +.byte 102,65,15,58,68,207,17 + xorps %xmm6,%xmm0 +.byte 102,65,15,58,68,218,16 + xorps %xmm8,%xmm1 + pxor %xmm0,%xmm1 + pxor %xmm7,%xmm3 -.Lmod_loop: -.byte 102,65,15,58,68,192,0 -.byte 102,65,15,58,68,200,17 -.byte 102,15,58,68,220,0 - pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 + pxor %xmm0,%xmm1 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 - movdqu (%r8),%xmm3 - pxor %xmm6,%xmm0 - pxor %xmm7,%xmm1 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + addq $64,%r9 + jz .Ldone + movdqu 32(%rdx),%xmm10 + subq $16,%r9 + jz .Lodd_tail +.Lskip4x: + + + + + + movdqu (%r8),%xmm3 movdqu 16(%r8),%xmm6 .byte 102,15,56,0,221 .byte 102,15,56,0,245 + pxor %xmm3,%xmm0 + + movdqa %xmm6,%xmm8 + pshufd $78,%xmm6,%xmm3 + pxor %xmm6,%xmm3 +.byte 102,15,58,68,242,0 +.byte 102,68,15,58,68,194,17 +.byte 102,65,15,58,68,218,0 + + leaq 32(%r8),%r8 + subq $32,%r9 + jbe .Leven_tail + jmp .Lmod_loop - movdqa %xmm6,%xmm7 - pshufd $78,%xmm6,%xmm9 - pshufd $78,%xmm2,%xmm10 - pxor %xmm6,%xmm9 - pxor %xmm2,%xmm10 +.p2align 5 +.Lmod_loop: + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,65,15,58,68,193,0 +.byte 102,65,15,58,68,201,17 +.byte 102,65,15,58,68,226,16 + + pxor %xmm6,%xmm0 + pxor %xmm8,%xmm1 + movdqu (%r8),%xmm8 +.byte 102,68,15,56,0,197 + movdqu 16(%r8),%xmm6 + + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + pxor %xmm8,%xmm1 + pxor %xmm3,%xmm4 +.byte 102,15,56,0,245 + movdqa %xmm4,%xmm3 + psrldq $8,%xmm3 + pslldq $8,%xmm4 pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm6,%xmm8 + + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 - pxor %xmm3,%xmm0 .byte 102,15,58,68,242,0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + pshufd $78,%xmm8,%xmm3 + pxor %xmm8,%xmm3 -.byte 102,15,58,68,250,17 +.byte 102,68,15,58,68,194,17 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 - -.byte 102,69,15,58,68,202,0 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm8,%xmm4 - pxor %xmm0,%xmm3 - pxor %xmm8,%xmm4 - - pxor %xmm6,%xmm9 - pxor %xmm7,%xmm9 - movdqa %xmm9,%xmm10 - psrldq $8,%xmm9 - pslldq $8,%xmm10 - pxor %xmm9,%xmm7 - pxor %xmm10,%xmm6 +.byte 102,65,15,58,68,218,0 + pxor %xmm1,%xmm0 leaq 32(%r8),%r8 subq $32,%r9 ja .Lmod_loop .Leven_tail: -.byte 102,65,15,58,68,192,0 -.byte 102,65,15,58,68,200,17 -.byte 102,15,58,68,220,0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,65,15,58,68,193,0 +.byte 102,65,15,58,68,201,17 +.byte 102,65,15,58,68,226,16 + + pxor %xmm6,%xmm0 + pxor %xmm8,%xmm1 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 - - movdqa %xmm3,%xmm4 + pxor %xmm3,%xmm4 + movdqa %xmm4,%xmm3 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 - pxor %xmm6,%xmm0 - pxor %xmm7,%xmm1 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 testq %r9,%r9 jnz .Ldone @@ -1000,12 +1276,10 @@ gcm_ghash_clmul: pxor %xmm3,%xmm0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 +.byte 102,65,15,58,68,218,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 @@ -1015,27 +1289,28 @@ gcm_ghash_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 .Ldone: .byte 102,15,56,0,197 movdqu %xmm0,(%rcx) @@ -1044,15 +1319,42 @@ gcm_ghash_clmul: movaps 32(%rsp),%xmm8 movaps 48(%rsp),%xmm9 movaps 64(%rsp),%xmm10 - addq $88,%rsp - .byte 0xf3,0xc3 + movaps 80(%rsp),%xmm11 + movaps 96(%rsp),%xmm12 + movaps 112(%rsp),%xmm13 + movaps 128(%rsp),%xmm14 + movaps 144(%rsp),%xmm15 + leaq 168(%rsp),%rsp .LSEH_end_gcm_ghash_clmul: + .byte 0xf3,0xc3 + +.globl gcm_init_avx +.def gcm_init_avx; .scl 2; .type 32; .endef +.p2align 5 +gcm_init_avx: + jmp .L_init_clmul + +.globl gcm_gmult_avx +.def gcm_gmult_avx; .scl 2; .type 32; .endef +.p2align 5 +gcm_gmult_avx: + jmp .L_gmult_clmul + +.globl gcm_ghash_avx +.def gcm_ghash_avx; .scl 2; .type 32; .endef +.p2align 5 +gcm_ghash_avx: + jmp .L_ghash_clmul .p2align 6 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .L0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: +.long 7,0,7,0 +.L7_mask_poly: +.long 7,0,450,0 .p2align 6 .Lrem_4bit: @@ -1189,10 +1491,13 @@ se_handler: .rva .LSEH_end_gcm_ghash_4bit .rva .LSEH_info_gcm_ghash_4bit +.rva .LSEH_begin_gcm_init_clmul +.rva .LSEH_end_gcm_init_clmul +.rva .LSEH_info_gcm_init_clmul + .rva .LSEH_begin_gcm_ghash_clmul .rva .LSEH_end_gcm_ghash_clmul .rva .LSEH_info_gcm_ghash_clmul - .section .xdata .p2align 3 .LSEH_info_gcm_gmult_4bit: @@ -1203,11 +1508,20 @@ se_handler: .byte 9,0,0,0 .rva se_handler .rva .Lghash_prologue,.Lghash_epilogue +.LSEH_info_gcm_init_clmul: +.byte 0x01,0x08,0x03,0x00 +.byte 0x08,0x68,0x00,0x00 +.byte 0x04,0x22,0x00,0x00 .LSEH_info_gcm_ghash_clmul: -.byte 0x01,0x1f,0x0b,0x00 -.byte 0x1f,0xa8,0x04,0x00 -.byte 0x19,0x98,0x03,0x00 -.byte 0x13,0x88,0x02,0x00 -.byte 0x0d,0x78,0x01,0x00 +.byte 0x01,0x33,0x16,0x00 +.byte 0x33,0xf8,0x09,0x00 +.byte 0x2e,0xe8,0x08,0x00 +.byte 0x29,0xd8,0x07,0x00 +.byte 0x24,0xc8,0x06,0x00 +.byte 0x1f,0xb8,0x05,0x00 +.byte 0x1a,0xa8,0x04,0x00 +.byte 0x15,0x98,0x03,0x00 +.byte 0x10,0x88,0x02,0x00 +.byte 0x0c,0x78,0x01,0x00 .byte 0x08,0x68,0x00,0x00 -.byte 0x04,0xa2,0x00,0x00 +.byte 0x04,0x01,0x15,0x00 diff --git a/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s b/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s index 7bd9665..224a226 100644 --- a/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s +++ b/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s @@ -997,211 +997,423 @@ aesni_ctr32_encrypt_blocks: movq %r9,%rcx movq 40(%rsp),%r8 - leaq -200(%rsp),%rsp - movaps %xmm6,32(%rsp) - movaps %xmm7,48(%rsp) - movaps %xmm8,64(%rsp) - movaps %xmm9,80(%rsp) - movaps %xmm10,96(%rsp) - movaps %xmm11,112(%rsp) - movaps %xmm12,128(%rsp) - movaps %xmm13,144(%rsp) - movaps %xmm14,160(%rsp) - movaps %xmm15,176(%rsp) + leaq (%rsp),%rax + pushq %rbp + subq $288,%rsp + andq $-16,%rsp + movaps %xmm6,-168(%rax) + movaps %xmm7,-152(%rax) + movaps %xmm8,-136(%rax) + movaps %xmm9,-120(%rax) + movaps %xmm10,-104(%rax) + movaps %xmm11,-88(%rax) + movaps %xmm12,-72(%rax) + movaps %xmm13,-56(%rax) + movaps %xmm14,-40(%rax) + movaps %xmm15,-24(%rax) .Lctr32_body: + leaq -8(%rax),%rbp + cmpq $1,%rdx je .Lctr32_one_shortcut - movdqu (%r8),%xmm14 - movdqa .Lbswap_mask(%rip),%xmm15 - xorl %eax,%eax -.byte 102,69,15,58,22,242,3 -.byte 102,68,15,58,34,240,3 + movdqu (%r8),%xmm2 + movdqu (%rcx),%xmm0 + movl 12(%r8),%r8d + pxor %xmm0,%xmm2 + movl 12(%rcx),%r11d + movdqa %xmm2,0(%rsp) + bswapl %r8d + movdqa %xmm2,%xmm3 + movdqa %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm2,64(%rsp) + movdqa %xmm2,80(%rsp) + movdqa %xmm2,96(%rsp) + movdqa %xmm2,112(%rsp) movl 240(%rcx),%eax + + leaq 1(%r8),%r9 + leaq 2(%r8),%r10 + bswapl %r9d bswapl %r10d - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 -.byte 102,69,15,58,34,226,0 - leaq 3(%r10),%r11 -.byte 102,69,15,58,34,235,0 - incl %r10d -.byte 102,69,15,58,34,226,1 - incq %r11 -.byte 102,69,15,58,34,235,1 - incl %r10d -.byte 102,69,15,58,34,226,2 - incq %r11 -.byte 102,69,15,58,34,235,2 - movdqa %xmm12,0(%rsp) -.byte 102,69,15,56,0,231 - movdqa %xmm13,16(%rsp) -.byte 102,69,15,56,0,239 - - pshufd $192,%xmm12,%xmm2 - pshufd $128,%xmm12,%xmm3 - pshufd $64,%xmm12,%xmm4 - cmpq $6,%rdx - jb .Lctr32_tail - shrl $1,%eax - movq %rcx,%r11 - movl %eax,%r10d - subq $6,%rdx - jmp .Lctr32_loop6 + xorl %r11d,%r9d + xorl %r11d,%r10d +.byte 102,65,15,58,34,217,3 + leaq 3(%r8),%r9 + movdqa %xmm3,16(%rsp) +.byte 102,65,15,58,34,226,3 + bswapl %r9d + leaq 4(%r8),%r10 + movdqa %xmm4,32(%rsp) + xorl %r11d,%r9d + bswapl %r10d +.byte 102,65,15,58,34,233,3 + xorl %r11d,%r10d + movdqa %xmm5,48(%rsp) + leaq 5(%r8),%r9 + movl %r10d,64+12(%rsp) + bswapl %r9d + leaq 6(%r8),%r10 + xorl %r11d,%r9d + bswapl %r10d + movl %r9d,80+12(%rsp) + xorl %r11d,%r10d + leaq 7(%r8),%r9 + movl %r10d,96+12(%rsp) + bswapl %r9d + xorl %r11d,%r9d + movl %r9d,112+12(%rsp) -.p2align 4 -.Lctr32_loop6: - pshufd $192,%xmm13,%xmm5 - por %xmm14,%xmm2 - movups (%r11),%xmm0 - pshufd $128,%xmm13,%xmm6 - por %xmm14,%xmm3 - movups 16(%r11),%xmm1 - pshufd $64,%xmm13,%xmm7 - por %xmm14,%xmm4 - por %xmm14,%xmm5 - xorps %xmm0,%xmm2 - por %xmm14,%xmm6 - por %xmm14,%xmm7 + movups 16(%rcx),%xmm1 + movdqa 64(%rsp),%xmm6 + movdqa 80(%rsp),%xmm7 + cmpq $8,%rdx + jb .Lctr32_tail + leaq 128(%rcx),%rcx + subq $8,%rdx + jmp .Lctr32_loop8 - pxor %xmm0,%xmm3 +.p2align 5 +.Lctr32_loop8: + addl $8,%r8d + movdqa 96(%rsp),%xmm8 .byte 102,15,56,220,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 + movl %r8d,%r9d + movdqa 112(%rsp),%xmm9 .byte 102,15,56,220,217 - movdqa .Lincrement32(%rip),%xmm13 - pxor %xmm0,%xmm5 + bswapl %r9d + movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - movdqa 0(%rsp),%xmm12 - pxor %xmm0,%xmm6 + xorl %r11d,%r9d .byte 102,15,56,220,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax + movl %r9d,0+12(%rsp) + leaq 1(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 - jmp .Lctr32_enc_loop6_enter -.p2align 4 -.Lctr32_enc_loop6: +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 48-128(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + bswapl %r9d +.byte 102,15,56,220,224 + xorl %r11d,%r9d +.byte 102,15,56,220,232 + movl %r9d,16+12(%rsp) + leaq 2(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 64-128(%rcx),%xmm0 .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax + bswapl %r9d .byte 102,15,56,220,225 + xorl %r11d,%r9d .byte 102,15,56,220,233 + movl %r9d,32+12(%rsp) + leaq 3(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.Lctr32_enc_loop6_enter: - movups 16(%rcx),%xmm1 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 80-128(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx + bswapl %r9d .byte 102,15,56,220,224 + xorl %r11d,%r9d .byte 102,15,56,220,232 + movl %r9d,48+12(%rsp) + leaq 4(%r8),%r9 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%rcx),%xmm0 - jnz .Lctr32_enc_loop6 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 96-128(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + bswapl %r9d +.byte 102,15,56,220,225 + xorl %r11d,%r9d +.byte 102,15,56,220,233 + movl %r9d,64+12(%rsp) + leaq 5(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 112-128(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + bswapl %r9d +.byte 102,15,56,220,224 + xorl %r11d,%r9d +.byte 102,15,56,220,232 + movl %r9d,80+12(%rsp) + leaq 6(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 128-128(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + bswapl %r9d +.byte 102,15,56,220,225 + xorl %r11d,%r9d +.byte 102,15,56,220,233 + movl %r9d,96+12(%rsp) + leaq 7(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 144-128(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + bswapl %r9d +.byte 102,15,56,220,224 + xorl %r11d,%r9d +.byte 102,15,56,220,232 + movl %r9d,112+12(%rsp) +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 + movdqu 0(%rdi),%xmm10 +.byte 102,68,15,56,220,200 + movups 160-128(%rcx),%xmm0 + + cmpl $11,%eax + jb .Lctr32_enc_done .byte 102,15,56,220,209 - paddd %xmm13,%xmm12 .byte 102,15,56,220,217 - paddd 16(%rsp),%xmm13 .byte 102,15,56,220,225 - movdqa %xmm12,0(%rsp) .byte 102,15,56,220,233 - movdqa %xmm13,16(%rsp) .byte 102,15,56,220,241 -.byte 102,69,15,56,0,231 .byte 102,15,56,220,249 -.byte 102,69,15,56,0,239 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 176-128(%rcx),%xmm1 -.byte 102,15,56,221,208 - movups (%rdi),%xmm8 -.byte 102,15,56,221,216 - movups 16(%rdi),%xmm9 -.byte 102,15,56,221,224 - movups 32(%rdi),%xmm10 -.byte 102,15,56,221,232 - movups 48(%rdi),%xmm11 -.byte 102,15,56,221,240 - movups 64(%rdi),%xmm1 -.byte 102,15,56,221,248 - movups 80(%rdi),%xmm0 - leaq 96(%rdi),%rdi +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 192-128(%rcx),%xmm0 + je .Lctr32_enc_done - xorps %xmm2,%xmm8 - pshufd $192,%xmm12,%xmm2 - xorps %xmm3,%xmm9 - pshufd $128,%xmm12,%xmm3 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - pshufd $64,%xmm12,%xmm4 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - xorps %xmm6,%xmm1 - movups %xmm11,48(%rsi) - xorps %xmm7,%xmm0 - movups %xmm1,64(%rsi) - movups %xmm0,80(%rsi) - leaq 96(%rsi),%rsi - movl %r10d,%eax - subq $6,%rdx - jnc .Lctr32_loop6 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 208-128(%rcx),%xmm1 - addq $6,%rdx +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 224-128(%rcx),%xmm0 + +.Lctr32_enc_done: + movdqu 16(%rdi),%xmm11 + pxor %xmm0,%xmm10 + movdqu 32(%rdi),%xmm12 + pxor %xmm0,%xmm11 + movdqu 48(%rdi),%xmm13 + pxor %xmm0,%xmm12 + movdqu 64(%rdi),%xmm14 + pxor %xmm0,%xmm13 + movdqu 80(%rdi),%xmm15 + pxor %xmm0,%xmm14 +.byte 102,15,56,220,209 + pxor %xmm0,%xmm15 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movdqu 96(%rdi),%xmm1 + +.byte 102,65,15,56,221,210 + pxor %xmm0,%xmm1 + movdqu 112(%rdi),%xmm10 + leaq 128(%rdi),%rdi +.byte 102,65,15,56,221,219 + pxor %xmm0,%xmm10 + movdqa 0(%rsp),%xmm11 +.byte 102,65,15,56,221,228 + movdqa 16(%rsp),%xmm12 +.byte 102,65,15,56,221,237 + movdqa 32(%rsp),%xmm13 +.byte 102,65,15,56,221,246 + movdqa 48(%rsp),%xmm14 +.byte 102,65,15,56,221,255 + movdqa 64(%rsp),%xmm15 +.byte 102,68,15,56,221,193 + movdqa 80(%rsp),%xmm0 +.byte 102,69,15,56,221,202 + movups 16-128(%rcx),%xmm1 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm0,%xmm7 + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + + subq $8,%rdx + jnc .Lctr32_loop8 + + addq $8,%rdx jz .Lctr32_done - movq %r11,%rcx - leal 1(%rax,%rax,1),%eax + leaq -128(%rcx),%rcx .Lctr32_tail: - por %xmm14,%xmm2 - movups (%rdi),%xmm8 - cmpq $2,%rdx - jb .Lctr32_one + leaq 16(%rcx),%rcx + cmpq $4,%rdx + jb .Lctr32_loop3 + je .Lctr32_loop4 - por %xmm14,%xmm3 - movups 16(%rdi),%xmm9 - je .Lctr32_two + movdqa 96(%rsp),%xmm8 + pxor %xmm9,%xmm9 - pshufd $192,%xmm13,%xmm5 - por %xmm14,%xmm4 - movups 32(%rdi),%xmm10 - cmpq $4,%rdx - jb .Lctr32_three + movups 16(%rcx),%xmm0 +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx +.byte 102,15,56,220,217 + shrl $1,%eax +.byte 102,15,56,220,225 + decl %eax +.byte 102,15,56,220,233 + movups (%rdi),%xmm10 +.byte 102,15,56,220,241 + movups 16(%rdi),%xmm11 +.byte 102,15,56,220,249 + movups 32(%rdi),%xmm12 +.byte 102,68,15,56,220,193 + movups 16(%rcx),%xmm1 - pshufd $128,%xmm13,%xmm6 - por %xmm14,%xmm5 - movups 48(%rdi),%xmm11 - je .Lctr32_four + call .Lenc_loop8_enter - por %xmm14,%xmm6 - xorps %xmm7,%xmm7 + movdqu 48(%rdi),%xmm13 + pxor %xmm10,%xmm2 + movdqu 64(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm10,%xmm6 + movdqu %xmm5,48(%rsi) + movdqu %xmm6,64(%rsi) + cmpq $6,%rdx + jb .Lctr32_done - call _aesni_encrypt6 + movups 80(%rdi),%xmm11 + xorps %xmm11,%xmm7 + movups %xmm7,80(%rsi) + je .Lctr32_done - movups 64(%rdi),%xmm1 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - xorps %xmm6,%xmm1 - movups %xmm11,48(%rsi) - movups %xmm1,64(%rsi) + movups 96(%rdi),%xmm12 + xorps %xmm12,%xmm8 + movups %xmm8,96(%rsi) + jmp .Lctr32_done + +.p2align 5 +.Lctr32_loop4: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx),%xmm1 + decl %eax + jnz .Lctr32_loop4 +.byte 102,15,56,221,209 + movups (%rdi),%xmm10 +.byte 102,15,56,221,217 + movups 16(%rdi),%xmm11 +.byte 102,15,56,221,225 + movups 32(%rdi),%xmm12 +.byte 102,15,56,221,233 + movups 48(%rdi),%xmm13 + + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm4,32(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm5,48(%rsi) + jmp .Lctr32_done + +.p2align 5 +.Lctr32_loop3: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx),%xmm1 + decl %eax + jnz .Lctr32_loop3 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 +.byte 102,15,56,221,225 + + movups (%rdi),%xmm10 + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + cmpq $2,%rdx + jb .Lctr32_done + + movups 16(%rdi),%xmm11 + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + je .Lctr32_done + + movups 32(%rdi),%xmm12 + xorps %xmm12,%xmm4 + movups %xmm4,32(%rsi) jmp .Lctr32_done .p2align 4 .Lctr32_one_shortcut: movups (%r8),%xmm2 - movups (%rdi),%xmm8 + movups (%rdi),%xmm10 movl 240(%rcx),%eax -.Lctr32_one: movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -1213,56 +1425,25 @@ aesni_ctr32_encrypt_blocks: leaq 16(%rcx),%rcx jnz .Loop_enc1_7 .byte 102,15,56,221,209 - xorps %xmm2,%xmm8 - movups %xmm8,(%rsi) - jmp .Lctr32_done - -.p2align 4 -.Lctr32_two: - xorps %xmm4,%xmm4 - call _aesni_encrypt3 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - movups %xmm9,16(%rsi) - jmp .Lctr32_done - -.p2align 4 -.Lctr32_three: - call _aesni_encrypt3 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - movups %xmm10,32(%rsi) + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) jmp .Lctr32_done .p2align 4 -.Lctr32_four: - call _aesni_encrypt4 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - movups %xmm11,48(%rsi) - .Lctr32_done: - movaps 32(%rsp),%xmm6 - movaps 48(%rsp),%xmm7 - movaps 64(%rsp),%xmm8 - movaps 80(%rsp),%xmm9 - movaps 96(%rsp),%xmm10 - movaps 112(%rsp),%xmm11 - movaps 128(%rsp),%xmm12 - movaps 144(%rsp),%xmm13 - movaps 160(%rsp),%xmm14 - movaps 176(%rsp),%xmm15 - leaq 200(%rsp),%rsp -.Lctr32_ret: + movaps -160(%rbp),%xmm6 + movaps -144(%rbp),%xmm7 + movaps -128(%rbp),%xmm8 + movaps -112(%rbp),%xmm9 + movaps -96(%rbp),%xmm10 + movaps -80(%rbp),%xmm11 + movaps -64(%rbp),%xmm12 + movaps -48(%rbp),%xmm13 + movaps -32(%rbp),%xmm14 + movaps -16(%rbp),%xmm15 + leaq (%rbp),%rsp + popq %rbp +.Lctr32_epilogue: movq 8(%rsp),%rdi movq 16(%rsp),%rsi .byte 0xf3,0xc3 @@ -1282,18 +1463,22 @@ aesni_xts_encrypt: movq 40(%rsp),%r8 movq 48(%rsp),%r9 - leaq -264(%rsp),%rsp - movaps %xmm6,96(%rsp) - movaps %xmm7,112(%rsp) - movaps %xmm8,128(%rsp) - movaps %xmm9,144(%rsp) - movaps %xmm10,160(%rsp) - movaps %xmm11,176(%rsp) - movaps %xmm12,192(%rsp) - movaps %xmm13,208(%rsp) - movaps %xmm14,224(%rsp) - movaps %xmm15,240(%rsp) + leaq (%rsp),%rax + pushq %rbp + subq $272,%rsp + andq $-16,%rsp + movaps %xmm6,-168(%rax) + movaps %xmm7,-152(%rax) + movaps %xmm8,-136(%rax) + movaps %xmm9,-120(%rax) + movaps %xmm10,-104(%rax) + movaps %xmm11,-88(%rax) + movaps %xmm12,-72(%rax) + movaps %xmm13,-56(%rax) + movaps %xmm14,-40(%rax) + movaps %xmm15,-24(%rax) .Lxts_enc_body: + leaq -8(%rax),%rbp movups (%r9),%xmm15 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -1308,228 +1493,266 @@ aesni_xts_encrypt: leaq 16(%r8),%r8 jnz .Loop_enc1_8 .byte 102,68,15,56,221,249 + movups (%rcx),%xmm0 movq %rcx,%r11 movl %r10d,%eax + shll $4,%r10d movq %rdx,%r9 andq $-16,%rdx + movups 16(%rcx,%r10,1),%xmm1 + movl %eax,%r10d + movdqa .Lxts_magic(%rip),%xmm8 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pshufd $95,%xmm15,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm10 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm11 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm12 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 paddq %xmm15,%xmm15 pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 + pxor %xmm0,%xmm14 pxor %xmm9,%xmm15 + movaps %xmm1,96(%rsp) + subq $96,%rdx jc .Lxts_enc_short shrl $1,%eax - subl $1,%eax + subl $3,%eax + movups 16(%r11),%xmm1 movl %eax,%r10d + leaq .Lxts_magic(%rip),%r8 jmp .Lxts_enc_grandloop -.p2align 4 +.p2align 5 .Lxts_enc_grandloop: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu 0(%rdi),%xmm2 - pand %xmm8,%xmm9 + movdqa %xmm0,%xmm8 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 + movdqu 32(%rdi),%xmm4 pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 +.byte 102,15,56,220,209 + movdqu 48(%rdi),%xmm5 pxor %xmm12,%xmm4 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi +.byte 102,15,56,220,217 + movdqu 64(%rdi),%xmm6 pxor %xmm13,%xmm5 - movups (%r11),%xmm0 +.byte 102,15,56,220,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - - +.byte 102,15,56,220,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 - movups 16(%r11),%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm0,%xmm3 + pxor %xmm9,%xmm10 +.byte 102,15,56,220,241 + pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) -.byte 102,15,56,220,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 +.byte 102,15,56,220,249 + movups 48(%r11),%xmm1 + +.byte 102,15,56,220,208 + pxor %xmm9,%xmm12 movdqa %xmm11,16(%rsp) -.byte 102,15,56,220,217 - pxor %xmm0,%xmm5 +.byte 102,15,56,220,216 + pxor %xmm9,%xmm13 movdqa %xmm12,32(%rsp) -.byte 102,15,56,220,225 - pxor %xmm0,%xmm6 - movdqa %xmm13,48(%rsp) -.byte 102,15,56,220,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax +.byte 102,15,56,220,224 + pxor %xmm9,%xmm14 +.byte 102,15,56,220,232 + pxor %xmm9,%xmm8 movdqa %xmm14,64(%rsp) -.byte 102,15,56,220,241 - movdqa %xmm15,80(%rsp) -.byte 102,15,56,220,249 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - jmp .Lxts_enc_loop6_enter - -.p2align 4 +.byte 102,15,56,220,240 + movdqa %xmm8,80(%rsp) +.byte 102,15,56,220,248 + movups 64(%r11),%xmm0 + leaq 64(%r11),%rcx + pshufd $95,%xmm15,%xmm9 + jmp .Lxts_enc_loop6 +.p2align 5 .Lxts_enc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.Lxts_enc_loop6_enter: movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 movups (%rcx),%xmm0 + decl %eax jnz .Lxts_enc_loop6 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - paddq %xmm15,%xmm15 + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,220,209 - pand %xmm8,%xmm9 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 .byte 102,15,56,220,217 - pcmpgtd %xmm15,%xmm14 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 .byte 102,15,56,220,225 - pxor %xmm9,%xmm15 .byte 102,15,56,220,233 + pxor %xmm14,%xmm15 .byte 102,15,56,220,241 + movaps %xmm10,%xmm11 .byte 102,15,56,220,249 movups 16(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,220,208 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm10 + psrad $31,%xmm14 .byte 102,15,56,220,216 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,220,224 - pxor %xmm9,%xmm15 .byte 102,15,56,220,232 + pxor %xmm14,%xmm15 .byte 102,15,56,220,240 + movaps %xmm11,%xmm12 .byte 102,15,56,220,248 movups 32(%rcx),%xmm0 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm11 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,220,209 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 .byte 102,15,56,220,217 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,220,225 - pxor %xmm9,%xmm15 + movdqa %xmm13,48(%rsp) .byte 102,15,56,220,233 + pxor %xmm14,%xmm15 .byte 102,15,56,220,241 + movaps %xmm12,%xmm13 .byte 102,15,56,220,249 + movups 48(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm12 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 102,15,56,220,216 paddq %xmm15,%xmm15 -.byte 102,15,56,221,208 - pand %xmm8,%xmm9 -.byte 102,15,56,221,216 - pcmpgtd %xmm15,%xmm14 -.byte 102,15,56,221,224 - pxor %xmm9,%xmm15 -.byte 102,15,56,221,232 -.byte 102,15,56,221,240 -.byte 102,15,56,221,248 + pand %xmm8,%xmm14 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + pxor %xmm14,%xmm15 +.byte 102,15,56,220,240 + movaps %xmm13,%xmm14 +.byte 102,15,56,220,248 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm13 + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,220,217 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 + psrad $31,%xmm9 +.byte 102,15,56,221,84,36,0 paddq %xmm15,%xmm15 - xorps 0(%rsp),%xmm2 pand %xmm8,%xmm9 - xorps 16(%rsp),%xmm3 - pcmpgtd %xmm15,%xmm14 +.byte 102,15,56,221,92,36,16 +.byte 102,15,56,221,100,36,32 pxor %xmm9,%xmm15 - - xorps 32(%rsp),%xmm4 - movups %xmm2,0(%rsi) - xorps 48(%rsp),%xmm5 - movups %xmm3,16(%rsi) - xorps 64(%rsp),%xmm6 - movups %xmm4,32(%rsi) - xorps 80(%rsp),%xmm7 - movups %xmm5,48(%rsi) +.byte 102,15,56,221,108,36,48 +.byte 102,15,56,221,116,36,64 +.byte 102,15,56,221,124,36,80 movl %r10d,%eax - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) subq $96,%rdx jnc .Lxts_enc_grandloop - leal 3(%rax,%rax,1),%eax + leal 7(%rax,%rax,1),%eax movq %r11,%rcx movl %eax,%r10d .Lxts_enc_short: + pxor %xmm0,%xmm10 addq $96,%rdx jz .Lxts_enc_done + pxor %xmm0,%xmm11 cmpq $32,%rdx jb .Lxts_enc_one + pxor %xmm0,%xmm12 je .Lxts_enc_two + pxor %xmm0,%xmm13 cmpq $64,%rdx jb .Lxts_enc_three + pxor %xmm0,%xmm14 je .Lxts_enc_four - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu (%rdi),%xmm2 - pand %xmm8,%xmm9 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 movdqu 48(%rdi),%xmm5 @@ -1632,15 +1855,15 @@ aesni_xts_encrypt: call _aesni_encrypt4 - xorps %xmm10,%xmm2 - movdqa %xmm15,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) + pxor %xmm10,%xmm2 + movdqa %xmm14,%xmm10 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) leaq 64(%rsi),%rsi jmp .Lxts_enc_done @@ -1681,17 +1904,18 @@ aesni_xts_encrypt: movups %xmm2,-16(%rsi) .Lxts_enc_ret: - movaps 96(%rsp),%xmm6 - movaps 112(%rsp),%xmm7 - movaps 128(%rsp),%xmm8 - movaps 144(%rsp),%xmm9 - movaps 160(%rsp),%xmm10 - movaps 176(%rsp),%xmm11 - movaps 192(%rsp),%xmm12 - movaps 208(%rsp),%xmm13 - movaps 224(%rsp),%xmm14 - movaps 240(%rsp),%xmm15 - leaq 264(%rsp),%rsp + movaps -160(%rbp),%xmm6 + movaps -144(%rbp),%xmm7 + movaps -128(%rbp),%xmm8 + movaps -112(%rbp),%xmm9 + movaps -96(%rbp),%xmm10 + movaps -80(%rbp),%xmm11 + movaps -64(%rbp),%xmm12 + movaps -48(%rbp),%xmm13 + movaps -32(%rbp),%xmm14 + movaps -16(%rbp),%xmm15 + leaq (%rbp),%rsp + popq %rbp .Lxts_enc_epilogue: movq 8(%rsp),%rdi movq 16(%rsp),%rsi @@ -1712,18 +1936,22 @@ aesni_xts_decrypt: movq 40(%rsp),%r8 movq 48(%rsp),%r9 - leaq -264(%rsp),%rsp - movaps %xmm6,96(%rsp) - movaps %xmm7,112(%rsp) - movaps %xmm8,128(%rsp) - movaps %xmm9,144(%rsp) - movaps %xmm10,160(%rsp) - movaps %xmm11,176(%rsp) - movaps %xmm12,192(%rsp) - movaps %xmm13,208(%rsp) - movaps %xmm14,224(%rsp) - movaps %xmm15,240(%rsp) + leaq (%rsp),%rax + pushq %rbp + subq $272,%rsp + andq $-16,%rsp + movaps %xmm6,-168(%rax) + movaps %xmm7,-152(%rax) + movaps %xmm8,-136(%rax) + movaps %xmm9,-120(%rax) + movaps %xmm10,-104(%rax) + movaps %xmm11,-88(%rax) + movaps %xmm12,-72(%rax) + movaps %xmm13,-56(%rax) + movaps %xmm14,-40(%rax) + movaps %xmm15,-24(%rax) .Lxts_dec_body: + leaq -8(%rax),%rbp movups (%r9),%xmm15 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -1744,228 +1972,266 @@ aesni_xts_decrypt: shlq $4,%rax subq %rax,%rdx + movups (%rcx),%xmm0 movq %rcx,%r11 movl %r10d,%eax + shll $4,%r10d movq %rdx,%r9 andq $-16,%rdx + movups 16(%rcx,%r10,1),%xmm1 + movl %eax,%r10d + movdqa .Lxts_magic(%rip),%xmm8 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pshufd $95,%xmm15,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm10 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm11 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm12 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 paddq %xmm15,%xmm15 pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 + pxor %xmm0,%xmm14 pxor %xmm9,%xmm15 + movaps %xmm1,96(%rsp) + subq $96,%rdx jc .Lxts_dec_short shrl $1,%eax - subl $1,%eax + subl $3,%eax + movups 16(%r11),%xmm1 movl %eax,%r10d + leaq .Lxts_magic(%rip),%r8 jmp .Lxts_dec_grandloop -.p2align 4 +.p2align 5 .Lxts_dec_grandloop: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu 0(%rdi),%xmm2 - pand %xmm8,%xmm9 + movdqa %xmm0,%xmm8 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 + movdqu 32(%rdi),%xmm4 pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 +.byte 102,15,56,222,209 + movdqu 48(%rdi),%xmm5 pxor %xmm12,%xmm4 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi +.byte 102,15,56,222,217 + movdqu 64(%rdi),%xmm6 pxor %xmm13,%xmm5 - movups (%r11),%xmm0 +.byte 102,15,56,222,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - - +.byte 102,15,56,222,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 - movups 16(%r11),%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm0,%xmm3 + pxor %xmm9,%xmm10 +.byte 102,15,56,222,241 + pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) -.byte 102,15,56,222,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 +.byte 102,15,56,222,249 + movups 48(%r11),%xmm1 + +.byte 102,15,56,222,208 + pxor %xmm9,%xmm12 movdqa %xmm11,16(%rsp) -.byte 102,15,56,222,217 - pxor %xmm0,%xmm5 +.byte 102,15,56,222,216 + pxor %xmm9,%xmm13 movdqa %xmm12,32(%rsp) -.byte 102,15,56,222,225 - pxor %xmm0,%xmm6 - movdqa %xmm13,48(%rsp) -.byte 102,15,56,222,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax +.byte 102,15,56,222,224 + pxor %xmm9,%xmm14 +.byte 102,15,56,222,232 + pxor %xmm9,%xmm8 movdqa %xmm14,64(%rsp) -.byte 102,15,56,222,241 - movdqa %xmm15,80(%rsp) -.byte 102,15,56,222,249 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - jmp .Lxts_dec_loop6_enter - -.p2align 4 +.byte 102,15,56,222,240 + movdqa %xmm8,80(%rsp) +.byte 102,15,56,222,248 + movups 64(%r11),%xmm0 + leaq 64(%r11),%rcx + pshufd $95,%xmm15,%xmm9 + jmp .Lxts_dec_loop6 +.p2align 5 .Lxts_dec_loop6: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -.Lxts_dec_loop6_enter: movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 movups (%rcx),%xmm0 + decl %eax jnz .Lxts_dec_loop6 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - paddq %xmm15,%xmm15 + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,222,209 - pand %xmm8,%xmm9 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 .byte 102,15,56,222,217 - pcmpgtd %xmm15,%xmm14 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 .byte 102,15,56,222,225 - pxor %xmm9,%xmm15 .byte 102,15,56,222,233 + pxor %xmm14,%xmm15 .byte 102,15,56,222,241 + movaps %xmm10,%xmm11 .byte 102,15,56,222,249 movups 16(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,222,208 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm10 + psrad $31,%xmm14 .byte 102,15,56,222,216 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,222,224 - pxor %xmm9,%xmm15 .byte 102,15,56,222,232 + pxor %xmm14,%xmm15 .byte 102,15,56,222,240 + movaps %xmm11,%xmm12 .byte 102,15,56,222,248 movups 32(%rcx),%xmm0 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm11 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,222,209 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 .byte 102,15,56,222,217 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,222,225 - pxor %xmm9,%xmm15 + movdqa %xmm13,48(%rsp) .byte 102,15,56,222,233 + pxor %xmm14,%xmm15 .byte 102,15,56,222,241 + movaps %xmm12,%xmm13 .byte 102,15,56,222,249 + movups 48(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm12 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 102,15,56,222,216 paddq %xmm15,%xmm15 -.byte 102,15,56,223,208 - pand %xmm8,%xmm9 -.byte 102,15,56,223,216 - pcmpgtd %xmm15,%xmm14 -.byte 102,15,56,223,224 - pxor %xmm9,%xmm15 -.byte 102,15,56,223,232 -.byte 102,15,56,223,240 -.byte 102,15,56,223,248 + pand %xmm8,%xmm14 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + pxor %xmm14,%xmm15 +.byte 102,15,56,222,240 + movaps %xmm13,%xmm14 +.byte 102,15,56,222,248 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm13 + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,222,217 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 + psrad $31,%xmm9 +.byte 102,15,56,223,84,36,0 paddq %xmm15,%xmm15 - xorps 0(%rsp),%xmm2 pand %xmm8,%xmm9 - xorps 16(%rsp),%xmm3 - pcmpgtd %xmm15,%xmm14 +.byte 102,15,56,223,92,36,16 +.byte 102,15,56,223,100,36,32 pxor %xmm9,%xmm15 - - xorps 32(%rsp),%xmm4 - movups %xmm2,0(%rsi) - xorps 48(%rsp),%xmm5 - movups %xmm3,16(%rsi) - xorps 64(%rsp),%xmm6 - movups %xmm4,32(%rsi) - xorps 80(%rsp),%xmm7 - movups %xmm5,48(%rsi) +.byte 102,15,56,223,108,36,48 +.byte 102,15,56,223,116,36,64 +.byte 102,15,56,223,124,36,80 movl %r10d,%eax - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) subq $96,%rdx jnc .Lxts_dec_grandloop - leal 3(%rax,%rax,1),%eax + leal 7(%rax,%rax,1),%eax movq %r11,%rcx movl %eax,%r10d .Lxts_dec_short: + pxor %xmm0,%xmm10 + pxor %xmm0,%xmm11 addq $96,%rdx jz .Lxts_dec_done + pxor %xmm0,%xmm12 cmpq $32,%rdx jb .Lxts_dec_one + pxor %xmm0,%xmm13 je .Lxts_dec_two + pxor %xmm0,%xmm14 cmpq $64,%rdx jb .Lxts_dec_three je .Lxts_dec_four - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu (%rdi),%xmm2 - pand %xmm8,%xmm9 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 movdqu 48(%rdi),%xmm5 @@ -2058,7 +2324,7 @@ aesni_xts_decrypt: xorps %xmm10,%xmm2 movdqa %xmm13,%xmm10 xorps %xmm11,%xmm3 - movdqa %xmm15,%xmm11 + movdqa %xmm14,%xmm11 xorps %xmm12,%xmm4 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) @@ -2068,14 +2334,8 @@ aesni_xts_decrypt: .p2align 4 .Lxts_dec_four: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movups (%rdi),%xmm2 - pand %xmm8,%xmm9 movups 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movups 32(%rdi),%xmm4 xorps %xmm10,%xmm2 movups 48(%rdi),%xmm5 @@ -2086,16 +2346,16 @@ aesni_xts_decrypt: call _aesni_decrypt4 - xorps %xmm10,%xmm2 + pxor %xmm10,%xmm2 movdqa %xmm14,%xmm10 - xorps %xmm11,%xmm3 + pxor %xmm11,%xmm3 movdqa %xmm15,%xmm11 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) leaq 64(%rsi),%rsi jmp .Lxts_dec_done @@ -2155,17 +2415,18 @@ aesni_xts_decrypt: movups %xmm2,(%rsi) .Lxts_dec_ret: - movaps 96(%rsp),%xmm6 - movaps 112(%rsp),%xmm7 - movaps 128(%rsp),%xmm8 - movaps 144(%rsp),%xmm9 - movaps 160(%rsp),%xmm10 - movaps 176(%rsp),%xmm11 - movaps 192(%rsp),%xmm12 - movaps 208(%rsp),%xmm13 - movaps 224(%rsp),%xmm14 - movaps 240(%rsp),%xmm15 - leaq 264(%rsp),%rsp + movaps -160(%rbp),%xmm6 + movaps -144(%rbp),%xmm7 + movaps -128(%rbp),%xmm8 + movaps -112(%rbp),%xmm9 + movaps -96(%rbp),%xmm10 + movaps -80(%rbp),%xmm11 + movaps -64(%rbp),%xmm12 + movaps -48(%rbp),%xmm13 + movaps -32(%rbp),%xmm14 + movaps -16(%rbp),%xmm15 + leaq (%rbp),%rsp + popq %rbp .Lxts_dec_epilogue: movq 8(%rsp),%rdi movq 16(%rsp),%rsi @@ -2245,155 +2506,335 @@ aesni_cbc_encrypt: .p2align 4 .Lcbc_decrypt: - leaq -88(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,16(%rsp) - movaps %xmm8,32(%rsp) - movaps %xmm9,48(%rsp) + leaq (%rsp),%rax + pushq %rbp + subq $176,%rsp + andq $-16,%rsp + movaps %xmm6,16(%rsp) + movaps %xmm7,32(%rsp) + movaps %xmm8,48(%rsp) + movaps %xmm9,64(%rsp) + movaps %xmm10,80(%rsp) + movaps %xmm11,96(%rsp) + movaps %xmm12,112(%rsp) + movaps %xmm13,128(%rsp) + movaps %xmm14,144(%rsp) + movaps %xmm15,160(%rsp) .Lcbc_decrypt_body: - movups (%r8),%xmm9 + leaq -8(%rax),%rbp + movups (%r8),%xmm10 movl %r10d,%eax - cmpq $112,%rdx + cmpq $80,%rdx jbe .Lcbc_dec_tail - shrl $1,%r10d + + movups (%rcx),%xmm0 + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 + cmpq $112,%rdx + jbe .Lcbc_dec_six_or_seven + subq $112,%rdx - movl %r10d,%eax - movaps %xmm9,64(%rsp) + leaq 112(%rcx),%rcx jmp .Lcbc_dec_loop8_enter .p2align 4 .Lcbc_dec_loop8: - movaps %xmm0,64(%rsp) movups %xmm9,(%rsi) leaq 16(%rsi),%rsi .Lcbc_dec_loop8_enter: - movups (%rcx),%xmm0 - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 16(%rcx),%xmm1 + movdqu 96(%rdi),%xmm8 + pxor %xmm0,%xmm2 + movdqu 112(%rdi),%xmm9 + pxor %xmm0,%xmm3 + movups 16-112(%rcx),%xmm1 + pxor %xmm0,%xmm4 + xorq %r11,%r11 + cmpq $112,%rdx + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 - leaq 32(%rcx),%rcx - movdqu 32(%rdi),%xmm4 - xorps %xmm0,%xmm2 - movdqu 48(%rdi),%xmm5 - xorps %xmm0,%xmm3 - movdqu 64(%rdi),%xmm6 .byte 102,15,56,222,209 - pxor %xmm0,%xmm4 - movdqu 80(%rdi),%xmm7 + pxor %xmm0,%xmm9 + movups 32-112(%rcx),%xmm0 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + setnc %r11b +.byte 102,68,15,56,222,193 + shlq $7,%r11 +.byte 102,68,15,56,222,201 + addq %rdi,%r11 + movups 48-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 64-112(%rcx),%xmm0 +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 80-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 96-112(%rcx),%xmm0 +.byte 102,15,56,222,209 .byte 102,15,56,222,217 - pxor %xmm0,%xmm5 - movdqu 96(%rdi),%xmm8 .byte 102,15,56,222,225 - pxor %xmm0,%xmm6 - movdqu 112(%rdi),%xmm9 .byte 102,15,56,222,233 - pxor %xmm0,%xmm7 - decl %eax .byte 102,15,56,222,241 - pxor %xmm0,%xmm8 .byte 102,15,56,222,249 - pxor %xmm0,%xmm9 - movups (%rcx),%xmm0 .byte 102,68,15,56,222,193 .byte 102,68,15,56,222,201 - movups 16(%rcx),%xmm1 - - call .Ldec_loop8_enter + movups 112-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 128-112(%rcx),%xmm0 +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 144-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 160-112(%rcx),%xmm0 + cmpl $11,%eax + jb .Lcbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 176-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 192-112(%rcx),%xmm0 + je .Lcbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 208-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 224-112(%rcx),%xmm0 +.Lcbc_dec_done: +.byte 102,15,56,222,209 + pxor %xmm0,%xmm10 +.byte 102,15,56,222,217 + pxor %xmm0,%xmm11 +.byte 102,15,56,222,225 + pxor %xmm0,%xmm12 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm13 +.byte 102,15,56,222,241 + pxor %xmm0,%xmm14 +.byte 102,15,56,222,249 + pxor %xmm0,%xmm15 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movdqu 80(%rdi),%xmm1 + +.byte 102,65,15,56,223,210 + movdqu 96(%rdi),%xmm10 + pxor %xmm0,%xmm1 +.byte 102,65,15,56,223,219 + pxor %xmm0,%xmm10 + movdqu 112(%rdi),%xmm0 + leaq 128(%rdi),%rdi +.byte 102,65,15,56,223,228 + movdqu 0(%r11),%xmm11 +.byte 102,65,15,56,223,237 + movdqu 16(%r11),%xmm12 +.byte 102,65,15,56,223,246 + movdqu 32(%r11),%xmm13 +.byte 102,65,15,56,223,255 + movdqu 48(%r11),%xmm14 +.byte 102,68,15,56,223,193 + movdqu 64(%r11),%xmm15 +.byte 102,69,15,56,223,202 + movdqa %xmm0,%xmm10 + movdqu 80(%r11),%xmm1 + movups -112(%rcx),%xmm0 - movups (%rdi),%xmm1 - movups 16(%rdi),%xmm0 - xorps 64(%rsp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%rdi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%rdi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%rdi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%rdi),%xmm0 - xorps %xmm1,%xmm7 - movups 96(%rdi),%xmm1 - xorps %xmm0,%xmm8 - movups 112(%rdi),%xmm0 - xorps %xmm1,%xmm9 movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 movups %xmm5,48(%rsi) - movl %r10d,%eax + movdqa %xmm14,%xmm5 movups %xmm6,64(%rsi) - movq %r11,%rcx + movdqa %xmm15,%xmm6 movups %xmm7,80(%rsi) - leaq 128(%rdi),%rdi + movdqa %xmm1,%xmm7 movups %xmm8,96(%rsi) leaq 112(%rsi),%rsi + subq $128,%rdx ja .Lcbc_dec_loop8 movaps %xmm9,%xmm2 - movaps %xmm0,%xmm9 + leaq -112(%rcx),%rcx addq $112,%rdx jle .Lcbc_dec_tail_collected - movups %xmm2,(%rsi) - leal 1(%r10,%r10,1),%eax + movups %xmm9,(%rsi) leaq 16(%rsi),%rsi + cmpq $80,%rdx + jbe .Lcbc_dec_tail + + movaps %xmm11,%xmm2 +.Lcbc_dec_six_or_seven: + cmpq $96,%rdx + ja .Lcbc_dec_seven + + movaps %xmm7,%xmm8 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm8,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + leaq 80(%rsi),%rsi + movdqa %xmm7,%xmm2 + jmp .Lcbc_dec_tail_collected + +.p2align 4 +.Lcbc_dec_seven: + movups 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 + call _aesni_decrypt8 + movups 80(%rdi),%xmm9 + pxor %xmm10,%xmm2 + movups 96(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + pxor %xmm9,%xmm8 + movdqu %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + movdqa %xmm8,%xmm2 + jmp .Lcbc_dec_tail_collected + .Lcbc_dec_tail: movups (%rdi),%xmm2 - movaps %xmm2,%xmm8 - cmpq $16,%rdx + subq $16,%rdx jbe .Lcbc_dec_one movups 16(%rdi),%xmm3 - movaps %xmm3,%xmm7 - cmpq $32,%rdx + movaps %xmm2,%xmm11 + subq $16,%rdx jbe .Lcbc_dec_two movups 32(%rdi),%xmm4 - movaps %xmm4,%xmm6 - cmpq $48,%rdx + movaps %xmm3,%xmm12 + subq $16,%rdx jbe .Lcbc_dec_three movups 48(%rdi),%xmm5 - cmpq $64,%rdx + movaps %xmm4,%xmm13 + subq $16,%rdx jbe .Lcbc_dec_four movups 64(%rdi),%xmm6 - cmpq $80,%rdx - jbe .Lcbc_dec_five - - movups 80(%rdi),%xmm7 - cmpq $96,%rdx - jbe .Lcbc_dec_six - - movups 96(%rdi),%xmm8 - movaps %xmm9,64(%rsp) - call _aesni_decrypt8 - movups (%rdi),%xmm1 - movups 16(%rdi),%xmm0 - xorps 64(%rsp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%rdi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%rdi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%rdi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%rdi),%xmm0 - xorps %xmm1,%xmm7 - movups 96(%rdi),%xmm9 - xorps %xmm0,%xmm8 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - leaq 96(%rsi),%rsi - movaps %xmm8,%xmm2 - subq $112,%rdx + movaps %xmm5,%xmm14 + movaps %xmm6,%xmm15 + xorps %xmm7,%xmm7 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm15,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + leaq 64(%rsi),%rsi + movdqa %xmm6,%xmm2 + subq $16,%rdx jmp .Lcbc_dec_tail_collected + .p2align 4 .Lcbc_dec_one: + movaps %xmm2,%xmm11 movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -2405,116 +2846,79 @@ aesni_cbc_encrypt: leaq 16(%rcx),%rcx jnz .Loop_dec1_16 .byte 102,15,56,223,209 - xorps %xmm9,%xmm2 - movaps %xmm8,%xmm9 - subq $16,%rdx + xorps %xmm10,%xmm2 + movaps %xmm11,%xmm10 jmp .Lcbc_dec_tail_collected .p2align 4 .Lcbc_dec_two: + movaps %xmm3,%xmm12 xorps %xmm4,%xmm4 call _aesni_decrypt3 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - movaps %xmm7,%xmm9 - movaps %xmm3,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm12,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + movdqa %xmm3,%xmm2 leaq 16(%rsi),%rsi - subq $32,%rdx jmp .Lcbc_dec_tail_collected .p2align 4 .Lcbc_dec_three: + movaps %xmm4,%xmm13 call _aesni_decrypt3 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%rsi) - movaps %xmm6,%xmm9 - movaps %xmm4,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm13,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + movdqa %xmm4,%xmm2 leaq 32(%rsi),%rsi - subq $48,%rdx jmp .Lcbc_dec_tail_collected .p2align 4 .Lcbc_dec_four: + movaps %xmm5,%xmm14 call _aesni_decrypt4 - xorps %xmm9,%xmm2 - movups 48(%rdi),%xmm9 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%rsi) - xorps %xmm6,%xmm5 - movups %xmm4,32(%rsi) - movaps %xmm5,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm14,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + movdqa %xmm5,%xmm2 leaq 48(%rsi),%rsi - subq $64,%rdx - jmp .Lcbc_dec_tail_collected -.p2align 4 -.Lcbc_dec_five: - xorps %xmm7,%xmm7 - call _aesni_decrypt6 - movups 16(%rdi),%xmm1 - movups 32(%rdi),%xmm0 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - xorps %xmm1,%xmm4 - movups 48(%rdi),%xmm1 - xorps %xmm0,%xmm5 - movups 64(%rdi),%xmm9 - xorps %xmm1,%xmm6 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - movaps %xmm6,%xmm2 - subq $80,%rdx - jmp .Lcbc_dec_tail_collected -.p2align 4 -.Lcbc_dec_six: - call _aesni_decrypt6 - movups 16(%rdi),%xmm1 - movups 32(%rdi),%xmm0 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - xorps %xmm1,%xmm4 - movups 48(%rdi),%xmm1 - xorps %xmm0,%xmm5 - movups 64(%rdi),%xmm0 - xorps %xmm1,%xmm6 - movups 80(%rdi),%xmm9 - xorps %xmm0,%xmm7 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - movaps %xmm7,%xmm2 - subq $96,%rdx jmp .Lcbc_dec_tail_collected + .p2align 4 .Lcbc_dec_tail_collected: + movups %xmm10,(%r8) andq $15,%rdx - movups %xmm9,(%r8) jnz .Lcbc_dec_tail_partial movups %xmm2,(%rsi) jmp .Lcbc_dec_ret .p2align 4 .Lcbc_dec_tail_partial: - movaps %xmm2,64(%rsp) + movaps %xmm2,(%rsp) movq $16,%rcx movq %rsi,%rdi subq %rdx,%rcx - leaq 64(%rsp),%rsi + leaq (%rsp),%rsi .long 0x9066A4F3 .Lcbc_dec_ret: - movaps (%rsp),%xmm6 - movaps 16(%rsp),%xmm7 - movaps 32(%rsp),%xmm8 - movaps 48(%rsp),%xmm9 - leaq 88(%rsp),%rsp + movaps 16(%rsp),%xmm6 + movaps 32(%rsp),%xmm7 + movaps 48(%rsp),%xmm8 + movaps 64(%rsp),%xmm9 + movaps 80(%rsp),%xmm10 + movaps 96(%rsp),%xmm11 + movaps 112(%rsp),%xmm12 + movaps 128(%rsp),%xmm13 + movaps 144(%rsp),%xmm14 + movaps 160(%rsp),%xmm15 + leaq (%rbp),%rsp + popq %rbp .Lcbc_ret: movq 8(%rsp),%rdi movq 16(%rsp),%rsi @@ -2759,6 +3163,8 @@ __aesni_set_encrypt_key: .long 1,0,0,0 .Lxts_magic: .long 0x87,0,1,0 +.Lincrement1: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 @@ -2823,45 +3229,9 @@ ccm64_se_handler: jmp .Lcommon_seh_tail -.def ctr32_se_handler; .scl 3; .type 32; .endef -.p2align 4 -ctr32_se_handler: - pushq %rsi - pushq %rdi - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushfq - subq $64,%rsp - - movq 120(%r8),%rax - movq 248(%r8),%rbx - - leaq .Lctr32_body(%rip),%r10 - cmpq %r10,%rbx - jb .Lcommon_seh_tail - - movq 152(%r8),%rax - - leaq .Lctr32_ret(%rip),%r10 - cmpq %r10,%rbx - jae .Lcommon_seh_tail - - leaq 32(%rax),%rsi - leaq 512(%r8),%rdi - movl $20,%ecx -.long 0xa548f3fc - leaq 200(%rax),%rax - - jmp .Lcommon_seh_tail - - -.def xts_se_handler; .scl 3; .type 32; .endef +.def ctr_xts_se_handler; .scl 3; .type 32; .endef .p2align 4 -xts_se_handler: +ctr_xts_se_handler: pushq %rsi pushq %rdi pushq %rbx @@ -2891,13 +3261,13 @@ xts_se_handler: cmpq %r10,%rbx jae .Lcommon_seh_tail - leaq 96(%rax),%rsi + movq 160(%r8),%rax + leaq -160(%rax),%rsi leaq 512(%r8),%rdi movl $20,%ecx .long 0xa548f3fc - leaq 104+160(%rax),%rax - jmp .Lcommon_seh_tail + jmp .Lcommon_rbp_tail .def cbc_se_handler; .scl 3; .type 32; .endef .p2align 4 @@ -2928,11 +3298,16 @@ cbc_se_handler: cmpq %r10,%rbx jae .Lcommon_seh_tail - leaq 0(%rax),%rsi + leaq 16(%rax),%rsi leaq 512(%r8),%rdi - movl $8,%ecx + movl $20,%ecx .long 0xa548f3fc - leaq 88(%rax),%rax + +.Lcommon_rbp_tail: + movq 160(%r8),%rax + movq (%rax),%rbp + leaq 8(%rax),%rax + movq %rbp,160(%r8) jmp .Lcommon_seh_tail .Lrestore_cbc_rax: @@ -3029,14 +3404,15 @@ cbc_se_handler: .rva .Lccm64_dec_body,.Lccm64_dec_ret .LSEH_info_ctr32: .byte 9,0,0,0 -.rva ctr32_se_handler +.rva ctr_xts_se_handler +.rva .Lctr32_body,.Lctr32_epilogue .LSEH_info_xts_enc: .byte 9,0,0,0 -.rva xts_se_handler +.rva ctr_xts_se_handler .rva .Lxts_enc_body,.Lxts_enc_epilogue .LSEH_info_xts_dec: .byte 9,0,0,0 -.rva xts_se_handler +.rva ctr_xts_se_handler .rva .Lxts_dec_body,.Lxts_dec_epilogue .LSEH_info_cbc: .byte 9,0,0,0 diff --git a/lib/accelerated/x86/coff/padlock-x86-64-coff.s b/lib/accelerated/x86/coff/padlock-x86-64-coff.s index 9f658ee..a3a0e30 100644 --- a/lib/accelerated/x86/coff/padlock-x86-64-coff.s +++ b/lib/accelerated/x86/coff/padlock-x86-64-coff.s @@ -686,6 +686,501 @@ padlock_cbc_encrypt: movq 16(%rsp),%rsi .byte 0xf3,0xc3 .LSEH_end_padlock_cbc_encrypt: +.globl padlock_cfb_encrypt +.def padlock_cfb_encrypt; .scl 2; .type 32; .endef +.p2align 4 +padlock_cfb_encrypt: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_padlock_cfb_encrypt: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + pushq %rbp + pushq %rbx + + xorl %eax,%eax + testq $15,%rdx + jnz .Lcfb_abort + testq $15,%rcx + jnz .Lcfb_abort + leaq .Lpadlock_saved_context(%rip),%rax + pushf + cld + call _padlock_verify_ctx + leaq 16(%rdx),%rdx + xorl %eax,%eax + xorl %ebx,%ebx + testl $32,(%rdx) + jnz .Lcfb_aligned + testq $15,%rdi + setz %al + testq $15,%rsi + setz %bl + testl %ebx,%eax + jnz .Lcfb_aligned + negq %rax + movq $512,%rbx + notq %rax + leaq (%rsp),%rbp + cmpq %rbx,%rcx + cmovcq %rcx,%rbx + andq %rbx,%rax + movq %rcx,%rbx + negq %rax + andq $512-1,%rbx + leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + jmp .Lcfb_loop +.p2align 4 +.Lcfb_loop: + cmpq %rcx,%rbx + cmovaq %rcx,%rbx + movq %rdi,%r8 + movq %rsi,%r9 + movq %rcx,%r10 + movq %rbx,%rcx + movq %rbx,%r11 + testq $15,%rdi + cmovnzq %rsp,%rdi + testq $15,%rsi + jz .Lcfb_inp_aligned + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi + movq %rbx,%rcx + movq %rdi,%rsi +.Lcfb_inp_aligned: + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,224 + movdqa (%rax),%xmm0 + movdqa %xmm0,-16(%rdx) + movq %r8,%rdi + movq %r11,%rbx + testq $15,%rdi + jz .Lcfb_out_aligned + movq %rbx,%rcx + leaq (%rsp),%rsi + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi +.Lcfb_out_aligned: + movq %r9,%rsi + movq %r10,%rcx + addq %rbx,%rdi + addq %rbx,%rsi + subq %rbx,%rcx + movq $512,%rbx + jnz .Lcfb_loop + cmpq %rbp,%rsp + je .Lcfb_done + + pxor %xmm0,%xmm0 + leaq (%rsp),%rax +.Lcfb_bzero: + movaps %xmm0,(%rax) + leaq 16(%rax),%rax + cmpq %rax,%rbp + ja .Lcfb_bzero + +.Lcfb_done: + leaq (%rbp),%rsp + jmp .Lcfb_exit + +.p2align 4 +.Lcfb_aligned: + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,224 + movdqa (%rax),%xmm0 + movdqa %xmm0,-16(%rdx) +.Lcfb_exit: + movl $1,%eax + leaq 8(%rsp),%rsp +.Lcfb_abort: + popq %rbx + popq %rbp + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_padlock_cfb_encrypt: +.globl padlock_ofb_encrypt +.def padlock_ofb_encrypt; .scl 2; .type 32; .endef +.p2align 4 +padlock_ofb_encrypt: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_padlock_ofb_encrypt: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + pushq %rbp + pushq %rbx + + xorl %eax,%eax + testq $15,%rdx + jnz .Lofb_abort + testq $15,%rcx + jnz .Lofb_abort + leaq .Lpadlock_saved_context(%rip),%rax + pushf + cld + call _padlock_verify_ctx + leaq 16(%rdx),%rdx + xorl %eax,%eax + xorl %ebx,%ebx + testl $32,(%rdx) + jnz .Lofb_aligned + testq $15,%rdi + setz %al + testq $15,%rsi + setz %bl + testl %ebx,%eax + jnz .Lofb_aligned + negq %rax + movq $512,%rbx + notq %rax + leaq (%rsp),%rbp + cmpq %rbx,%rcx + cmovcq %rcx,%rbx + andq %rbx,%rax + movq %rcx,%rbx + negq %rax + andq $512-1,%rbx + leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + jmp .Lofb_loop +.p2align 4 +.Lofb_loop: + cmpq %rcx,%rbx + cmovaq %rcx,%rbx + movq %rdi,%r8 + movq %rsi,%r9 + movq %rcx,%r10 + movq %rbx,%rcx + movq %rbx,%r11 + testq $15,%rdi + cmovnzq %rsp,%rdi + testq $15,%rsi + jz .Lofb_inp_aligned + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi + movq %rbx,%rcx + movq %rdi,%rsi +.Lofb_inp_aligned: + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,232 + movdqa (%rax),%xmm0 + movdqa %xmm0,-16(%rdx) + movq %r8,%rdi + movq %r11,%rbx + testq $15,%rdi + jz .Lofb_out_aligned + movq %rbx,%rcx + leaq (%rsp),%rsi + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi +.Lofb_out_aligned: + movq %r9,%rsi + movq %r10,%rcx + addq %rbx,%rdi + addq %rbx,%rsi + subq %rbx,%rcx + movq $512,%rbx + jnz .Lofb_loop + cmpq %rbp,%rsp + je .Lofb_done + + pxor %xmm0,%xmm0 + leaq (%rsp),%rax +.Lofb_bzero: + movaps %xmm0,(%rax) + leaq 16(%rax),%rax + cmpq %rax,%rbp + ja .Lofb_bzero + +.Lofb_done: + leaq (%rbp),%rsp + jmp .Lofb_exit + +.p2align 4 +.Lofb_aligned: + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,232 + movdqa (%rax),%xmm0 + movdqa %xmm0,-16(%rdx) +.Lofb_exit: + movl $1,%eax + leaq 8(%rsp),%rsp +.Lofb_abort: + popq %rbx + popq %rbp + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_padlock_ofb_encrypt: +.globl padlock_ctr32_encrypt +.def padlock_ctr32_encrypt; .scl 2; .type 32; .endef +.p2align 4 +padlock_ctr32_encrypt: + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_padlock_ctr32_encrypt: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + pushq %rbp + pushq %rbx + + xorl %eax,%eax + testq $15,%rdx + jnz .Lctr32_abort + testq $15,%rcx + jnz .Lctr32_abort + leaq .Lpadlock_saved_context(%rip),%rax + pushf + cld + call _padlock_verify_ctx + leaq 16(%rdx),%rdx + xorl %eax,%eax + xorl %ebx,%ebx + testl $32,(%rdx) + jnz .Lctr32_aligned + testq $15,%rdi + setz %al + testq $15,%rsi + setz %bl + testl %ebx,%eax + jnz .Lctr32_aligned + negq %rax + movq $512,%rbx + notq %rax + leaq (%rsp),%rbp + cmpq %rbx,%rcx + cmovcq %rcx,%rbx + andq %rbx,%rax + movq %rcx,%rbx + negq %rax + andq $512-1,%rbx + leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx +.Lctr32_reenter: + movl -4(%rdx),%eax + bswapl %eax + negl %eax + andl $31,%eax + movq $512,%rbx + shll $4,%eax + cmovzq %rbx,%rax + cmpq %rax,%rcx + cmovaq %rax,%rbx + cmovbeq %rcx,%rbx + cmpq %rbx,%rcx + ja .Lctr32_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $32,%rax + movq $-32,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lctr32_unaligned_tail + jmp .Lctr32_loop +.p2align 4 +.Lctr32_loop: + cmpq %rcx,%rbx + cmovaq %rcx,%rbx + movq %rdi,%r8 + movq %rsi,%r9 + movq %rcx,%r10 + movq %rbx,%rcx + movq %rbx,%r11 + testq $15,%rdi + cmovnzq %rsp,%rdi + testq $15,%rsi + jz .Lctr32_inp_aligned + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi + movq %rbx,%rcx + movq %rdi,%rsi +.Lctr32_inp_aligned: + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,216 + movl -4(%rdx),%eax + testl $4294901760,%eax + jnz .Lctr32_no_carry + bswapl %eax + addl $65536,%eax + bswapl %eax + movl %eax,-4(%rdx) +.Lctr32_no_carry: + movq %r8,%rdi + movq %r11,%rbx + testq $15,%rdi + jz .Lctr32_out_aligned + movq %rbx,%rcx + leaq (%rsp),%rsi + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi +.Lctr32_out_aligned: + movq %r9,%rsi + movq %r10,%rcx + addq %rbx,%rdi + addq %rbx,%rsi + subq %rbx,%rcx + movq $512,%rbx + jz .Lctr32_break + cmpq %rbx,%rcx + jae .Lctr32_loop + movq %rcx,%rbx + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $32,%rax + movq $-32,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jnz .Lctr32_loop +.Lctr32_unaligned_tail: + xorl %eax,%eax + cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lctr32_loop +.p2align 4 +.Lctr32_break: + cmpq %rbp,%rsp + je .Lctr32_done + + pxor %xmm0,%xmm0 + leaq (%rsp),%rax +.Lctr32_bzero: + movaps %xmm0,(%rax) + leaq 16(%rax),%rax + cmpq %rax,%rbp + ja .Lctr32_bzero + +.Lctr32_done: + leaq (%rbp),%rsp + jmp .Lctr32_exit + +.p2align 4 +.Lctr32_aligned: + movl -4(%rdx),%eax + bswapl %eax + negl %eax + andl $65535,%eax + movq $1048576,%rbx + shll $4,%eax + cmovzq %rbx,%rax + cmpq %rax,%rcx + cmovaq %rax,%rbx + cmovbeq %rcx,%rbx + jbe .Lctr32_aligned_skip + +.Lctr32_aligned_loop: + movq %rcx,%r10 + movq %rbx,%rcx + movq %rbx,%r11 + + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,216 + + movl -4(%rdx),%eax + bswapl %eax + addl $65536,%eax + bswapl %eax + movl %eax,-4(%rdx) + + movq %r10,%rcx + subq %r11,%rcx + movq $1048576,%rbx + jz .Lctr32_exit + cmpq %rbx,%rcx + jae .Lctr32_aligned_loop + +.Lctr32_aligned_skip: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $32,%rbp + movq $32-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lctr32_aligned_tail + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,216 + testq %rbp,%rbp + jz .Lctr32_exit + +.Lctr32_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lctr32_loop +.Lctr32_exit: + movl $1,%eax + leaq 8(%rsp),%rsp +.Lctr32_abort: + popq %rbx + popq %rbp + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_padlock_ctr32_encrypt: .byte 86,73,65,32,80,97,100,108,111,99,107,32,120,56,54,95,54,52,32,109,111,100,117,108,101,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 4 .data diff --git a/lib/accelerated/x86/coff/padlock-x86-coff.s b/lib/accelerated/x86/coff/padlock-x86-coff.s index 69eb468..d969f30 100644 --- a/lib/accelerated/x86/coff/padlock-x86-coff.s +++ b/lib/accelerated/x86/coff/padlock-x86-coff.s @@ -515,6 +515,354 @@ _padlock_cbc_encrypt: popl %ebx popl %ebp ret +.globl _padlock_cfb_encrypt +.def _padlock_cfb_encrypt; .scl 2; .type 32; .endef +.align 16 +_padlock_cfb_encrypt: +.L_padlock_cfb_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + testl $15,%edx + jnz .L028cfb_abort + testl $15,%ecx + jnz .L028cfb_abort + leal .Lpadlock_saved_context,%eax + pushfl + cld + call __padlock_verify_ctx +.L029cfb_pic_point: + leal 16(%edx),%edx + xorl %eax,%eax + xorl %ebx,%ebx + testl $32,(%edx) + jnz .L030cfb_aligned + testl $15,%edi + setz %al + testl $15,%esi + setz %bl + testl %ebx,%eax + jnz .L030cfb_aligned + negl %eax + movl $512,%ebx + notl %eax + leal -24(%esp),%ebp + cmpl %ebx,%ecx + cmovcl %ecx,%ebx + andl %ebx,%eax + movl %ecx,%ebx + negl %eax + andl $511,%ebx + leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + jmp .L031cfb_loop +.align 16 +.L031cfb_loop: + movl %edi,(%ebp) + movl %esi,4(%ebp) + movl %ecx,8(%ebp) + movl %ebx,%ecx + movl %ebx,12(%ebp) + testl $15,%edi + cmovnzl %esp,%edi + testl $15,%esi + jz .L032cfb_inp_aligned + shrl $2,%ecx +.byte 243,165 + subl %ebx,%edi + movl %ebx,%ecx + movl %edi,%esi +.L032cfb_inp_aligned: + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +.byte 243,15,167,224 + movaps (%eax),%xmm0 + movaps %xmm0,-16(%edx) + movl (%ebp),%edi + movl 12(%ebp),%ebx + testl $15,%edi + jz .L033cfb_out_aligned + movl %ebx,%ecx + leal (%esp),%esi + shrl $2,%ecx +.byte 243,165 + subl %ebx,%edi +.L033cfb_out_aligned: + movl 4(%ebp),%esi + movl 8(%ebp),%ecx + addl %ebx,%edi + addl %ebx,%esi + subl %ebx,%ecx + movl $512,%ebx + jnz .L031cfb_loop + cmpl %ebp,%esp + je .L034cfb_done + pxor %xmm0,%xmm0 + leal (%esp),%eax +.L035cfb_bzero: + movaps %xmm0,(%eax) + leal 16(%eax),%eax + cmpl %eax,%ebp + ja .L035cfb_bzero +.L034cfb_done: + movl 16(%ebp),%ebp + leal 24(%ebp),%esp + jmp .L036cfb_exit +.align 16 +.L030cfb_aligned: + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +.byte 243,15,167,224 + movaps (%eax),%xmm0 + movaps %xmm0,-16(%edx) +.L036cfb_exit: + movl $1,%eax + leal 4(%esp),%esp +.L028cfb_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _padlock_ofb_encrypt +.def _padlock_ofb_encrypt; .scl 2; .type 32; .endef +.align 16 +_padlock_ofb_encrypt: +.L_padlock_ofb_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + testl $15,%edx + jnz .L037ofb_abort + testl $15,%ecx + jnz .L037ofb_abort + leal .Lpadlock_saved_context,%eax + pushfl + cld + call __padlock_verify_ctx +.L038ofb_pic_point: + leal 16(%edx),%edx + xorl %eax,%eax + xorl %ebx,%ebx + testl $32,(%edx) + jnz .L039ofb_aligned + testl $15,%edi + setz %al + testl $15,%esi + setz %bl + testl %ebx,%eax + jnz .L039ofb_aligned + negl %eax + movl $512,%ebx + notl %eax + leal -24(%esp),%ebp + cmpl %ebx,%ecx + cmovcl %ecx,%ebx + andl %ebx,%eax + movl %ecx,%ebx + negl %eax + andl $511,%ebx + leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + jmp .L040ofb_loop +.align 16 +.L040ofb_loop: + movl %edi,(%ebp) + movl %esi,4(%ebp) + movl %ecx,8(%ebp) + movl %ebx,%ecx + movl %ebx,12(%ebp) + testl $15,%edi + cmovnzl %esp,%edi + testl $15,%esi + jz .L041ofb_inp_aligned + shrl $2,%ecx +.byte 243,165 + subl %ebx,%edi + movl %ebx,%ecx + movl %edi,%esi +.L041ofb_inp_aligned: + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +.byte 243,15,167,232 + movaps (%eax),%xmm0 + movaps %xmm0,-16(%edx) + movl (%ebp),%edi + movl 12(%ebp),%ebx + testl $15,%edi + jz .L042ofb_out_aligned + movl %ebx,%ecx + leal (%esp),%esi + shrl $2,%ecx +.byte 243,165 + subl %ebx,%edi +.L042ofb_out_aligned: + movl 4(%ebp),%esi + movl 8(%ebp),%ecx + addl %ebx,%edi + addl %ebx,%esi + subl %ebx,%ecx + movl $512,%ebx + jnz .L040ofb_loop + cmpl %ebp,%esp + je .L043ofb_done + pxor %xmm0,%xmm0 + leal (%esp),%eax +.L044ofb_bzero: + movaps %xmm0,(%eax) + leal 16(%eax),%eax + cmpl %eax,%ebp + ja .L044ofb_bzero +.L043ofb_done: + movl 16(%ebp),%ebp + leal 24(%ebp),%esp + jmp .L045ofb_exit +.align 16 +.L039ofb_aligned: + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +.byte 243,15,167,232 + movaps (%eax),%xmm0 + movaps %xmm0,-16(%edx) +.L045ofb_exit: + movl $1,%eax + leal 4(%esp),%esp +.L037ofb_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _padlock_ctr32_encrypt +.def _padlock_ctr32_encrypt; .scl 2; .type 32; .endef +.align 16 +_padlock_ctr32_encrypt: +.L_padlock_ctr32_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + testl $15,%edx + jnz .L046ctr32_abort + testl $15,%ecx + jnz .L046ctr32_abort + leal .Lpadlock_saved_context,%eax + pushfl + cld + call __padlock_verify_ctx +.L047ctr32_pic_point: + leal 16(%edx),%edx + xorl %eax,%eax + movq -16(%edx),%mm0 + movl $512,%ebx + notl %eax + leal -24(%esp),%ebp + cmpl %ebx,%ecx + cmovcl %ecx,%ebx + andl %ebx,%eax + movl %ecx,%ebx + negl %eax + andl $511,%ebx + leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + jmp .L048ctr32_loop +.align 16 +.L048ctr32_loop: + movl %edi,(%ebp) + movl %esi,4(%ebp) + movl %ecx,8(%ebp) + movl %ebx,%ecx + movl %ebx,12(%ebp) + movl -4(%edx),%ecx + xorl %edi,%edi + movl -8(%edx),%eax +.L049ctr32_prepare: + movl %ecx,12(%esp,%edi,1) + bswap %ecx + movq %mm0,(%esp,%edi,1) + incl %ecx + movl %eax,8(%esp,%edi,1) + bswap %ecx + leal 16(%edi),%edi + cmpl %ebx,%edi + jb .L049ctr32_prepare + movl %ecx,-4(%edx) + leal (%esp),%esi + leal (%esp),%edi + movl %ebx,%ecx + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +.byte 243,15,167,200 + movl (%ebp),%edi + movl 12(%ebp),%ebx + movl 4(%ebp),%esi + xorl %ecx,%ecx +.L050ctr32_xor: + movups (%esi,%ecx,1),%xmm1 + leal 16(%ecx),%ecx + pxor -16(%esp,%ecx,1),%xmm1 + movups %xmm1,-16(%edi,%ecx,1) + cmpl %ebx,%ecx + jb .L050ctr32_xor + movl 8(%ebp),%ecx + addl %ebx,%edi + addl %ebx,%esi + subl %ebx,%ecx + movl $512,%ebx + jnz .L048ctr32_loop + pxor %xmm0,%xmm0 + leal (%esp),%eax +.L051ctr32_bzero: + movaps %xmm0,(%eax) + leal 16(%eax),%eax + cmpl %eax,%ebp + ja .L051ctr32_bzero +.L052ctr32_done: + movl 16(%ebp),%ebp + leal 24(%ebp),%esp + movl $1,%eax + leal 4(%esp),%esp + emms +.L046ctr32_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret .globl _padlock_xstore .def _padlock_xstore; .scl 2; .type 32; .endef .align 16 @@ -533,10 +881,10 @@ __win32_segv_handler: movl 4(%esp),%edx movl 12(%esp),%ecx cmpl $3221225477,(%edx) - jne .L028ret + jne .L053ret addl $4,184(%ecx) movl $0,%eax -.L028ret: +.L053ret: ret .globl _padlock_sha1_oneshot .def _padlock_sha1_oneshot; .scl 2; .type 32; .endef diff --git a/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s b/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s index 8f2b96f..9755951 100644 --- a/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s +++ b/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s @@ -697,6 +697,7 @@ gcm_ghash_4bit: .type gcm_init_clmul,@function .align 16 gcm_init_clmul: +.L_init_clmul: movdqu (%rsi),%xmm2 pshufd $78,%xmm2,%xmm2 @@ -715,15 +716,15 @@ gcm_init_clmul: pxor %xmm5,%xmm2 + pshufd $78,%xmm2,%xmm6 movdqa %xmm2,%xmm0 + pxor %xmm2,%xmm6 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 +.byte 102,15,58,68,222,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 @@ -733,44 +734,134 @@ gcm_init_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,0(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,32(%rdi) + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - movdqu %xmm2,(%rdi) - movdqu %xmm0,16(%rdi) + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm5,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm5,%xmm3 + movdqu %xmm5,48(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,64(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,80(%rdi) .byte 0xf3,0xc3 .size gcm_init_clmul,.-gcm_init_clmul .globl gcm_gmult_clmul .type gcm_gmult_clmul,@function .align 16 gcm_gmult_clmul: +.L_gmult_clmul: movdqu (%rdi),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,220,0 @@ -783,186 +874,358 @@ gcm_gmult_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 .size gcm_gmult_clmul,.-gcm_gmult_clmul .globl gcm_ghash_clmul .type gcm_ghash_clmul,@function -.align 16 +.align 32 gcm_ghash_clmul: +.L_ghash_clmul: movdqa .Lbswap_mask(%rip),%xmm5 + movq $11547335547999543296,%rax movdqu (%rdi),%xmm0 movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm10 .byte 102,15,56,0,197 subq $16,%rcx jz .Lodd_tail - movdqu 16(%rsi),%xmm8 + movdqu 16(%rsi),%xmm9 + cmpq $48,%rcx + jb .Lskip4x + subq $48,%rcx + movdqu 48(%rsi),%xmm14 + movdqu 64(%rsi),%xmm15 - movdqu (%rdx),%xmm3 - movdqu 16(%rdx),%xmm6 -.byte 102,15,56,0,221 + movdqu 48(%rdx),%xmm6 + movdqu 32(%rdx),%xmm11 .byte 102,15,56,0,245 - pxor %xmm3,%xmm0 - movdqa %xmm6,%xmm7 - pshufd $78,%xmm6,%xmm3 - pshufd $78,%xmm2,%xmm4 - pxor %xmm6,%xmm3 - pxor %xmm2,%xmm4 +.byte 102,68,15,56,0,221 + movdqa %xmm6,%xmm8 + pshufd $78,%xmm6,%xmm7 + pxor %xmm6,%xmm7 .byte 102,15,58,68,242,0 -.byte 102,15,58,68,250,17 -.byte 102,15,58,68,220,0 - pxor %xmm6,%xmm3 - pxor %xmm7,%xmm3 +.byte 102,68,15,58,68,194,17 +.byte 102,65,15,58,68,250,0 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,217,0 +.byte 102,69,15,58,68,233,17 + xorps %xmm11,%xmm6 +.byte 102,69,15,58,68,226,16 + xorps %xmm13,%xmm8 + movups 80(%rsi),%xmm10 + xorps %xmm12,%xmm7 + + movdqu 16(%rdx),%xmm11 + movdqu 0(%rdx),%xmm3 +.byte 102,68,15,56,0,221 +.byte 102,15,56,0,221 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm3,%xmm0 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm6 +.byte 102,69,15,58,68,226,0 + xorps %xmm13,%xmm8 + + leaq 64(%rdx),%rdx + subq $64,%rcx + jc .Ltail4x + + jmp .Lmod4_loop +.align 32 +.Lmod4_loop: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm7 + movdqu 48(%rdx),%xmm11 +.byte 102,68,15,56,0,221 +.byte 102,65,15,58,68,207,17 + xorps %xmm6,%xmm0 + movdqu 32(%rdx),%xmm6 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 +.byte 102,65,15,58,68,218,16 + xorps %xmm8,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,15,56,0,245 + movups 32(%rsi),%xmm10 +.byte 102,68,15,58,68,218,0 + xorps %xmm7,%xmm3 + movdqa %xmm6,%xmm8 + pshufd $78,%xmm6,%xmm7 + + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm7 + pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 + pslldq $8,%xmm3 +.byte 102,68,15,58,68,234,17 + psrldq $8,%xmm4 + pxor %xmm3,%xmm0 + movdqa .L7_mask(%rip),%xmm3 + pxor %xmm4,%xmm1 +.byte 102,72,15,110,224 + + pand %xmm0,%xmm3 +.byte 102,15,56,0,227 +.byte 102,69,15,58,68,226,0 + pxor %xmm0,%xmm4 + psllq $57,%xmm4 + movdqa %xmm4,%xmm3 pslldq $8,%xmm4 - pxor %xmm3,%xmm7 - pxor %xmm4,%xmm6 +.byte 102,65,15,58,68,241,0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqu 0(%rdx),%xmm3 + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 +.byte 102,69,15,58,68,193,17 + xorps %xmm11,%xmm6 + movdqu 16(%rdx),%xmm11 +.byte 102,68,15,56,0,221 +.byte 102,65,15,58,68,250,16 + xorps %xmm13,%xmm8 + movups 80(%rsi),%xmm10 +.byte 102,15,56,0,221 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + + movdqa %xmm11,%xmm13 + pxor %xmm12,%xmm7 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + psrlq $1,%xmm0 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm6 + pxor %xmm1,%xmm0 + +.byte 102,69,15,58,68,226,0 + xorps %xmm13,%xmm8 + movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm8,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm8,%xmm4 - leaq 32(%rdx),%rdx - subq $32,%rcx - jbe .Leven_tail + leaq 64(%rdx),%rdx + subq $64,%rcx + jnc .Lmod4_loop + +.Ltail4x: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm7 +.byte 102,65,15,58,68,207,17 + xorps %xmm6,%xmm0 +.byte 102,65,15,58,68,218,16 + xorps %xmm8,%xmm1 + pxor %xmm0,%xmm1 + pxor %xmm7,%xmm3 -.Lmod_loop: -.byte 102,65,15,58,68,192,0 -.byte 102,65,15,58,68,200,17 -.byte 102,15,58,68,220,0 - pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 + pxor %xmm0,%xmm1 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 - movdqu (%rdx),%xmm3 - pxor %xmm6,%xmm0 - pxor %xmm7,%xmm1 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + addq $64,%rcx + jz .Ldone + movdqu 32(%rsi),%xmm10 + subq $16,%rcx + jz .Lodd_tail +.Lskip4x: + + + + + + movdqu (%rdx),%xmm3 movdqu 16(%rdx),%xmm6 .byte 102,15,56,0,221 .byte 102,15,56,0,245 + pxor %xmm3,%xmm0 + + movdqa %xmm6,%xmm8 + pshufd $78,%xmm6,%xmm3 + pxor %xmm6,%xmm3 +.byte 102,15,58,68,242,0 +.byte 102,68,15,58,68,194,17 +.byte 102,65,15,58,68,218,0 + + leaq 32(%rdx),%rdx + subq $32,%rcx + jbe .Leven_tail + jmp .Lmod_loop - movdqa %xmm6,%xmm7 - pshufd $78,%xmm6,%xmm9 - pshufd $78,%xmm2,%xmm10 - pxor %xmm6,%xmm9 - pxor %xmm2,%xmm10 +.align 32 +.Lmod_loop: + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,65,15,58,68,193,0 +.byte 102,65,15,58,68,201,17 +.byte 102,65,15,58,68,226,16 + + pxor %xmm6,%xmm0 + pxor %xmm8,%xmm1 + movdqu (%rdx),%xmm8 +.byte 102,68,15,56,0,197 + movdqu 16(%rdx),%xmm6 + + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + pxor %xmm8,%xmm1 + pxor %xmm3,%xmm4 +.byte 102,15,56,0,245 + movdqa %xmm4,%xmm3 + psrldq $8,%xmm3 + pslldq $8,%xmm4 pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm6,%xmm8 + + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 - pxor %xmm3,%xmm0 .byte 102,15,58,68,242,0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + pshufd $78,%xmm8,%xmm3 + pxor %xmm8,%xmm3 -.byte 102,15,58,68,250,17 +.byte 102,68,15,58,68,194,17 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 - -.byte 102,69,15,58,68,202,0 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm8,%xmm4 - pxor %xmm0,%xmm3 - pxor %xmm8,%xmm4 - - pxor %xmm6,%xmm9 - pxor %xmm7,%xmm9 - movdqa %xmm9,%xmm10 - psrldq $8,%xmm9 - pslldq $8,%xmm10 - pxor %xmm9,%xmm7 - pxor %xmm10,%xmm6 +.byte 102,65,15,58,68,218,0 + pxor %xmm1,%xmm0 leaq 32(%rdx),%rdx subq $32,%rcx ja .Lmod_loop .Leven_tail: -.byte 102,65,15,58,68,192,0 -.byte 102,65,15,58,68,200,17 -.byte 102,15,58,68,220,0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,65,15,58,68,193,0 +.byte 102,65,15,58,68,201,17 +.byte 102,65,15,58,68,226,16 + + pxor %xmm6,%xmm0 + pxor %xmm8,%xmm1 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 - - movdqa %xmm3,%xmm4 + pxor %xmm3,%xmm4 + movdqa %xmm4,%xmm3 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 - pxor %xmm6,%xmm0 - pxor %xmm7,%xmm1 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 testq %rcx,%rcx jnz .Ldone @@ -972,12 +1235,10 @@ gcm_ghash_clmul: pxor %xmm3,%xmm0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 +.byte 102,65,15,58,68,218,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 @@ -987,38 +1248,60 @@ gcm_ghash_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 .Ldone: .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 -.LSEH_end_gcm_ghash_clmul: .size gcm_ghash_clmul,.-gcm_ghash_clmul +.globl gcm_init_avx +.type gcm_init_avx,@function +.align 32 +gcm_init_avx: + jmp .L_init_clmul +.size gcm_init_avx,.-gcm_init_avx +.globl gcm_gmult_avx +.type gcm_gmult_avx,@function +.align 32 +gcm_gmult_avx: + jmp .L_gmult_clmul +.size gcm_gmult_avx,.-gcm_gmult_avx +.globl gcm_ghash_avx +.type gcm_ghash_avx,@function +.align 32 +gcm_ghash_avx: + jmp .L_ghash_clmul +.size gcm_ghash_avx,.-gcm_ghash_avx .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .L0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: +.long 7,0,7,0 +.L7_mask_poly: +.long 7,0,450,0 .align 64 .type .Lrem_4bit,@object .Lrem_4bit: diff --git a/lib/accelerated/x86/elf/appro-aes-x86-64.s b/lib/accelerated/x86/elf/appro-aes-x86-64.s index f48666f..d3734a6 100644 --- a/lib/accelerated/x86/elf/appro-aes-x86-64.s +++ b/lib/accelerated/x86/elf/appro-aes-x86-64.s @@ -925,199 +925,412 @@ aesni_ccm64_decrypt_blocks: .type aesni_ctr32_encrypt_blocks,@function .align 16 aesni_ctr32_encrypt_blocks: + leaq (%rsp),%rax + pushq %rbp + subq $128,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + cmpq $1,%rdx je .Lctr32_one_shortcut - movdqu (%r8),%xmm14 - movdqa .Lbswap_mask(%rip),%xmm15 - xorl %eax,%eax -.byte 102,69,15,58,22,242,3 -.byte 102,68,15,58,34,240,3 + movdqu (%r8),%xmm2 + movdqu (%rcx),%xmm0 + movl 12(%r8),%r8d + pxor %xmm0,%xmm2 + movl 12(%rcx),%r11d + movdqa %xmm2,0(%rsp) + bswapl %r8d + movdqa %xmm2,%xmm3 + movdqa %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm2,64(%rsp) + movdqa %xmm2,80(%rsp) + movdqa %xmm2,96(%rsp) + movdqa %xmm2,112(%rsp) movl 240(%rcx),%eax + + leaq 1(%r8),%r9 + leaq 2(%r8),%r10 + bswapl %r9d bswapl %r10d - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 -.byte 102,69,15,58,34,226,0 - leaq 3(%r10),%r11 -.byte 102,69,15,58,34,235,0 - incl %r10d -.byte 102,69,15,58,34,226,1 - incq %r11 -.byte 102,69,15,58,34,235,1 - incl %r10d -.byte 102,69,15,58,34,226,2 - incq %r11 -.byte 102,69,15,58,34,235,2 - movdqa %xmm12,-40(%rsp) -.byte 102,69,15,56,0,231 - movdqa %xmm13,-24(%rsp) -.byte 102,69,15,56,0,239 - - pshufd $192,%xmm12,%xmm2 - pshufd $128,%xmm12,%xmm3 - pshufd $64,%xmm12,%xmm4 - cmpq $6,%rdx - jb .Lctr32_tail - shrl $1,%eax - movq %rcx,%r11 - movl %eax,%r10d - subq $6,%rdx - jmp .Lctr32_loop6 + xorl %r11d,%r9d + xorl %r11d,%r10d +.byte 102,65,15,58,34,217,3 + leaq 3(%r8),%r9 + movdqa %xmm3,16(%rsp) +.byte 102,65,15,58,34,226,3 + bswapl %r9d + leaq 4(%r8),%r10 + movdqa %xmm4,32(%rsp) + xorl %r11d,%r9d + bswapl %r10d +.byte 102,65,15,58,34,233,3 + xorl %r11d,%r10d + movdqa %xmm5,48(%rsp) + leaq 5(%r8),%r9 + movl %r10d,64+12(%rsp) + bswapl %r9d + leaq 6(%r8),%r10 + xorl %r11d,%r9d + bswapl %r10d + movl %r9d,80+12(%rsp) + xorl %r11d,%r10d + leaq 7(%r8),%r9 + movl %r10d,96+12(%rsp) + bswapl %r9d + xorl %r11d,%r9d + movl %r9d,112+12(%rsp) -.align 16 -.Lctr32_loop6: - pshufd $192,%xmm13,%xmm5 - por %xmm14,%xmm2 - movups (%r11),%xmm0 - pshufd $128,%xmm13,%xmm6 - por %xmm14,%xmm3 - movups 16(%r11),%xmm1 - pshufd $64,%xmm13,%xmm7 - por %xmm14,%xmm4 - por %xmm14,%xmm5 - xorps %xmm0,%xmm2 - por %xmm14,%xmm6 - por %xmm14,%xmm7 + movups 16(%rcx),%xmm1 + movdqa 64(%rsp),%xmm6 + movdqa 80(%rsp),%xmm7 + cmpq $8,%rdx + jb .Lctr32_tail + leaq 128(%rcx),%rcx + subq $8,%rdx + jmp .Lctr32_loop8 - pxor %xmm0,%xmm3 +.align 32 +.Lctr32_loop8: + addl $8,%r8d + movdqa 96(%rsp),%xmm8 .byte 102,15,56,220,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 + movl %r8d,%r9d + movdqa 112(%rsp),%xmm9 .byte 102,15,56,220,217 - movdqa .Lincrement32(%rip),%xmm13 - pxor %xmm0,%xmm5 + bswapl %r9d + movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - movdqa -40(%rsp),%xmm12 - pxor %xmm0,%xmm6 + xorl %r11d,%r9d .byte 102,15,56,220,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax + movl %r9d,0+12(%rsp) + leaq 1(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 - jmp .Lctr32_enc_loop6_enter -.align 16 -.Lctr32_enc_loop6: +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 48-128(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + bswapl %r9d +.byte 102,15,56,220,224 + xorl %r11d,%r9d +.byte 102,15,56,220,232 + movl %r9d,16+12(%rsp) + leaq 2(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 64-128(%rcx),%xmm0 .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax + bswapl %r9d .byte 102,15,56,220,225 + xorl %r11d,%r9d .byte 102,15,56,220,233 + movl %r9d,32+12(%rsp) + leaq 3(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.Lctr32_enc_loop6_enter: - movups 16(%rcx),%xmm1 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 80-128(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx + bswapl %r9d .byte 102,15,56,220,224 + xorl %r11d,%r9d .byte 102,15,56,220,232 + movl %r9d,48+12(%rsp) + leaq 4(%r8),%r9 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%rcx),%xmm0 - jnz .Lctr32_enc_loop6 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 96-128(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + bswapl %r9d +.byte 102,15,56,220,225 + xorl %r11d,%r9d +.byte 102,15,56,220,233 + movl %r9d,64+12(%rsp) + leaq 5(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 112-128(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + bswapl %r9d +.byte 102,15,56,220,224 + xorl %r11d,%r9d +.byte 102,15,56,220,232 + movl %r9d,80+12(%rsp) + leaq 6(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 128-128(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + bswapl %r9d +.byte 102,15,56,220,225 + xorl %r11d,%r9d +.byte 102,15,56,220,233 + movl %r9d,96+12(%rsp) + leaq 7(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 144-128(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + bswapl %r9d +.byte 102,15,56,220,224 + xorl %r11d,%r9d +.byte 102,15,56,220,232 + movl %r9d,112+12(%rsp) +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 + movdqu 0(%rdi),%xmm10 +.byte 102,68,15,56,220,200 + movups 160-128(%rcx),%xmm0 + + cmpl $11,%eax + jb .Lctr32_enc_done .byte 102,15,56,220,209 - paddd %xmm13,%xmm12 .byte 102,15,56,220,217 - paddd -24(%rsp),%xmm13 .byte 102,15,56,220,225 - movdqa %xmm12,-40(%rsp) .byte 102,15,56,220,233 - movdqa %xmm13,-24(%rsp) .byte 102,15,56,220,241 -.byte 102,69,15,56,0,231 .byte 102,15,56,220,249 -.byte 102,69,15,56,0,239 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 176-128(%rcx),%xmm1 -.byte 102,15,56,221,208 - movups (%rdi),%xmm8 -.byte 102,15,56,221,216 - movups 16(%rdi),%xmm9 -.byte 102,15,56,221,224 - movups 32(%rdi),%xmm10 -.byte 102,15,56,221,232 - movups 48(%rdi),%xmm11 -.byte 102,15,56,221,240 - movups 64(%rdi),%xmm1 -.byte 102,15,56,221,248 - movups 80(%rdi),%xmm0 - leaq 96(%rdi),%rdi +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 192-128(%rcx),%xmm0 + je .Lctr32_enc_done - xorps %xmm2,%xmm8 - pshufd $192,%xmm12,%xmm2 - xorps %xmm3,%xmm9 - pshufd $128,%xmm12,%xmm3 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - pshufd $64,%xmm12,%xmm4 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - xorps %xmm6,%xmm1 - movups %xmm11,48(%rsi) - xorps %xmm7,%xmm0 - movups %xmm1,64(%rsi) - movups %xmm0,80(%rsi) - leaq 96(%rsi),%rsi - movl %r10d,%eax - subq $6,%rdx - jnc .Lctr32_loop6 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 208-128(%rcx),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 224-128(%rcx),%xmm0 + +.Lctr32_enc_done: + movdqu 16(%rdi),%xmm11 + pxor %xmm0,%xmm10 + movdqu 32(%rdi),%xmm12 + pxor %xmm0,%xmm11 + movdqu 48(%rdi),%xmm13 + pxor %xmm0,%xmm12 + movdqu 64(%rdi),%xmm14 + pxor %xmm0,%xmm13 + movdqu 80(%rdi),%xmm15 + pxor %xmm0,%xmm14 +.byte 102,15,56,220,209 + pxor %xmm0,%xmm15 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movdqu 96(%rdi),%xmm1 + +.byte 102,65,15,56,221,210 + pxor %xmm0,%xmm1 + movdqu 112(%rdi),%xmm10 + leaq 128(%rdi),%rdi +.byte 102,65,15,56,221,219 + pxor %xmm0,%xmm10 + movdqa 0(%rsp),%xmm11 +.byte 102,65,15,56,221,228 + movdqa 16(%rsp),%xmm12 +.byte 102,65,15,56,221,237 + movdqa 32(%rsp),%xmm13 +.byte 102,65,15,56,221,246 + movdqa 48(%rsp),%xmm14 +.byte 102,65,15,56,221,255 + movdqa 64(%rsp),%xmm15 +.byte 102,68,15,56,221,193 + movdqa 80(%rsp),%xmm0 +.byte 102,69,15,56,221,202 + movups 16-128(%rcx),%xmm1 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm0,%xmm7 + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + + subq $8,%rdx + jnc .Lctr32_loop8 - addq $6,%rdx + addq $8,%rdx jz .Lctr32_done - movq %r11,%rcx - leal 1(%rax,%rax,1),%eax + leaq -128(%rcx),%rcx .Lctr32_tail: - por %xmm14,%xmm2 - movups (%rdi),%xmm8 - cmpq $2,%rdx - jb .Lctr32_one + leaq 16(%rcx),%rcx + cmpq $4,%rdx + jb .Lctr32_loop3 + je .Lctr32_loop4 - por %xmm14,%xmm3 - movups 16(%rdi),%xmm9 - je .Lctr32_two + movdqa 96(%rsp),%xmm8 + pxor %xmm9,%xmm9 - pshufd $192,%xmm13,%xmm5 - por %xmm14,%xmm4 - movups 32(%rdi),%xmm10 - cmpq $4,%rdx - jb .Lctr32_three + movups 16(%rcx),%xmm0 +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx +.byte 102,15,56,220,217 + shrl $1,%eax +.byte 102,15,56,220,225 + decl %eax +.byte 102,15,56,220,233 + movups (%rdi),%xmm10 +.byte 102,15,56,220,241 + movups 16(%rdi),%xmm11 +.byte 102,15,56,220,249 + movups 32(%rdi),%xmm12 +.byte 102,68,15,56,220,193 + movups 16(%rcx),%xmm1 - pshufd $128,%xmm13,%xmm6 - por %xmm14,%xmm5 - movups 48(%rdi),%xmm11 - je .Lctr32_four + call .Lenc_loop8_enter - por %xmm14,%xmm6 - xorps %xmm7,%xmm7 + movdqu 48(%rdi),%xmm13 + pxor %xmm10,%xmm2 + movdqu 64(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm10,%xmm6 + movdqu %xmm5,48(%rsi) + movdqu %xmm6,64(%rsi) + cmpq $6,%rdx + jb .Lctr32_done - call _aesni_encrypt6 + movups 80(%rdi),%xmm11 + xorps %xmm11,%xmm7 + movups %xmm7,80(%rsi) + je .Lctr32_done - movups 64(%rdi),%xmm1 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - xorps %xmm6,%xmm1 - movups %xmm11,48(%rsi) - movups %xmm1,64(%rsi) + movups 96(%rdi),%xmm12 + xorps %xmm12,%xmm8 + movups %xmm8,96(%rsi) + jmp .Lctr32_done + +.align 32 +.Lctr32_loop4: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx),%xmm1 + decl %eax + jnz .Lctr32_loop4 +.byte 102,15,56,221,209 + movups (%rdi),%xmm10 +.byte 102,15,56,221,217 + movups 16(%rdi),%xmm11 +.byte 102,15,56,221,225 + movups 32(%rdi),%xmm12 +.byte 102,15,56,221,233 + movups 48(%rdi),%xmm13 + + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm4,32(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm5,48(%rsi) + jmp .Lctr32_done + +.align 32 +.Lctr32_loop3: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx),%xmm1 + decl %eax + jnz .Lctr32_loop3 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 +.byte 102,15,56,221,225 + + movups (%rdi),%xmm10 + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + cmpq $2,%rdx + jb .Lctr32_done + + movups 16(%rdi),%xmm11 + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + je .Lctr32_done + + movups 32(%rdi),%xmm12 + xorps %xmm12,%xmm4 + movups %xmm4,32(%rsi) jmp .Lctr32_done .align 16 .Lctr32_one_shortcut: movups (%r8),%xmm2 - movups (%rdi),%xmm8 + movups (%rdi),%xmm10 movl 240(%rcx),%eax -.Lctr32_one: movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -1129,51 +1342,26 @@ aesni_ctr32_encrypt_blocks: leaq 16(%rcx),%rcx jnz .Loop_enc1_7 .byte 102,15,56,221,209 - xorps %xmm2,%xmm8 - movups %xmm8,(%rsi) - jmp .Lctr32_done - -.align 16 -.Lctr32_two: - xorps %xmm4,%xmm4 - call _aesni_encrypt3 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - movups %xmm9,16(%rsi) - jmp .Lctr32_done - -.align 16 -.Lctr32_three: - call _aesni_encrypt3 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - movups %xmm10,32(%rsi) + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) jmp .Lctr32_done .align 16 -.Lctr32_four: - call _aesni_encrypt4 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - movups %xmm11,48(%rsi) - .Lctr32_done: + leaq (%rbp),%rsp + popq %rbp +.Lctr32_epilogue: .byte 0xf3,0xc3 .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks .globl aesni_xts_encrypt .type aesni_xts_encrypt,@function .align 16 aesni_xts_encrypt: - leaq -104(%rsp),%rsp + leaq (%rsp),%rax + pushq %rbp + subq $112,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp movups (%r9),%xmm15 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -1188,228 +1376,266 @@ aesni_xts_encrypt: leaq 16(%r8),%r8 jnz .Loop_enc1_8 .byte 102,68,15,56,221,249 + movups (%rcx),%xmm0 movq %rcx,%r11 movl %r10d,%eax + shll $4,%r10d movq %rdx,%r9 andq $-16,%rdx + movups 16(%rcx,%r10,1),%xmm1 + movl %eax,%r10d + movdqa .Lxts_magic(%rip),%xmm8 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pshufd $95,%xmm15,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm10 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm11 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm12 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 paddq %xmm15,%xmm15 pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 + pxor %xmm0,%xmm14 pxor %xmm9,%xmm15 + movaps %xmm1,96(%rsp) + subq $96,%rdx jc .Lxts_enc_short shrl $1,%eax - subl $1,%eax + subl $3,%eax + movups 16(%r11),%xmm1 movl %eax,%r10d + leaq .Lxts_magic(%rip),%r8 jmp .Lxts_enc_grandloop -.align 16 +.align 32 .Lxts_enc_grandloop: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu 0(%rdi),%xmm2 - pand %xmm8,%xmm9 + movdqa %xmm0,%xmm8 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 + movdqu 32(%rdi),%xmm4 pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 +.byte 102,15,56,220,209 + movdqu 48(%rdi),%xmm5 pxor %xmm12,%xmm4 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi +.byte 102,15,56,220,217 + movdqu 64(%rdi),%xmm6 pxor %xmm13,%xmm5 - movups (%r11),%xmm0 +.byte 102,15,56,220,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - - +.byte 102,15,56,220,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 - movups 16(%r11),%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm0,%xmm3 + pxor %xmm9,%xmm10 +.byte 102,15,56,220,241 + pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) -.byte 102,15,56,220,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 +.byte 102,15,56,220,249 + movups 48(%r11),%xmm1 + +.byte 102,15,56,220,208 + pxor %xmm9,%xmm12 movdqa %xmm11,16(%rsp) -.byte 102,15,56,220,217 - pxor %xmm0,%xmm5 +.byte 102,15,56,220,216 + pxor %xmm9,%xmm13 movdqa %xmm12,32(%rsp) -.byte 102,15,56,220,225 - pxor %xmm0,%xmm6 - movdqa %xmm13,48(%rsp) -.byte 102,15,56,220,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax +.byte 102,15,56,220,224 + pxor %xmm9,%xmm14 +.byte 102,15,56,220,232 + pxor %xmm9,%xmm8 movdqa %xmm14,64(%rsp) -.byte 102,15,56,220,241 - movdqa %xmm15,80(%rsp) -.byte 102,15,56,220,249 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - jmp .Lxts_enc_loop6_enter - -.align 16 +.byte 102,15,56,220,240 + movdqa %xmm8,80(%rsp) +.byte 102,15,56,220,248 + movups 64(%r11),%xmm0 + leaq 64(%r11),%rcx + pshufd $95,%xmm15,%xmm9 + jmp .Lxts_enc_loop6 +.align 32 .Lxts_enc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.Lxts_enc_loop6_enter: movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 movups (%rcx),%xmm0 + decl %eax jnz .Lxts_enc_loop6 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - paddq %xmm15,%xmm15 + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,220,209 - pand %xmm8,%xmm9 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 .byte 102,15,56,220,217 - pcmpgtd %xmm15,%xmm14 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 .byte 102,15,56,220,225 - pxor %xmm9,%xmm15 .byte 102,15,56,220,233 + pxor %xmm14,%xmm15 .byte 102,15,56,220,241 + movaps %xmm10,%xmm11 .byte 102,15,56,220,249 movups 16(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,220,208 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm10 + psrad $31,%xmm14 .byte 102,15,56,220,216 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,220,224 - pxor %xmm9,%xmm15 .byte 102,15,56,220,232 + pxor %xmm14,%xmm15 .byte 102,15,56,220,240 + movaps %xmm11,%xmm12 .byte 102,15,56,220,248 movups 32(%rcx),%xmm0 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm11 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,220,209 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 .byte 102,15,56,220,217 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,220,225 - pxor %xmm9,%xmm15 + movdqa %xmm13,48(%rsp) .byte 102,15,56,220,233 + pxor %xmm14,%xmm15 .byte 102,15,56,220,241 + movaps %xmm12,%xmm13 .byte 102,15,56,220,249 + movups 48(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm12 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 102,15,56,220,216 paddq %xmm15,%xmm15 -.byte 102,15,56,221,208 - pand %xmm8,%xmm9 -.byte 102,15,56,221,216 - pcmpgtd %xmm15,%xmm14 -.byte 102,15,56,221,224 - pxor %xmm9,%xmm15 -.byte 102,15,56,221,232 -.byte 102,15,56,221,240 -.byte 102,15,56,221,248 + pand %xmm8,%xmm14 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + pxor %xmm14,%xmm15 +.byte 102,15,56,220,240 + movaps %xmm13,%xmm14 +.byte 102,15,56,220,248 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm13 + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,220,217 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 + psrad $31,%xmm9 +.byte 102,15,56,221,84,36,0 paddq %xmm15,%xmm15 - xorps 0(%rsp),%xmm2 pand %xmm8,%xmm9 - xorps 16(%rsp),%xmm3 - pcmpgtd %xmm15,%xmm14 +.byte 102,15,56,221,92,36,16 +.byte 102,15,56,221,100,36,32 pxor %xmm9,%xmm15 - - xorps 32(%rsp),%xmm4 - movups %xmm2,0(%rsi) - xorps 48(%rsp),%xmm5 - movups %xmm3,16(%rsi) - xorps 64(%rsp),%xmm6 - movups %xmm4,32(%rsi) - xorps 80(%rsp),%xmm7 - movups %xmm5,48(%rsi) +.byte 102,15,56,221,108,36,48 +.byte 102,15,56,221,116,36,64 +.byte 102,15,56,221,124,36,80 movl %r10d,%eax - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) subq $96,%rdx jnc .Lxts_enc_grandloop - leal 3(%rax,%rax,1),%eax + leal 7(%rax,%rax,1),%eax movq %r11,%rcx movl %eax,%r10d .Lxts_enc_short: + pxor %xmm0,%xmm10 addq $96,%rdx jz .Lxts_enc_done + pxor %xmm0,%xmm11 cmpq $32,%rdx jb .Lxts_enc_one + pxor %xmm0,%xmm12 je .Lxts_enc_two + pxor %xmm0,%xmm13 cmpq $64,%rdx jb .Lxts_enc_three + pxor %xmm0,%xmm14 je .Lxts_enc_four - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu (%rdi),%xmm2 - pand %xmm8,%xmm9 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 movdqu 48(%rdi),%xmm5 @@ -1512,15 +1738,15 @@ aesni_xts_encrypt: call _aesni_encrypt4 - xorps %xmm10,%xmm2 - movdqa %xmm15,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) + pxor %xmm10,%xmm2 + movdqa %xmm14,%xmm10 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) leaq 64(%rsi),%rsi jmp .Lxts_enc_done @@ -1561,7 +1787,8 @@ aesni_xts_encrypt: movups %xmm2,-16(%rsi) .Lxts_enc_ret: - leaq 104(%rsp),%rsp + leaq (%rbp),%rsp + popq %rbp .Lxts_enc_epilogue: .byte 0xf3,0xc3 .size aesni_xts_encrypt,.-aesni_xts_encrypt @@ -1569,7 +1796,11 @@ aesni_xts_encrypt: .type aesni_xts_decrypt,@function .align 16 aesni_xts_decrypt: - leaq -104(%rsp),%rsp + leaq (%rsp),%rax + pushq %rbp + subq $112,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp movups (%r9),%xmm15 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -1590,228 +1821,266 @@ aesni_xts_decrypt: shlq $4,%rax subq %rax,%rdx + movups (%rcx),%xmm0 movq %rcx,%r11 movl %r10d,%eax + shll $4,%r10d movq %rdx,%r9 andq $-16,%rdx + movups 16(%rcx,%r10,1),%xmm1 + movl %eax,%r10d + movdqa .Lxts_magic(%rip),%xmm8 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pshufd $95,%xmm15,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm10 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm11 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm12 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 paddq %xmm15,%xmm15 pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 + pxor %xmm0,%xmm14 pxor %xmm9,%xmm15 + movaps %xmm1,96(%rsp) + subq $96,%rdx jc .Lxts_dec_short shrl $1,%eax - subl $1,%eax + subl $3,%eax + movups 16(%r11),%xmm1 movl %eax,%r10d + leaq .Lxts_magic(%rip),%r8 jmp .Lxts_dec_grandloop -.align 16 +.align 32 .Lxts_dec_grandloop: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu 0(%rdi),%xmm2 - pand %xmm8,%xmm9 + movdqa %xmm0,%xmm8 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 + movdqu 32(%rdi),%xmm4 pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 +.byte 102,15,56,222,209 + movdqu 48(%rdi),%xmm5 pxor %xmm12,%xmm4 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi +.byte 102,15,56,222,217 + movdqu 64(%rdi),%xmm6 pxor %xmm13,%xmm5 - movups (%r11),%xmm0 +.byte 102,15,56,222,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - - +.byte 102,15,56,222,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 - movups 16(%r11),%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm0,%xmm3 + pxor %xmm9,%xmm10 +.byte 102,15,56,222,241 + pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) -.byte 102,15,56,222,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 +.byte 102,15,56,222,249 + movups 48(%r11),%xmm1 + +.byte 102,15,56,222,208 + pxor %xmm9,%xmm12 movdqa %xmm11,16(%rsp) -.byte 102,15,56,222,217 - pxor %xmm0,%xmm5 +.byte 102,15,56,222,216 + pxor %xmm9,%xmm13 movdqa %xmm12,32(%rsp) -.byte 102,15,56,222,225 - pxor %xmm0,%xmm6 - movdqa %xmm13,48(%rsp) -.byte 102,15,56,222,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax +.byte 102,15,56,222,224 + pxor %xmm9,%xmm14 +.byte 102,15,56,222,232 + pxor %xmm9,%xmm8 movdqa %xmm14,64(%rsp) -.byte 102,15,56,222,241 - movdqa %xmm15,80(%rsp) -.byte 102,15,56,222,249 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - jmp .Lxts_dec_loop6_enter - -.align 16 +.byte 102,15,56,222,240 + movdqa %xmm8,80(%rsp) +.byte 102,15,56,222,248 + movups 64(%r11),%xmm0 + leaq 64(%r11),%rcx + pshufd $95,%xmm15,%xmm9 + jmp .Lxts_dec_loop6 +.align 32 .Lxts_dec_loop6: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -.Lxts_dec_loop6_enter: movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 movups (%rcx),%xmm0 + decl %eax jnz .Lxts_dec_loop6 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - paddq %xmm15,%xmm15 + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,222,209 - pand %xmm8,%xmm9 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 .byte 102,15,56,222,217 - pcmpgtd %xmm15,%xmm14 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 .byte 102,15,56,222,225 - pxor %xmm9,%xmm15 .byte 102,15,56,222,233 + pxor %xmm14,%xmm15 .byte 102,15,56,222,241 + movaps %xmm10,%xmm11 .byte 102,15,56,222,249 movups 16(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,222,208 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm10 + psrad $31,%xmm14 .byte 102,15,56,222,216 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,222,224 - pxor %xmm9,%xmm15 .byte 102,15,56,222,232 + pxor %xmm14,%xmm15 .byte 102,15,56,222,240 + movaps %xmm11,%xmm12 .byte 102,15,56,222,248 movups 32(%rcx),%xmm0 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm11 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,222,209 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 .byte 102,15,56,222,217 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,222,225 - pxor %xmm9,%xmm15 + movdqa %xmm13,48(%rsp) .byte 102,15,56,222,233 + pxor %xmm14,%xmm15 .byte 102,15,56,222,241 + movaps %xmm12,%xmm13 .byte 102,15,56,222,249 + movups 48(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm12 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 102,15,56,222,216 paddq %xmm15,%xmm15 -.byte 102,15,56,223,208 - pand %xmm8,%xmm9 -.byte 102,15,56,223,216 - pcmpgtd %xmm15,%xmm14 -.byte 102,15,56,223,224 - pxor %xmm9,%xmm15 -.byte 102,15,56,223,232 -.byte 102,15,56,223,240 -.byte 102,15,56,223,248 + pand %xmm8,%xmm14 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + pxor %xmm14,%xmm15 +.byte 102,15,56,222,240 + movaps %xmm13,%xmm14 +.byte 102,15,56,222,248 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm13 + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,222,217 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 + psrad $31,%xmm9 +.byte 102,15,56,223,84,36,0 paddq %xmm15,%xmm15 - xorps 0(%rsp),%xmm2 pand %xmm8,%xmm9 - xorps 16(%rsp),%xmm3 - pcmpgtd %xmm15,%xmm14 +.byte 102,15,56,223,92,36,16 +.byte 102,15,56,223,100,36,32 pxor %xmm9,%xmm15 - - xorps 32(%rsp),%xmm4 - movups %xmm2,0(%rsi) - xorps 48(%rsp),%xmm5 - movups %xmm3,16(%rsi) - xorps 64(%rsp),%xmm6 - movups %xmm4,32(%rsi) - xorps 80(%rsp),%xmm7 - movups %xmm5,48(%rsi) +.byte 102,15,56,223,108,36,48 +.byte 102,15,56,223,116,36,64 +.byte 102,15,56,223,124,36,80 movl %r10d,%eax - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) subq $96,%rdx jnc .Lxts_dec_grandloop - leal 3(%rax,%rax,1),%eax + leal 7(%rax,%rax,1),%eax movq %r11,%rcx movl %eax,%r10d .Lxts_dec_short: + pxor %xmm0,%xmm10 + pxor %xmm0,%xmm11 addq $96,%rdx jz .Lxts_dec_done + pxor %xmm0,%xmm12 cmpq $32,%rdx jb .Lxts_dec_one + pxor %xmm0,%xmm13 je .Lxts_dec_two + pxor %xmm0,%xmm14 cmpq $64,%rdx jb .Lxts_dec_three je .Lxts_dec_four - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu (%rdi),%xmm2 - pand %xmm8,%xmm9 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 movdqu 48(%rdi),%xmm5 @@ -1904,7 +2173,7 @@ aesni_xts_decrypt: xorps %xmm10,%xmm2 movdqa %xmm13,%xmm10 xorps %xmm11,%xmm3 - movdqa %xmm15,%xmm11 + movdqa %xmm14,%xmm11 xorps %xmm12,%xmm4 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) @@ -1914,14 +2183,8 @@ aesni_xts_decrypt: .align 16 .Lxts_dec_four: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movups (%rdi),%xmm2 - pand %xmm8,%xmm9 movups 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movups 32(%rdi),%xmm4 xorps %xmm10,%xmm2 movups 48(%rdi),%xmm5 @@ -1932,16 +2195,16 @@ aesni_xts_decrypt: call _aesni_decrypt4 - xorps %xmm10,%xmm2 + pxor %xmm10,%xmm2 movdqa %xmm14,%xmm10 - xorps %xmm11,%xmm3 + pxor %xmm11,%xmm3 movdqa %xmm15,%xmm11 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) leaq 64(%rsi),%rsi jmp .Lxts_dec_done @@ -2001,7 +2264,8 @@ aesni_xts_decrypt: movups %xmm2,(%rsi) .Lxts_dec_ret: - leaq 104(%rsp),%rsp + leaq (%rbp),%rsp + popq %rbp .Lxts_dec_epilogue: .byte 0xf3,0xc3 .size aesni_xts_decrypt,.-aesni_xts_decrypt @@ -2068,149 +2332,324 @@ aesni_cbc_encrypt: .align 16 .Lcbc_decrypt: - movups (%r8),%xmm9 + leaq (%rsp),%rax + pushq %rbp + subq $16,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + movups (%r8),%xmm10 movl %r10d,%eax - cmpq $112,%rdx + cmpq $80,%rdx jbe .Lcbc_dec_tail - shrl $1,%r10d + + movups (%rcx),%xmm0 + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 + cmpq $112,%rdx + jbe .Lcbc_dec_six_or_seven + subq $112,%rdx - movl %r10d,%eax - movaps %xmm9,-24(%rsp) + leaq 112(%rcx),%rcx jmp .Lcbc_dec_loop8_enter .align 16 .Lcbc_dec_loop8: - movaps %xmm0,-24(%rsp) movups %xmm9,(%rsi) leaq 16(%rsi),%rsi .Lcbc_dec_loop8_enter: - movups (%rcx),%xmm0 - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 16(%rcx),%xmm1 + movdqu 96(%rdi),%xmm8 + pxor %xmm0,%xmm2 + movdqu 112(%rdi),%xmm9 + pxor %xmm0,%xmm3 + movups 16-112(%rcx),%xmm1 + pxor %xmm0,%xmm4 + xorq %r11,%r11 + cmpq $112,%rdx + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 - leaq 32(%rcx),%rcx - movdqu 32(%rdi),%xmm4 - xorps %xmm0,%xmm2 - movdqu 48(%rdi),%xmm5 - xorps %xmm0,%xmm3 - movdqu 64(%rdi),%xmm6 .byte 102,15,56,222,209 - pxor %xmm0,%xmm4 - movdqu 80(%rdi),%xmm7 + pxor %xmm0,%xmm9 + movups 32-112(%rcx),%xmm0 .byte 102,15,56,222,217 - pxor %xmm0,%xmm5 - movdqu 96(%rdi),%xmm8 .byte 102,15,56,222,225 - pxor %xmm0,%xmm6 - movdqu 112(%rdi),%xmm9 .byte 102,15,56,222,233 - pxor %xmm0,%xmm7 - decl %eax .byte 102,15,56,222,241 - pxor %xmm0,%xmm8 .byte 102,15,56,222,249 - pxor %xmm0,%xmm9 - movups (%rcx),%xmm0 + setnc %r11b .byte 102,68,15,56,222,193 + shlq $7,%r11 .byte 102,68,15,56,222,201 - movups 16(%rcx),%xmm1 - - call .Ldec_loop8_enter + addq %rdi,%r11 + movups 48-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 64-112(%rcx),%xmm0 +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 80-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 96-112(%rcx),%xmm0 +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 112-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 128-112(%rcx),%xmm0 +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 144-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 160-112(%rcx),%xmm0 + cmpl $11,%eax + jb .Lcbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 176-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 192-112(%rcx),%xmm0 + je .Lcbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 208-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 224-112(%rcx),%xmm0 +.Lcbc_dec_done: +.byte 102,15,56,222,209 + pxor %xmm0,%xmm10 +.byte 102,15,56,222,217 + pxor %xmm0,%xmm11 +.byte 102,15,56,222,225 + pxor %xmm0,%xmm12 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm13 +.byte 102,15,56,222,241 + pxor %xmm0,%xmm14 +.byte 102,15,56,222,249 + pxor %xmm0,%xmm15 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movdqu 80(%rdi),%xmm1 + +.byte 102,65,15,56,223,210 + movdqu 96(%rdi),%xmm10 + pxor %xmm0,%xmm1 +.byte 102,65,15,56,223,219 + pxor %xmm0,%xmm10 + movdqu 112(%rdi),%xmm0 + leaq 128(%rdi),%rdi +.byte 102,65,15,56,223,228 + movdqu 0(%r11),%xmm11 +.byte 102,65,15,56,223,237 + movdqu 16(%r11),%xmm12 +.byte 102,65,15,56,223,246 + movdqu 32(%r11),%xmm13 +.byte 102,65,15,56,223,255 + movdqu 48(%r11),%xmm14 +.byte 102,68,15,56,223,193 + movdqu 64(%r11),%xmm15 +.byte 102,69,15,56,223,202 + movdqa %xmm0,%xmm10 + movdqu 80(%r11),%xmm1 + movups -112(%rcx),%xmm0 - movups (%rdi),%xmm1 - movups 16(%rdi),%xmm0 - xorps -24(%rsp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%rdi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%rdi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%rdi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%rdi),%xmm0 - xorps %xmm1,%xmm7 - movups 96(%rdi),%xmm1 - xorps %xmm0,%xmm8 - movups 112(%rdi),%xmm0 - xorps %xmm1,%xmm9 movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 movups %xmm5,48(%rsi) - movl %r10d,%eax + movdqa %xmm14,%xmm5 movups %xmm6,64(%rsi) - movq %r11,%rcx + movdqa %xmm15,%xmm6 movups %xmm7,80(%rsi) - leaq 128(%rdi),%rdi + movdqa %xmm1,%xmm7 movups %xmm8,96(%rsi) leaq 112(%rsi),%rsi + subq $128,%rdx ja .Lcbc_dec_loop8 movaps %xmm9,%xmm2 - movaps %xmm0,%xmm9 + leaq -112(%rcx),%rcx addq $112,%rdx jle .Lcbc_dec_tail_collected - movups %xmm2,(%rsi) - leal 1(%r10,%r10,1),%eax + movups %xmm9,(%rsi) leaq 16(%rsi),%rsi + cmpq $80,%rdx + jbe .Lcbc_dec_tail + + movaps %xmm11,%xmm2 +.Lcbc_dec_six_or_seven: + cmpq $96,%rdx + ja .Lcbc_dec_seven + + movaps %xmm7,%xmm8 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm8,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + leaq 80(%rsi),%rsi + movdqa %xmm7,%xmm2 + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_seven: + movups 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 + call _aesni_decrypt8 + movups 80(%rdi),%xmm9 + pxor %xmm10,%xmm2 + movups 96(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + pxor %xmm9,%xmm8 + movdqu %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + movdqa %xmm8,%xmm2 + jmp .Lcbc_dec_tail_collected + .Lcbc_dec_tail: movups (%rdi),%xmm2 - movaps %xmm2,%xmm8 - cmpq $16,%rdx + subq $16,%rdx jbe .Lcbc_dec_one movups 16(%rdi),%xmm3 - movaps %xmm3,%xmm7 - cmpq $32,%rdx + movaps %xmm2,%xmm11 + subq $16,%rdx jbe .Lcbc_dec_two movups 32(%rdi),%xmm4 - movaps %xmm4,%xmm6 - cmpq $48,%rdx + movaps %xmm3,%xmm12 + subq $16,%rdx jbe .Lcbc_dec_three movups 48(%rdi),%xmm5 - cmpq $64,%rdx + movaps %xmm4,%xmm13 + subq $16,%rdx jbe .Lcbc_dec_four movups 64(%rdi),%xmm6 - cmpq $80,%rdx - jbe .Lcbc_dec_five - - movups 80(%rdi),%xmm7 - cmpq $96,%rdx - jbe .Lcbc_dec_six - - movups 96(%rdi),%xmm8 - movaps %xmm9,-24(%rsp) - call _aesni_decrypt8 - movups (%rdi),%xmm1 - movups 16(%rdi),%xmm0 - xorps -24(%rsp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%rdi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%rdi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%rdi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%rdi),%xmm0 - xorps %xmm1,%xmm7 - movups 96(%rdi),%xmm9 - xorps %xmm0,%xmm8 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - leaq 96(%rsi),%rsi - movaps %xmm8,%xmm2 - subq $112,%rdx + movaps %xmm5,%xmm14 + movaps %xmm6,%xmm15 + xorps %xmm7,%xmm7 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm15,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + leaq 64(%rsi),%rsi + movdqa %xmm6,%xmm2 + subq $16,%rdx jmp .Lcbc_dec_tail_collected + .align 16 .Lcbc_dec_one: + movaps %xmm2,%xmm11 movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -2222,111 +2661,69 @@ aesni_cbc_encrypt: leaq 16(%rcx),%rcx jnz .Loop_dec1_16 .byte 102,15,56,223,209 - xorps %xmm9,%xmm2 - movaps %xmm8,%xmm9 - subq $16,%rdx + xorps %xmm10,%xmm2 + movaps %xmm11,%xmm10 jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_two: + movaps %xmm3,%xmm12 xorps %xmm4,%xmm4 call _aesni_decrypt3 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - movaps %xmm7,%xmm9 - movaps %xmm3,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm12,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + movdqa %xmm3,%xmm2 leaq 16(%rsi),%rsi - subq $32,%rdx jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_three: + movaps %xmm4,%xmm13 call _aesni_decrypt3 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%rsi) - movaps %xmm6,%xmm9 - movaps %xmm4,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm13,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + movdqa %xmm4,%xmm2 leaq 32(%rsi),%rsi - subq $48,%rdx jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_four: + movaps %xmm5,%xmm14 call _aesni_decrypt4 - xorps %xmm9,%xmm2 - movups 48(%rdi),%xmm9 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%rsi) - xorps %xmm6,%xmm5 - movups %xmm4,32(%rsi) - movaps %xmm5,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm14,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + movdqa %xmm5,%xmm2 leaq 48(%rsi),%rsi - subq $64,%rdx - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_five: - xorps %xmm7,%xmm7 - call _aesni_decrypt6 - movups 16(%rdi),%xmm1 - movups 32(%rdi),%xmm0 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - xorps %xmm1,%xmm4 - movups 48(%rdi),%xmm1 - xorps %xmm0,%xmm5 - movups 64(%rdi),%xmm9 - xorps %xmm1,%xmm6 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - movaps %xmm6,%xmm2 - subq $80,%rdx - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_six: - call _aesni_decrypt6 - movups 16(%rdi),%xmm1 - movups 32(%rdi),%xmm0 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - xorps %xmm1,%xmm4 - movups 48(%rdi),%xmm1 - xorps %xmm0,%xmm5 - movups 64(%rdi),%xmm0 - xorps %xmm1,%xmm6 - movups 80(%rdi),%xmm9 - xorps %xmm0,%xmm7 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - movaps %xmm7,%xmm2 - subq $96,%rdx jmp .Lcbc_dec_tail_collected + .align 16 .Lcbc_dec_tail_collected: + movups %xmm10,(%r8) andq $15,%rdx - movups %xmm9,(%r8) jnz .Lcbc_dec_tail_partial movups %xmm2,(%rsi) jmp .Lcbc_dec_ret .align 16 .Lcbc_dec_tail_partial: - movaps %xmm2,-24(%rsp) + movaps %xmm2,(%rsp) movq $16,%rcx movq %rsi,%rdi subq %rdx,%rcx - leaq -24(%rsp),%rsi + leaq (%rsp),%rsi .long 0x9066A4F3 .Lcbc_dec_ret: + leaq (%rbp),%rsp + popq %rbp .Lcbc_ret: .byte 0xf3,0xc3 .size aesni_cbc_encrypt,.-aesni_cbc_encrypt @@ -2569,6 +2966,8 @@ __aesni_set_encrypt_key: .long 1,0,0,0 .Lxts_magic: .long 0x87,0,1,0 +.Lincrement1: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 diff --git a/lib/accelerated/x86/elf/padlock-x86-64.s b/lib/accelerated/x86/elf/padlock-x86-64.s index 4709ac2..2ac113d 100644 --- a/lib/accelerated/x86/elf/padlock-x86-64.s +++ b/lib/accelerated/x86/elf/padlock-x86-64.s @@ -595,6 +595,468 @@ padlock_cbc_encrypt: popq %rbp .byte 0xf3,0xc3 .size padlock_cbc_encrypt,.-padlock_cbc_encrypt +.globl padlock_cfb_encrypt +.type padlock_cfb_encrypt,@function +.align 16 +padlock_cfb_encrypt: + pushq %rbp + pushq %rbx + + xorl %eax,%eax + testq $15,%rdx + jnz .Lcfb_abort + testq $15,%rcx + jnz .Lcfb_abort + leaq .Lpadlock_saved_context(%rip),%rax + pushf + cld + call _padlock_verify_ctx + leaq 16(%rdx),%rdx + xorl %eax,%eax + xorl %ebx,%ebx + testl $32,(%rdx) + jnz .Lcfb_aligned + testq $15,%rdi + setz %al + testq $15,%rsi + setz %bl + testl %ebx,%eax + jnz .Lcfb_aligned + negq %rax + movq $512,%rbx + notq %rax + leaq (%rsp),%rbp + cmpq %rbx,%rcx + cmovcq %rcx,%rbx + andq %rbx,%rax + movq %rcx,%rbx + negq %rax + andq $512-1,%rbx + leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + jmp .Lcfb_loop +.align 16 +.Lcfb_loop: + cmpq %rcx,%rbx + cmovaq %rcx,%rbx + movq %rdi,%r8 + movq %rsi,%r9 + movq %rcx,%r10 + movq %rbx,%rcx + movq %rbx,%r11 + testq $15,%rdi + cmovnzq %rsp,%rdi + testq $15,%rsi + jz .Lcfb_inp_aligned + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi + movq %rbx,%rcx + movq %rdi,%rsi +.Lcfb_inp_aligned: + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,224 + movdqa (%rax),%xmm0 + movdqa %xmm0,-16(%rdx) + movq %r8,%rdi + movq %r11,%rbx + testq $15,%rdi + jz .Lcfb_out_aligned + movq %rbx,%rcx + leaq (%rsp),%rsi + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi +.Lcfb_out_aligned: + movq %r9,%rsi + movq %r10,%rcx + addq %rbx,%rdi + addq %rbx,%rsi + subq %rbx,%rcx + movq $512,%rbx + jnz .Lcfb_loop + cmpq %rbp,%rsp + je .Lcfb_done + + pxor %xmm0,%xmm0 + leaq (%rsp),%rax +.Lcfb_bzero: + movaps %xmm0,(%rax) + leaq 16(%rax),%rax + cmpq %rax,%rbp + ja .Lcfb_bzero + +.Lcfb_done: + leaq (%rbp),%rsp + jmp .Lcfb_exit + +.align 16 +.Lcfb_aligned: + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,224 + movdqa (%rax),%xmm0 + movdqa %xmm0,-16(%rdx) +.Lcfb_exit: + movl $1,%eax + leaq 8(%rsp),%rsp +.Lcfb_abort: + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size padlock_cfb_encrypt,.-padlock_cfb_encrypt +.globl padlock_ofb_encrypt +.type padlock_ofb_encrypt,@function +.align 16 +padlock_ofb_encrypt: + pushq %rbp + pushq %rbx + + xorl %eax,%eax + testq $15,%rdx + jnz .Lofb_abort + testq $15,%rcx + jnz .Lofb_abort + leaq .Lpadlock_saved_context(%rip),%rax + pushf + cld + call _padlock_verify_ctx + leaq 16(%rdx),%rdx + xorl %eax,%eax + xorl %ebx,%ebx + testl $32,(%rdx) + jnz .Lofb_aligned + testq $15,%rdi + setz %al + testq $15,%rsi + setz %bl + testl %ebx,%eax + jnz .Lofb_aligned + negq %rax + movq $512,%rbx + notq %rax + leaq (%rsp),%rbp + cmpq %rbx,%rcx + cmovcq %rcx,%rbx + andq %rbx,%rax + movq %rcx,%rbx + negq %rax + andq $512-1,%rbx + leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + jmp .Lofb_loop +.align 16 +.Lofb_loop: + cmpq %rcx,%rbx + cmovaq %rcx,%rbx + movq %rdi,%r8 + movq %rsi,%r9 + movq %rcx,%r10 + movq %rbx,%rcx + movq %rbx,%r11 + testq $15,%rdi + cmovnzq %rsp,%rdi + testq $15,%rsi + jz .Lofb_inp_aligned + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi + movq %rbx,%rcx + movq %rdi,%rsi +.Lofb_inp_aligned: + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,232 + movdqa (%rax),%xmm0 + movdqa %xmm0,-16(%rdx) + movq %r8,%rdi + movq %r11,%rbx + testq $15,%rdi + jz .Lofb_out_aligned + movq %rbx,%rcx + leaq (%rsp),%rsi + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi +.Lofb_out_aligned: + movq %r9,%rsi + movq %r10,%rcx + addq %rbx,%rdi + addq %rbx,%rsi + subq %rbx,%rcx + movq $512,%rbx + jnz .Lofb_loop + cmpq %rbp,%rsp + je .Lofb_done + + pxor %xmm0,%xmm0 + leaq (%rsp),%rax +.Lofb_bzero: + movaps %xmm0,(%rax) + leaq 16(%rax),%rax + cmpq %rax,%rbp + ja .Lofb_bzero + +.Lofb_done: + leaq (%rbp),%rsp + jmp .Lofb_exit + +.align 16 +.Lofb_aligned: + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,232 + movdqa (%rax),%xmm0 + movdqa %xmm0,-16(%rdx) +.Lofb_exit: + movl $1,%eax + leaq 8(%rsp),%rsp +.Lofb_abort: + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size padlock_ofb_encrypt,.-padlock_ofb_encrypt +.globl padlock_ctr32_encrypt +.type padlock_ctr32_encrypt,@function +.align 16 +padlock_ctr32_encrypt: + pushq %rbp + pushq %rbx + + xorl %eax,%eax + testq $15,%rdx + jnz .Lctr32_abort + testq $15,%rcx + jnz .Lctr32_abort + leaq .Lpadlock_saved_context(%rip),%rax + pushf + cld + call _padlock_verify_ctx + leaq 16(%rdx),%rdx + xorl %eax,%eax + xorl %ebx,%ebx + testl $32,(%rdx) + jnz .Lctr32_aligned + testq $15,%rdi + setz %al + testq $15,%rsi + setz %bl + testl %ebx,%eax + jnz .Lctr32_aligned + negq %rax + movq $512,%rbx + notq %rax + leaq (%rsp),%rbp + cmpq %rbx,%rcx + cmovcq %rcx,%rbx + andq %rbx,%rax + movq %rcx,%rbx + negq %rax + andq $512-1,%rbx + leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx +.Lctr32_reenter: + movl -4(%rdx),%eax + bswapl %eax + negl %eax + andl $31,%eax + movq $512,%rbx + shll $4,%eax + cmovzq %rbx,%rax + cmpq %rax,%rcx + cmovaq %rax,%rbx + cmovbeq %rcx,%rbx + cmpq %rbx,%rcx + ja .Lctr32_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $32,%rax + movq $-32,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz .Lctr32_unaligned_tail + jmp .Lctr32_loop +.align 16 +.Lctr32_loop: + cmpq %rcx,%rbx + cmovaq %rcx,%rbx + movq %rdi,%r8 + movq %rsi,%r9 + movq %rcx,%r10 + movq %rbx,%rcx + movq %rbx,%r11 + testq $15,%rdi + cmovnzq %rsp,%rdi + testq $15,%rsi + jz .Lctr32_inp_aligned + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi + movq %rbx,%rcx + movq %rdi,%rsi +.Lctr32_inp_aligned: + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,216 + movl -4(%rdx),%eax + testl $4294901760,%eax + jnz .Lctr32_no_carry + bswapl %eax + addl $65536,%eax + bswapl %eax + movl %eax,-4(%rdx) +.Lctr32_no_carry: + movq %r8,%rdi + movq %r11,%rbx + testq $15,%rdi + jz .Lctr32_out_aligned + movq %rbx,%rcx + leaq (%rsp),%rsi + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi +.Lctr32_out_aligned: + movq %r9,%rsi + movq %r10,%rcx + addq %rbx,%rdi + addq %rbx,%rsi + subq %rbx,%rcx + movq $512,%rbx + jz .Lctr32_break + cmpq %rbx,%rcx + jae .Lctr32_loop + movq %rcx,%rbx + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $32,%rax + movq $-32,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jnz .Lctr32_loop +.Lctr32_unaligned_tail: + xorl %eax,%eax + cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp .Lctr32_loop +.align 16 +.Lctr32_break: + cmpq %rbp,%rsp + je .Lctr32_done + + pxor %xmm0,%xmm0 + leaq (%rsp),%rax +.Lctr32_bzero: + movaps %xmm0,(%rax) + leaq 16(%rax),%rax + cmpq %rax,%rbp + ja .Lctr32_bzero + +.Lctr32_done: + leaq (%rbp),%rsp + jmp .Lctr32_exit + +.align 16 +.Lctr32_aligned: + movl -4(%rdx),%eax + bswapl %eax + negl %eax + andl $65535,%eax + movq $1048576,%rbx + shll $4,%eax + cmovzq %rbx,%rax + cmpq %rax,%rcx + cmovaq %rax,%rbx + cmovbeq %rcx,%rbx + jbe .Lctr32_aligned_skip + +.Lctr32_aligned_loop: + movq %rcx,%r10 + movq %rbx,%rcx + movq %rbx,%r11 + + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,216 + + movl -4(%rdx),%eax + bswapl %eax + addl $65536,%eax + bswapl %eax + movl %eax,-4(%rdx) + + movq %r10,%rcx + subq %r11,%rcx + movq $1048576,%rbx + jz .Lctr32_exit + cmpq %rbx,%rcx + jae .Lctr32_aligned_loop + +.Lctr32_aligned_skip: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $32,%rbp + movq $32-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz .Lctr32_aligned_tail + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,216 + testq %rbp,%rbp + jz .Lctr32_exit + +.Lctr32_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp .Lctr32_loop +.Lctr32_exit: + movl $1,%eax + leaq 8(%rsp),%rsp +.Lctr32_abort: + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size padlock_ctr32_encrypt,.-padlock_ctr32_encrypt .byte 86,73,65,32,80,97,100,108,111,99,107,32,120,56,54,95,54,52,32,109,111,100,117,108,101,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 .data diff --git a/lib/accelerated/x86/elf/padlock-x86.s b/lib/accelerated/x86/elf/padlock-x86.s index ea982ec..2199255 100644 --- a/lib/accelerated/x86/elf/padlock-x86.s +++ b/lib/accelerated/x86/elf/padlock-x86.s @@ -187,16 +187,14 @@ padlock_ecb_encrypt: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $128,%ecx - jbe .L006ecb_short testl $32,(%edx) - jnz .L007ecb_aligned + jnz .L006ecb_aligned testl $15,%edi setz %al testl $15,%esi setz %bl testl %ebx,%eax - jnz .L007ecb_aligned + jnz .L006ecb_aligned negl %eax movl $512,%ebx notl %eax @@ -208,10 +206,28 @@ padlock_ecb_encrypt: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp - jmp .L008ecb_loop + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja .L007ecb_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $128,%eax + movl $-128,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz .L008ecb_unaligned_tail + jmp .L007ecb_loop .align 16 -.L008ecb_loop: +.L007ecb_loop: movl %edi,(%ebp) movl %esi,4(%ebp) movl %ecx,8(%ebp) @@ -236,8 +252,8 @@ padlock_ecb_encrypt: testl $15,%edi jz .L010ecb_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi .L010ecb_out_aligned: @@ -247,43 +263,75 @@ padlock_ecb_encrypt: addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz .L008ecb_loop + jz .L011ecb_break + cmpl %ebx,%ecx + jae .L007ecb_loop +.L008ecb_unaligned_tail: + xorl %eax,%eax cmpl %ebp,%esp - je .L011ecb_done + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L007ecb_loop +.align 16 +.L011ecb_break: + cmpl %ebp,%esp + je .L012ecb_done pxor %xmm0,%xmm0 leal (%esp),%eax -.L012ecb_bzero: +.L013ecb_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja .L012ecb_bzero -.L011ecb_done: + ja .L013ecb_bzero +.L012ecb_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp .L013ecb_exit + jmp .L014ecb_exit .align 16 -.L006ecb_short: +.L006ecb_aligned: + leal (%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp xorl %eax,%eax - leal -24(%esp),%ebp - subl %ecx,%eax - leal (%eax,%ebp,1),%esp - andl $-16,%esp - xorl %ebx,%ebx -.L014ecb_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx - cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja .L014ecb_short_copy - movl %esp,%esi - movl %ecx,%ebx - jmp .L008ecb_loop -.align 16 -.L007ecb_aligned: + cmpl $128,%ebp + movl $127,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz .L015ecb_aligned_tail leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx .byte 243,15,167,200 -.L013ecb_exit: + testl %ebp,%ebp + jz .L014ecb_exit +.L015ecb_aligned_tail: + movl %ebp,%ecx + leal -24(%esp),%ebp + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L007ecb_loop +.L014ecb_exit: movl $1,%eax leal 4(%esp),%esp .L004ecb_abort: @@ -307,19 +355,17 @@ padlock_cbc_encrypt: movl 28(%esp),%edx movl 32(%esp),%ecx testl $15,%edx - jnz .L015cbc_abort + jnz .L016cbc_abort testl $15,%ecx - jnz .L015cbc_abort - leal .Lpadlock_saved_context-.L016cbc_pic_point,%eax + jnz .L016cbc_abort + leal .Lpadlock_saved_context-.L017cbc_pic_point,%eax pushfl cld call _padlock_verify_ctx -.L016cbc_pic_point: +.L017cbc_pic_point: leal 16(%edx),%edx xorl %eax,%eax xorl %ebx,%ebx - cmpl $64,%ecx - jbe .L017cbc_short testl $32,(%edx) jnz .L018cbc_aligned testl $15,%edi @@ -339,7 +385,25 @@ padlock_cbc_encrypt: negl %eax andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp + movl %eax,16(%ebp) + cmpl %ebx,%ecx + ja .L019cbc_loop + movl %esi,%eax + cmpl %esp,%ebp + cmovel %edi,%eax + addl %ecx,%eax + negl %eax + andl $4095,%eax + cmpl $64,%eax + movl $-64,%eax + cmovael %ebx,%eax + andl %eax,%ebx + jz .L020cbc_unaligned_tail jmp .L019cbc_loop .align 16 .L019cbc_loop: @@ -351,13 +415,13 @@ padlock_cbc_encrypt: testl $15,%edi cmovnzl %esp,%edi testl $15,%esi - jz .L020cbc_inp_aligned + jz .L021cbc_inp_aligned shrl $2,%ecx .byte 243,165 subl %ebx,%edi movl %ebx,%ecx movl %edi,%esi -.L020cbc_inp_aligned: +.L021cbc_inp_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx @@ -367,67 +431,450 @@ padlock_cbc_encrypt: movl (%ebp),%edi movl 12(%ebp),%ebx testl $15,%edi - jz .L021cbc_out_aligned + jz .L022cbc_out_aligned movl %ebx,%ecx - shrl $2,%ecx leal (%esp),%esi + shrl $2,%ecx .byte 243,165 subl %ebx,%edi -.L021cbc_out_aligned: +.L022cbc_out_aligned: movl 4(%ebp),%esi movl 8(%ebp),%ecx addl %ebx,%edi addl %ebx,%esi subl %ebx,%ecx movl $512,%ebx - jnz .L019cbc_loop + jz .L023cbc_break + cmpl %ebx,%ecx + jae .L019cbc_loop +.L020cbc_unaligned_tail: + xorl %eax,%eax cmpl %ebp,%esp - je .L022cbc_done + cmovel %ecx,%eax + subl %eax,%esp + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L019cbc_loop +.align 16 +.L023cbc_break: + cmpl %ebp,%esp + je .L024cbc_done pxor %xmm0,%xmm0 leal (%esp),%eax -.L023cbc_bzero: +.L025cbc_bzero: movaps %xmm0,(%eax) leal 16(%eax),%eax cmpl %eax,%ebp - ja .L023cbc_bzero -.L022cbc_done: + ja .L025cbc_bzero +.L024cbc_done: + movl 16(%ebp),%ebp leal 24(%ebp),%esp - jmp .L024cbc_exit + jmp .L026cbc_exit .align 16 -.L017cbc_short: +.L018cbc_aligned: + leal (%esi,%ecx,1),%ebp + negl %ebp + andl $4095,%ebp xorl %eax,%eax + cmpl $64,%ebp + movl $63,%ebp + cmovael %eax,%ebp + andl %ecx,%ebp + subl %ebp,%ecx + jz .L027cbc_aligned_tail + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +.byte 243,15,167,208 + movaps (%eax),%xmm0 + movaps %xmm0,-16(%edx) + testl %ebp,%ebp + jz .L026cbc_exit +.L027cbc_aligned_tail: + movl %ebp,%ecx leal -24(%esp),%ebp - subl %ecx,%eax + movl %ebp,%esp + movl %ebp,%eax + subl %ecx,%esp + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + movl %edi,%eax + movl %ecx,%ebx + shrl $2,%ecx + leal (%esp),%edi +.byte 243,165 + movl %esp,%esi + movl %eax,%edi + movl %ebx,%ecx + jmp .L019cbc_loop +.L026cbc_exit: + movl $1,%eax + leal 4(%esp),%esp +.L016cbc_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size padlock_cbc_encrypt,.-.L_padlock_cbc_encrypt_begin +.globl padlock_cfb_encrypt +.type padlock_cfb_encrypt,@function +.align 16 +padlock_cfb_encrypt: +.L_padlock_cfb_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + testl $15,%edx + jnz .L028cfb_abort + testl $15,%ecx + jnz .L028cfb_abort + leal .Lpadlock_saved_context-.L029cfb_pic_point,%eax + pushfl + cld + call _padlock_verify_ctx +.L029cfb_pic_point: + leal 16(%edx),%edx + xorl %eax,%eax + xorl %ebx,%ebx + testl $32,(%edx) + jnz .L030cfb_aligned + testl $15,%edi + setz %al + testl $15,%esi + setz %bl + testl %ebx,%eax + jnz .L030cfb_aligned + negl %eax + movl $512,%ebx + notl %eax + leal -24(%esp),%ebp + cmpl %ebx,%ecx + cmovcl %ecx,%ebx + andl %ebx,%eax + movl %ecx,%ebx + negl %eax + andl $511,%ebx leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp andl $-16,%esp + movl %eax,16(%ebp) + jmp .L031cfb_loop +.align 16 +.L031cfb_loop: + movl %edi,(%ebp) + movl %esi,4(%ebp) + movl %ecx,8(%ebp) + movl %ebx,%ecx + movl %ebx,12(%ebp) + testl $15,%edi + cmovnzl %esp,%edi + testl $15,%esi + jz .L032cfb_inp_aligned + shrl $2,%ecx +.byte 243,165 + subl %ebx,%edi + movl %ebx,%ecx + movl %edi,%esi +.L032cfb_inp_aligned: + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +.byte 243,15,167,224 + movaps (%eax),%xmm0 + movaps %xmm0,-16(%edx) + movl (%ebp),%edi + movl 12(%ebp),%ebx + testl $15,%edi + jz .L033cfb_out_aligned + movl %ebx,%ecx + leal (%esp),%esi + shrl $2,%ecx +.byte 243,165 + subl %ebx,%edi +.L033cfb_out_aligned: + movl 4(%ebp),%esi + movl 8(%ebp),%ecx + addl %ebx,%edi + addl %ebx,%esi + subl %ebx,%ecx + movl $512,%ebx + jnz .L031cfb_loop + cmpl %ebp,%esp + je .L034cfb_done + pxor %xmm0,%xmm0 + leal (%esp),%eax +.L035cfb_bzero: + movaps %xmm0,(%eax) + leal 16(%eax),%eax + cmpl %eax,%ebp + ja .L035cfb_bzero +.L034cfb_done: + movl 16(%ebp),%ebp + leal 24(%ebp),%esp + jmp .L036cfb_exit +.align 16 +.L030cfb_aligned: + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +.byte 243,15,167,224 + movaps (%eax),%xmm0 + movaps %xmm0,-16(%edx) +.L036cfb_exit: + movl $1,%eax + leal 4(%esp),%esp +.L028cfb_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size padlock_cfb_encrypt,.-.L_padlock_cfb_encrypt_begin +.globl padlock_ofb_encrypt +.type padlock_ofb_encrypt,@function +.align 16 +padlock_ofb_encrypt: +.L_padlock_ofb_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + testl $15,%edx + jnz .L037ofb_abort + testl $15,%ecx + jnz .L037ofb_abort + leal .Lpadlock_saved_context-.L038ofb_pic_point,%eax + pushfl + cld + call _padlock_verify_ctx +.L038ofb_pic_point: + leal 16(%edx),%edx + xorl %eax,%eax xorl %ebx,%ebx -.L025cbc_short_copy: - movups (%esi,%ebx,1),%xmm0 - leal 16(%ebx),%ebx + testl $32,(%edx) + jnz .L039ofb_aligned + testl $15,%edi + setz %al + testl $15,%esi + setz %bl + testl %ebx,%eax + jnz .L039ofb_aligned + negl %eax + movl $512,%ebx + notl %eax + leal -24(%esp),%ebp cmpl %ebx,%ecx - movaps %xmm0,-16(%esp,%ebx,1) - ja .L025cbc_short_copy - movl %esp,%esi + cmovcl %ecx,%ebx + andl %ebx,%eax movl %ecx,%ebx - jmp .L019cbc_loop + negl %eax + andl $511,%ebx + leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + jmp .L040ofb_loop .align 16 -.L018cbc_aligned: +.L040ofb_loop: + movl %edi,(%ebp) + movl %esi,4(%ebp) + movl %ecx,8(%ebp) + movl %ebx,%ecx + movl %ebx,12(%ebp) + testl $15,%edi + cmovnzl %esp,%edi + testl $15,%esi + jz .L041ofb_inp_aligned + shrl $2,%ecx +.byte 243,165 + subl %ebx,%edi + movl %ebx,%ecx + movl %edi,%esi +.L041ofb_inp_aligned: leal -16(%edx),%eax leal 16(%edx),%ebx shrl $4,%ecx -.byte 243,15,167,208 +.byte 243,15,167,232 movaps (%eax),%xmm0 movaps %xmm0,-16(%edx) -.L024cbc_exit: + movl (%ebp),%edi + movl 12(%ebp),%ebx + testl $15,%edi + jz .L042ofb_out_aligned + movl %ebx,%ecx + leal (%esp),%esi + shrl $2,%ecx +.byte 243,165 + subl %ebx,%edi +.L042ofb_out_aligned: + movl 4(%ebp),%esi + movl 8(%ebp),%ecx + addl %ebx,%edi + addl %ebx,%esi + subl %ebx,%ecx + movl $512,%ebx + jnz .L040ofb_loop + cmpl %ebp,%esp + je .L043ofb_done + pxor %xmm0,%xmm0 + leal (%esp),%eax +.L044ofb_bzero: + movaps %xmm0,(%eax) + leal 16(%eax),%eax + cmpl %eax,%ebp + ja .L044ofb_bzero +.L043ofb_done: + movl 16(%ebp),%ebp + leal 24(%ebp),%esp + jmp .L045ofb_exit +.align 16 +.L039ofb_aligned: + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +.byte 243,15,167,232 + movaps (%eax),%xmm0 + movaps %xmm0,-16(%edx) +.L045ofb_exit: movl $1,%eax leal 4(%esp),%esp -.L015cbc_abort: +.L037ofb_abort: popl %edi popl %esi popl %ebx popl %ebp ret -.size padlock_cbc_encrypt,.-.L_padlock_cbc_encrypt_begin +.size padlock_ofb_encrypt,.-.L_padlock_ofb_encrypt_begin +.globl padlock_ctr32_encrypt +.type padlock_ctr32_encrypt,@function +.align 16 +padlock_ctr32_encrypt: +.L_padlock_ctr32_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + testl $15,%edx + jnz .L046ctr32_abort + testl $15,%ecx + jnz .L046ctr32_abort + leal .Lpadlock_saved_context-.L047ctr32_pic_point,%eax + pushfl + cld + call _padlock_verify_ctx +.L047ctr32_pic_point: + leal 16(%edx),%edx + xorl %eax,%eax + movq -16(%edx),%mm0 + movl $512,%ebx + notl %eax + leal -24(%esp),%ebp + cmpl %ebx,%ecx + cmovcl %ecx,%ebx + andl %ebx,%eax + movl %ecx,%ebx + negl %eax + andl $511,%ebx + leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + jmp .L048ctr32_loop +.align 16 +.L048ctr32_loop: + movl %edi,(%ebp) + movl %esi,4(%ebp) + movl %ecx,8(%ebp) + movl %ebx,%ecx + movl %ebx,12(%ebp) + movl -4(%edx),%ecx + xorl %edi,%edi + movl -8(%edx),%eax +.L049ctr32_prepare: + movl %ecx,12(%esp,%edi,1) + bswap %ecx + movq %mm0,(%esp,%edi,1) + incl %ecx + movl %eax,8(%esp,%edi,1) + bswap %ecx + leal 16(%edi),%edi + cmpl %ebx,%edi + jb .L049ctr32_prepare + movl %ecx,-4(%edx) + leal (%esp),%esi + leal (%esp),%edi + movl %ebx,%ecx + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +.byte 243,15,167,200 + movl (%ebp),%edi + movl 12(%ebp),%ebx + movl 4(%ebp),%esi + xorl %ecx,%ecx +.L050ctr32_xor: + movups (%esi,%ecx,1),%xmm1 + leal 16(%ecx),%ecx + pxor -16(%esp,%ecx,1),%xmm1 + movups %xmm1,-16(%edi,%ecx,1) + cmpl %ebx,%ecx + jb .L050ctr32_xor + movl 8(%ebp),%ecx + addl %ebx,%edi + addl %ebx,%esi + subl %ebx,%ecx + movl $512,%ebx + jnz .L048ctr32_loop + pxor %xmm0,%xmm0 + leal (%esp),%eax +.L051ctr32_bzero: + movaps %xmm0,(%eax) + leal 16(%eax),%eax + cmpl %eax,%ebp + ja .L051ctr32_bzero +.L052ctr32_done: + movl 16(%ebp),%ebp + leal 24(%ebp),%esp + movl $1,%eax + leal 4(%esp),%esp + emms +.L046ctr32_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size padlock_ctr32_encrypt,.-.L_padlock_ctr32_encrypt_begin .globl padlock_xstore .type padlock_xstore,@function .align 16 @@ -447,10 +894,10 @@ _win32_segv_handler: movl 4(%esp),%edx movl 12(%esp),%ecx cmpl $3221225477,(%edx) - jne .L026ret + jne .L053ret addl $4,184(%ecx) movl $0,%eax -.L026ret: +.L053ret: ret .size _win32_segv_handler,.-_win32_segv_handler .globl padlock_sha1_oneshot diff --git a/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s b/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s index cfac705..eac88ae 100644 --- a/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s +++ b/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s @@ -699,6 +699,7 @@ L$ghash_epilogue: .p2align 4 _gcm_init_clmul: +L$_init_clmul: movdqu (%rsi),%xmm2 pshufd $78,%xmm2,%xmm2 @@ -717,15 +718,15 @@ _gcm_init_clmul: pxor %xmm5,%xmm2 + pshufd $78,%xmm2,%xmm6 movdqa %xmm2,%xmm0 + pxor %xmm2,%xmm6 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 +.byte 102,15,58,68,222,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 @@ -735,44 +736,134 @@ _gcm_init_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,0(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,32(%rdi) + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - movdqu %xmm2,(%rdi) - movdqu %xmm0,16(%rdi) + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm5,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm5,%xmm3 + movdqu %xmm5,48(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,64(%rdi) +.byte 102,15,58,15,227,8 + movdqu %xmm4,80(%rdi) .byte 0xf3,0xc3 .globl _gcm_gmult_clmul .p2align 4 _gcm_gmult_clmul: +L$_gmult_clmul: movdqu (%rdi),%xmm0 movdqa L$bswap_mask(%rip),%xmm5 movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,220,0 @@ -785,186 +876,358 @@ _gcm_gmult_clmul: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 .globl _gcm_ghash_clmul -.p2align 4 +.p2align 5 _gcm_ghash_clmul: +L$_ghash_clmul: movdqa L$bswap_mask(%rip),%xmm5 + movq $11547335547999543296,%rax movdqu (%rdi),%xmm0 movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm10 .byte 102,15,56,0,197 subq $16,%rcx jz L$odd_tail - movdqu 16(%rsi),%xmm8 + movdqu 16(%rsi),%xmm9 + cmpq $48,%rcx + jb L$skip4x + subq $48,%rcx + movdqu 48(%rsi),%xmm14 + movdqu 64(%rsi),%xmm15 - movdqu (%rdx),%xmm3 - movdqu 16(%rdx),%xmm6 -.byte 102,15,56,0,221 + movdqu 48(%rdx),%xmm6 + movdqu 32(%rdx),%xmm11 .byte 102,15,56,0,245 - pxor %xmm3,%xmm0 - movdqa %xmm6,%xmm7 - pshufd $78,%xmm6,%xmm3 - pshufd $78,%xmm2,%xmm4 - pxor %xmm6,%xmm3 - pxor %xmm2,%xmm4 +.byte 102,68,15,56,0,221 + movdqa %xmm6,%xmm8 + pshufd $78,%xmm6,%xmm7 + pxor %xmm6,%xmm7 .byte 102,15,58,68,242,0 -.byte 102,15,58,68,250,17 -.byte 102,15,58,68,220,0 - pxor %xmm6,%xmm3 - pxor %xmm7,%xmm3 +.byte 102,68,15,58,68,194,17 +.byte 102,65,15,58,68,250,0 + + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,217,0 +.byte 102,69,15,58,68,233,17 + xorps %xmm11,%xmm6 +.byte 102,69,15,58,68,226,16 + xorps %xmm13,%xmm8 + movups 80(%rsi),%xmm10 + xorps %xmm12,%xmm7 + + movdqu 16(%rdx),%xmm11 + movdqu 0(%rdx),%xmm3 +.byte 102,68,15,56,0,221 +.byte 102,15,56,0,221 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm3,%xmm0 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm6 +.byte 102,69,15,58,68,226,0 + xorps %xmm13,%xmm8 + + leaq 64(%rdx),%rdx + subq $64,%rcx + jc L$tail4x + + jmp L$mod4_loop +.p2align 5 +L$mod4_loop: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm7 + movdqu 48(%rdx),%xmm11 +.byte 102,68,15,56,0,221 +.byte 102,65,15,58,68,207,17 + xorps %xmm6,%xmm0 + movdqu 32(%rdx),%xmm6 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 +.byte 102,65,15,58,68,218,16 + xorps %xmm8,%xmm1 + pxor %xmm11,%xmm12 +.byte 102,15,56,0,245 + movups 32(%rsi),%xmm10 +.byte 102,68,15,58,68,218,0 + xorps %xmm7,%xmm3 + movdqa %xmm6,%xmm8 + pshufd $78,%xmm6,%xmm7 + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm7 + pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 + pslldq $8,%xmm3 +.byte 102,68,15,58,68,234,17 + psrldq $8,%xmm4 + pxor %xmm3,%xmm0 + movdqa L$7_mask(%rip),%xmm3 + pxor %xmm4,%xmm1 +.byte 102,72,15,110,224 + + pand %xmm0,%xmm3 +.byte 102,15,56,0,227 +.byte 102,69,15,58,68,226,0 + pxor %xmm0,%xmm4 + psllq $57,%xmm4 + movdqa %xmm4,%xmm3 pslldq $8,%xmm4 - pxor %xmm3,%xmm7 - pxor %xmm4,%xmm6 +.byte 102,65,15,58,68,241,0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqu 0(%rdx),%xmm3 + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 +.byte 102,69,15,58,68,193,17 + xorps %xmm11,%xmm6 + movdqu 16(%rdx),%xmm11 +.byte 102,68,15,56,0,221 +.byte 102,65,15,58,68,250,16 + xorps %xmm13,%xmm8 + movups 80(%rsi),%xmm10 +.byte 102,15,56,0,221 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + + movdqa %xmm11,%xmm13 + pxor %xmm12,%xmm7 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 +.byte 102,69,15,58,68,222,0 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + psrlq $1,%xmm0 +.byte 102,69,15,58,68,238,17 + xorps %xmm11,%xmm6 + pxor %xmm1,%xmm0 + +.byte 102,69,15,58,68,226,0 + xorps %xmm13,%xmm8 + movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm8,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm8,%xmm4 - leaq 32(%rdx),%rdx - subq $32,%rcx - jbe L$even_tail + leaq 64(%rdx),%rdx + subq $64,%rcx + jnc L$mod4_loop + +L$tail4x: +.byte 102,65,15,58,68,199,0 + xorps %xmm12,%xmm7 +.byte 102,65,15,58,68,207,17 + xorps %xmm6,%xmm0 +.byte 102,65,15,58,68,218,16 + xorps %xmm8,%xmm1 + pxor %xmm0,%xmm1 + pxor %xmm7,%xmm3 -L$mod_loop: -.byte 102,65,15,58,68,192,0 -.byte 102,65,15,58,68,200,17 -.byte 102,15,58,68,220,0 - pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 + pxor %xmm0,%xmm1 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 - movdqu (%rdx),%xmm3 - pxor %xmm6,%xmm0 - pxor %xmm7,%xmm1 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + addq $64,%rcx + jz L$done + movdqu 32(%rsi),%xmm10 + subq $16,%rcx + jz L$odd_tail +L$skip4x: + + + + + + movdqu (%rdx),%xmm3 movdqu 16(%rdx),%xmm6 .byte 102,15,56,0,221 .byte 102,15,56,0,245 + pxor %xmm3,%xmm0 + + movdqa %xmm6,%xmm8 + pshufd $78,%xmm6,%xmm3 + pxor %xmm6,%xmm3 +.byte 102,15,58,68,242,0 +.byte 102,68,15,58,68,194,17 +.byte 102,65,15,58,68,218,0 + + leaq 32(%rdx),%rdx + subq $32,%rcx + jbe L$even_tail + jmp L$mod_loop + +.p2align 5 +L$mod_loop: + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,65,15,58,68,193,0 +.byte 102,65,15,58,68,201,17 +.byte 102,65,15,58,68,226,16 + + pxor %xmm6,%xmm0 + pxor %xmm8,%xmm1 + movdqu (%rdx),%xmm8 +.byte 102,68,15,56,0,197 + movdqu 16(%rdx),%xmm6 - movdqa %xmm6,%xmm7 - pshufd $78,%xmm6,%xmm9 - pshufd $78,%xmm2,%xmm10 - pxor %xmm6,%xmm9 - pxor %xmm2,%xmm10 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + pxor %xmm8,%xmm1 + pxor %xmm3,%xmm4 +.byte 102,15,56,0,245 + movdqa %xmm4,%xmm3 + psrldq $8,%xmm3 + pslldq $8,%xmm4 pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm6,%xmm8 + + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 - pxor %xmm3,%xmm0 .byte 102,15,58,68,242,0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + pshufd $78,%xmm8,%xmm3 + pxor %xmm8,%xmm3 -.byte 102,15,58,68,250,17 +.byte 102,68,15,58,68,194,17 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 - -.byte 102,69,15,58,68,202,0 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm8,%xmm4 - pxor %xmm0,%xmm3 - pxor %xmm8,%xmm4 - - pxor %xmm6,%xmm9 - pxor %xmm7,%xmm9 - movdqa %xmm9,%xmm10 - psrldq $8,%xmm9 - pslldq $8,%xmm10 - pxor %xmm9,%xmm7 - pxor %xmm10,%xmm6 +.byte 102,65,15,58,68,218,0 + pxor %xmm1,%xmm0 leaq 32(%rdx),%rdx subq $32,%rcx ja L$mod_loop L$even_tail: -.byte 102,65,15,58,68,192,0 -.byte 102,65,15,58,68,200,17 -.byte 102,15,58,68,220,0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + +.byte 102,65,15,58,68,193,0 +.byte 102,65,15,58,68,201,17 +.byte 102,65,15,58,68,226,16 + + pxor %xmm6,%xmm0 + pxor %xmm8,%xmm1 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 - - movdqa %xmm3,%xmm4 + pxor %xmm3,%xmm4 + movdqa %xmm4,%xmm3 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 - pxor %xmm6,%xmm0 - pxor %xmm7,%xmm1 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 testq %rcx,%rcx jnz L$done @@ -974,12 +1237,10 @@ L$odd_tail: pxor %xmm3,%xmm0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 +.byte 102,65,15,58,68,218,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 @@ -989,38 +1250,60 @@ L$odd_tail: pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 - movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 pslldq $8,%xmm0 - psrldq $8,%xmm4 - pxor %xmm3,%xmm0 - pxor %xmm4,%xmm1 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 psrlq $1,%xmm0 - pxor %xmm4,%xmm0 + pxor %xmm1,%xmm0 L$done: .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 -L$SEH_end_gcm_ghash_clmul: + +.globl _gcm_init_avx + +.p2align 5 +_gcm_init_avx: + jmp L$_init_clmul + +.globl _gcm_gmult_avx + +.p2align 5 +_gcm_gmult_avx: + jmp L$_gmult_clmul + +.globl _gcm_ghash_avx + +.p2align 5 +_gcm_ghash_avx: + jmp L$_ghash_clmul .p2align 6 L$bswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 L$0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +L$7_mask: +.long 7,0,7,0 +L$7_mask_poly: +.long 7,0,450,0 .p2align 6 L$rem_4bit: diff --git a/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s b/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s index a82f0a5..e2cfa17 100644 --- a/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s +++ b/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s @@ -927,199 +927,412 @@ L$oop_enc1_6: .p2align 4 _aesni_ctr32_encrypt_blocks: + leaq (%rsp),%rax + pushq %rbp + subq $128,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + cmpq $1,%rdx je L$ctr32_one_shortcut - movdqu (%r8),%xmm14 - movdqa L$bswap_mask(%rip),%xmm15 - xorl %eax,%eax -.byte 102,69,15,58,22,242,3 -.byte 102,68,15,58,34,240,3 + movdqu (%r8),%xmm2 + movdqu (%rcx),%xmm0 + movl 12(%r8),%r8d + pxor %xmm0,%xmm2 + movl 12(%rcx),%r11d + movdqa %xmm2,0(%rsp) + bswapl %r8d + movdqa %xmm2,%xmm3 + movdqa %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm2,64(%rsp) + movdqa %xmm2,80(%rsp) + movdqa %xmm2,96(%rsp) + movdqa %xmm2,112(%rsp) movl 240(%rcx),%eax + + leaq 1(%r8),%r9 + leaq 2(%r8),%r10 + bswapl %r9d bswapl %r10d - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 -.byte 102,69,15,58,34,226,0 - leaq 3(%r10),%r11 -.byte 102,69,15,58,34,235,0 - incl %r10d -.byte 102,69,15,58,34,226,1 - incq %r11 -.byte 102,69,15,58,34,235,1 - incl %r10d -.byte 102,69,15,58,34,226,2 - incq %r11 -.byte 102,69,15,58,34,235,2 - movdqa %xmm12,-40(%rsp) -.byte 102,69,15,56,0,231 - movdqa %xmm13,-24(%rsp) -.byte 102,69,15,56,0,239 - - pshufd $192,%xmm12,%xmm2 - pshufd $128,%xmm12,%xmm3 - pshufd $64,%xmm12,%xmm4 - cmpq $6,%rdx - jb L$ctr32_tail - shrl $1,%eax - movq %rcx,%r11 - movl %eax,%r10d - subq $6,%rdx - jmp L$ctr32_loop6 + xorl %r11d,%r9d + xorl %r11d,%r10d +.byte 102,65,15,58,34,217,3 + leaq 3(%r8),%r9 + movdqa %xmm3,16(%rsp) +.byte 102,65,15,58,34,226,3 + bswapl %r9d + leaq 4(%r8),%r10 + movdqa %xmm4,32(%rsp) + xorl %r11d,%r9d + bswapl %r10d +.byte 102,65,15,58,34,233,3 + xorl %r11d,%r10d + movdqa %xmm5,48(%rsp) + leaq 5(%r8),%r9 + movl %r10d,64+12(%rsp) + bswapl %r9d + leaq 6(%r8),%r10 + xorl %r11d,%r9d + bswapl %r10d + movl %r9d,80+12(%rsp) + xorl %r11d,%r10d + leaq 7(%r8),%r9 + movl %r10d,96+12(%rsp) + bswapl %r9d + xorl %r11d,%r9d + movl %r9d,112+12(%rsp) -.p2align 4 -L$ctr32_loop6: - pshufd $192,%xmm13,%xmm5 - por %xmm14,%xmm2 - movups (%r11),%xmm0 - pshufd $128,%xmm13,%xmm6 - por %xmm14,%xmm3 - movups 16(%r11),%xmm1 - pshufd $64,%xmm13,%xmm7 - por %xmm14,%xmm4 - por %xmm14,%xmm5 - xorps %xmm0,%xmm2 - por %xmm14,%xmm6 - por %xmm14,%xmm7 + movups 16(%rcx),%xmm1 + movdqa 64(%rsp),%xmm6 + movdqa 80(%rsp),%xmm7 + cmpq $8,%rdx + jb L$ctr32_tail + leaq 128(%rcx),%rcx + subq $8,%rdx + jmp L$ctr32_loop8 - pxor %xmm0,%xmm3 +.p2align 5 +L$ctr32_loop8: + addl $8,%r8d + movdqa 96(%rsp),%xmm8 .byte 102,15,56,220,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 + movl %r8d,%r9d + movdqa 112(%rsp),%xmm9 .byte 102,15,56,220,217 - movdqa L$increment32(%rip),%xmm13 - pxor %xmm0,%xmm5 + bswapl %r9d + movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - movdqa -40(%rsp),%xmm12 - pxor %xmm0,%xmm6 + xorl %r11d,%r9d .byte 102,15,56,220,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax + movl %r9d,0+12(%rsp) + leaq 1(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 - jmp L$ctr32_enc_loop6_enter -.p2align 4 -L$ctr32_enc_loop6: +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 48-128(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + bswapl %r9d +.byte 102,15,56,220,224 + xorl %r11d,%r9d +.byte 102,15,56,220,232 + movl %r9d,16+12(%rsp) + leaq 2(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 64-128(%rcx),%xmm0 .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax + bswapl %r9d .byte 102,15,56,220,225 + xorl %r11d,%r9d .byte 102,15,56,220,233 + movl %r9d,32+12(%rsp) + leaq 3(%r8),%r9 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -L$ctr32_enc_loop6_enter: - movups 16(%rcx),%xmm1 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 80-128(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx + bswapl %r9d .byte 102,15,56,220,224 + xorl %r11d,%r9d .byte 102,15,56,220,232 + movl %r9d,48+12(%rsp) + leaq 4(%r8),%r9 .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups (%rcx),%xmm0 - jnz L$ctr32_enc_loop6 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 96-128(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + bswapl %r9d +.byte 102,15,56,220,225 + xorl %r11d,%r9d +.byte 102,15,56,220,233 + movl %r9d,64+12(%rsp) + leaq 5(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 112-128(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + bswapl %r9d +.byte 102,15,56,220,224 + xorl %r11d,%r9d +.byte 102,15,56,220,232 + movl %r9d,80+12(%rsp) + leaq 6(%r8),%r9 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 128-128(%rcx),%xmm0 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 + bswapl %r9d +.byte 102,15,56,220,225 + xorl %r11d,%r9d +.byte 102,15,56,220,233 + movl %r9d,96+12(%rsp) + leaq 7(%r8),%r9 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 144-128(%rcx),%xmm1 +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 + bswapl %r9d +.byte 102,15,56,220,224 + xorl %r11d,%r9d +.byte 102,15,56,220,232 + movl %r9d,112+12(%rsp) +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 + movdqu 0(%rdi),%xmm10 +.byte 102,68,15,56,220,200 + movups 160-128(%rcx),%xmm0 + + cmpl $11,%eax + jb L$ctr32_enc_done .byte 102,15,56,220,209 - paddd %xmm13,%xmm12 .byte 102,15,56,220,217 - paddd -24(%rsp),%xmm13 .byte 102,15,56,220,225 - movdqa %xmm12,-40(%rsp) .byte 102,15,56,220,233 - movdqa %xmm13,-24(%rsp) .byte 102,15,56,220,241 -.byte 102,69,15,56,0,231 .byte 102,15,56,220,249 -.byte 102,69,15,56,0,239 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 176-128(%rcx),%xmm1 -.byte 102,15,56,221,208 - movups (%rdi),%xmm8 -.byte 102,15,56,221,216 - movups 16(%rdi),%xmm9 -.byte 102,15,56,221,224 - movups 32(%rdi),%xmm10 -.byte 102,15,56,221,232 - movups 48(%rdi),%xmm11 -.byte 102,15,56,221,240 - movups 64(%rdi),%xmm1 -.byte 102,15,56,221,248 - movups 80(%rdi),%xmm0 - leaq 96(%rdi),%rdi +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 192-128(%rcx),%xmm0 + je L$ctr32_enc_done - xorps %xmm2,%xmm8 - pshufd $192,%xmm12,%xmm2 - xorps %xmm3,%xmm9 - pshufd $128,%xmm12,%xmm3 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - pshufd $64,%xmm12,%xmm4 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - xorps %xmm6,%xmm1 - movups %xmm11,48(%rsi) - xorps %xmm7,%xmm0 - movups %xmm1,64(%rsi) - movups %xmm0,80(%rsi) - leaq 96(%rsi),%rsi - movl %r10d,%eax - subq $6,%rdx - jnc L$ctr32_loop6 +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movups 208-128(%rcx),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 +.byte 102,68,15,56,220,192 +.byte 102,68,15,56,220,200 + movups 224-128(%rcx),%xmm0 + +L$ctr32_enc_done: + movdqu 16(%rdi),%xmm11 + pxor %xmm0,%xmm10 + movdqu 32(%rdi),%xmm12 + pxor %xmm0,%xmm11 + movdqu 48(%rdi),%xmm13 + pxor %xmm0,%xmm12 + movdqu 64(%rdi),%xmm14 + pxor %xmm0,%xmm13 + movdqu 80(%rdi),%xmm15 + pxor %xmm0,%xmm14 +.byte 102,15,56,220,209 + pxor %xmm0,%xmm15 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 +.byte 102,68,15,56,220,193 +.byte 102,68,15,56,220,201 + movdqu 96(%rdi),%xmm1 + +.byte 102,65,15,56,221,210 + pxor %xmm0,%xmm1 + movdqu 112(%rdi),%xmm10 + leaq 128(%rdi),%rdi +.byte 102,65,15,56,221,219 + pxor %xmm0,%xmm10 + movdqa 0(%rsp),%xmm11 +.byte 102,65,15,56,221,228 + movdqa 16(%rsp),%xmm12 +.byte 102,65,15,56,221,237 + movdqa 32(%rsp),%xmm13 +.byte 102,65,15,56,221,246 + movdqa 48(%rsp),%xmm14 +.byte 102,65,15,56,221,255 + movdqa 64(%rsp),%xmm15 +.byte 102,68,15,56,221,193 + movdqa 80(%rsp),%xmm0 +.byte 102,69,15,56,221,202 + movups 16-128(%rcx),%xmm1 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm0,%xmm7 + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + + subq $8,%rdx + jnc L$ctr32_loop8 - addq $6,%rdx + addq $8,%rdx jz L$ctr32_done - movq %r11,%rcx - leal 1(%rax,%rax,1),%eax + leaq -128(%rcx),%rcx L$ctr32_tail: - por %xmm14,%xmm2 - movups (%rdi),%xmm8 - cmpq $2,%rdx - jb L$ctr32_one + leaq 16(%rcx),%rcx + cmpq $4,%rdx + jb L$ctr32_loop3 + je L$ctr32_loop4 - por %xmm14,%xmm3 - movups 16(%rdi),%xmm9 - je L$ctr32_two + movdqa 96(%rsp),%xmm8 + pxor %xmm9,%xmm9 - pshufd $192,%xmm13,%xmm5 - por %xmm14,%xmm4 - movups 32(%rdi),%xmm10 - cmpq $4,%rdx - jb L$ctr32_three + movups 16(%rcx),%xmm0 +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx +.byte 102,15,56,220,217 + shrl $1,%eax +.byte 102,15,56,220,225 + decl %eax +.byte 102,15,56,220,233 + movups (%rdi),%xmm10 +.byte 102,15,56,220,241 + movups 16(%rdi),%xmm11 +.byte 102,15,56,220,249 + movups 32(%rdi),%xmm12 +.byte 102,68,15,56,220,193 + movups 16(%rcx),%xmm1 - pshufd $128,%xmm13,%xmm6 - por %xmm14,%xmm5 - movups 48(%rdi),%xmm11 - je L$ctr32_four + call L$enc_loop8_enter - por %xmm14,%xmm6 - xorps %xmm7,%xmm7 + movdqu 48(%rdi),%xmm13 + pxor %xmm10,%xmm2 + movdqu 64(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm10,%xmm6 + movdqu %xmm5,48(%rsi) + movdqu %xmm6,64(%rsi) + cmpq $6,%rdx + jb L$ctr32_done - call _aesni_encrypt6 + movups 80(%rdi),%xmm11 + xorps %xmm11,%xmm7 + movups %xmm7,80(%rsi) + je L$ctr32_done - movups 64(%rdi),%xmm1 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - xorps %xmm6,%xmm1 - movups %xmm11,48(%rsi) - movups %xmm1,64(%rsi) + movups 96(%rdi),%xmm12 + xorps %xmm12,%xmm8 + movups %xmm8,96(%rsi) + jmp L$ctr32_done + +.p2align 5 +L$ctr32_loop4: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx),%xmm1 + decl %eax + jnz L$ctr32_loop4 +.byte 102,15,56,221,209 + movups (%rdi),%xmm10 +.byte 102,15,56,221,217 + movups 16(%rdi),%xmm11 +.byte 102,15,56,221,225 + movups 32(%rdi),%xmm12 +.byte 102,15,56,221,233 + movups 48(%rdi),%xmm13 + + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm4,32(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm5,48(%rsi) + jmp L$ctr32_done + +.p2align 5 +L$ctr32_loop3: +.byte 102,15,56,220,209 + leaq 16(%rcx),%rcx +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 + movups (%rcx),%xmm1 + decl %eax + jnz L$ctr32_loop3 +.byte 102,15,56,221,209 +.byte 102,15,56,221,217 +.byte 102,15,56,221,225 + + movups (%rdi),%xmm10 + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + cmpq $2,%rdx + jb L$ctr32_done + + movups 16(%rdi),%xmm11 + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + je L$ctr32_done + + movups 32(%rdi),%xmm12 + xorps %xmm12,%xmm4 + movups %xmm4,32(%rsi) jmp L$ctr32_done .p2align 4 L$ctr32_one_shortcut: movups (%r8),%xmm2 - movups (%rdi),%xmm8 + movups (%rdi),%xmm10 movl 240(%rcx),%eax -L$ctr32_one: movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -1131,51 +1344,26 @@ L$oop_enc1_7: leaq 16(%rcx),%rcx jnz L$oop_enc1_7 .byte 102,15,56,221,209 - xorps %xmm2,%xmm8 - movups %xmm8,(%rsi) - jmp L$ctr32_done - -.p2align 4 -L$ctr32_two: - xorps %xmm4,%xmm4 - call _aesni_encrypt3 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - movups %xmm9,16(%rsi) - jmp L$ctr32_done - -.p2align 4 -L$ctr32_three: - call _aesni_encrypt3 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - movups %xmm10,32(%rsi) + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) jmp L$ctr32_done .p2align 4 -L$ctr32_four: - call _aesni_encrypt4 - xorps %xmm2,%xmm8 - xorps %xmm3,%xmm9 - movups %xmm8,(%rsi) - xorps %xmm4,%xmm10 - movups %xmm9,16(%rsi) - xorps %xmm5,%xmm11 - movups %xmm10,32(%rsi) - movups %xmm11,48(%rsi) - L$ctr32_done: + leaq (%rbp),%rsp + popq %rbp +L$ctr32_epilogue: .byte 0xf3,0xc3 .globl _aesni_xts_encrypt .p2align 4 _aesni_xts_encrypt: - leaq -104(%rsp),%rsp + leaq (%rsp),%rax + pushq %rbp + subq $112,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp movups (%r9),%xmm15 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -1190,228 +1378,266 @@ L$oop_enc1_8: leaq 16(%r8),%r8 jnz L$oop_enc1_8 .byte 102,68,15,56,221,249 + movups (%rcx),%xmm0 movq %rcx,%r11 movl %r10d,%eax + shll $4,%r10d movq %rdx,%r9 andq $-16,%rdx + movups 16(%rcx,%r10,1),%xmm1 + movl %eax,%r10d + movdqa L$xts_magic(%rip),%xmm8 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pshufd $95,%xmm15,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm10 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm11 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm12 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 paddq %xmm15,%xmm15 pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 + pxor %xmm0,%xmm14 pxor %xmm9,%xmm15 + movaps %xmm1,96(%rsp) + subq $96,%rdx jc L$xts_enc_short shrl $1,%eax - subl $1,%eax + subl $3,%eax + movups 16(%r11),%xmm1 movl %eax,%r10d + leaq L$xts_magic(%rip),%r8 jmp L$xts_enc_grandloop -.p2align 4 +.p2align 5 L$xts_enc_grandloop: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu 0(%rdi),%xmm2 - pand %xmm8,%xmm9 + movdqa %xmm0,%xmm8 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 + movdqu 32(%rdi),%xmm4 pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 +.byte 102,15,56,220,209 + movdqu 48(%rdi),%xmm5 pxor %xmm12,%xmm4 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi +.byte 102,15,56,220,217 + movdqu 64(%rdi),%xmm6 pxor %xmm13,%xmm5 - movups (%r11),%xmm0 +.byte 102,15,56,220,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - - +.byte 102,15,56,220,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 - movups 16(%r11),%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm0,%xmm3 + pxor %xmm9,%xmm10 +.byte 102,15,56,220,241 + pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) -.byte 102,15,56,220,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 +.byte 102,15,56,220,249 + movups 48(%r11),%xmm1 + +.byte 102,15,56,220,208 + pxor %xmm9,%xmm12 movdqa %xmm11,16(%rsp) -.byte 102,15,56,220,217 - pxor %xmm0,%xmm5 +.byte 102,15,56,220,216 + pxor %xmm9,%xmm13 movdqa %xmm12,32(%rsp) -.byte 102,15,56,220,225 - pxor %xmm0,%xmm6 - movdqa %xmm13,48(%rsp) -.byte 102,15,56,220,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax +.byte 102,15,56,220,224 + pxor %xmm9,%xmm14 +.byte 102,15,56,220,232 + pxor %xmm9,%xmm8 movdqa %xmm14,64(%rsp) -.byte 102,15,56,220,241 - movdqa %xmm15,80(%rsp) -.byte 102,15,56,220,249 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - jmp L$xts_enc_loop6_enter - -.p2align 4 +.byte 102,15,56,220,240 + movdqa %xmm8,80(%rsp) +.byte 102,15,56,220,248 + movups 64(%r11),%xmm0 + leaq 64(%r11),%rcx + pshufd $95,%xmm15,%xmm9 + jmp L$xts_enc_loop6 +.p2align 5 L$xts_enc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 - decl %eax .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -L$xts_enc_loop6_enter: movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + .byte 102,15,56,220,208 .byte 102,15,56,220,216 - leaq 32(%rcx),%rcx .byte 102,15,56,220,224 .byte 102,15,56,220,232 .byte 102,15,56,220,240 .byte 102,15,56,220,248 movups (%rcx),%xmm0 + decl %eax jnz L$xts_enc_loop6 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - paddq %xmm15,%xmm15 + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,220,209 - pand %xmm8,%xmm9 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 .byte 102,15,56,220,217 - pcmpgtd %xmm15,%xmm14 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 .byte 102,15,56,220,225 - pxor %xmm9,%xmm15 .byte 102,15,56,220,233 + pxor %xmm14,%xmm15 .byte 102,15,56,220,241 + movaps %xmm10,%xmm11 .byte 102,15,56,220,249 movups 16(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,220,208 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm10 + psrad $31,%xmm14 .byte 102,15,56,220,216 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,220,224 - pxor %xmm9,%xmm15 .byte 102,15,56,220,232 + pxor %xmm14,%xmm15 .byte 102,15,56,220,240 + movaps %xmm11,%xmm12 .byte 102,15,56,220,248 movups 32(%rcx),%xmm0 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm11 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,220,209 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 .byte 102,15,56,220,217 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,220,225 - pxor %xmm9,%xmm15 + movdqa %xmm13,48(%rsp) .byte 102,15,56,220,233 + pxor %xmm14,%xmm15 .byte 102,15,56,220,241 + movaps %xmm12,%xmm13 .byte 102,15,56,220,249 + movups 48(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm12 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 102,15,56,220,216 paddq %xmm15,%xmm15 -.byte 102,15,56,221,208 - pand %xmm8,%xmm9 -.byte 102,15,56,221,216 - pcmpgtd %xmm15,%xmm14 -.byte 102,15,56,221,224 - pxor %xmm9,%xmm15 -.byte 102,15,56,221,232 -.byte 102,15,56,221,240 -.byte 102,15,56,221,248 + pand %xmm8,%xmm14 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + pxor %xmm14,%xmm15 +.byte 102,15,56,220,240 + movaps %xmm13,%xmm14 +.byte 102,15,56,220,248 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm13 + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,220,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,220,217 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm0 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 + psrad $31,%xmm9 +.byte 102,15,56,221,84,36,0 paddq %xmm15,%xmm15 - xorps 0(%rsp),%xmm2 pand %xmm8,%xmm9 - xorps 16(%rsp),%xmm3 - pcmpgtd %xmm15,%xmm14 +.byte 102,15,56,221,92,36,16 +.byte 102,15,56,221,100,36,32 pxor %xmm9,%xmm15 - - xorps 32(%rsp),%xmm4 - movups %xmm2,0(%rsi) - xorps 48(%rsp),%xmm5 - movups %xmm3,16(%rsi) - xorps 64(%rsp),%xmm6 - movups %xmm4,32(%rsi) - xorps 80(%rsp),%xmm7 - movups %xmm5,48(%rsi) +.byte 102,15,56,221,108,36,48 +.byte 102,15,56,221,116,36,64 +.byte 102,15,56,221,124,36,80 movl %r10d,%eax - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) subq $96,%rdx jnc L$xts_enc_grandloop - leal 3(%rax,%rax,1),%eax + leal 7(%rax,%rax,1),%eax movq %r11,%rcx movl %eax,%r10d L$xts_enc_short: + pxor %xmm0,%xmm10 addq $96,%rdx jz L$xts_enc_done + pxor %xmm0,%xmm11 cmpq $32,%rdx jb L$xts_enc_one + pxor %xmm0,%xmm12 je L$xts_enc_two + pxor %xmm0,%xmm13 cmpq $64,%rdx jb L$xts_enc_three + pxor %xmm0,%xmm14 je L$xts_enc_four - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu (%rdi),%xmm2 - pand %xmm8,%xmm9 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 movdqu 48(%rdi),%xmm5 @@ -1514,15 +1740,15 @@ L$xts_enc_four: call _aesni_encrypt4 - xorps %xmm10,%xmm2 - movdqa %xmm15,%xmm10 - xorps %xmm11,%xmm3 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) + pxor %xmm10,%xmm2 + movdqa %xmm14,%xmm10 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) leaq 64(%rsi),%rsi jmp L$xts_enc_done @@ -1563,7 +1789,8 @@ L$oop_enc1_10: movups %xmm2,-16(%rsi) L$xts_enc_ret: - leaq 104(%rsp),%rsp + leaq (%rbp),%rsp + popq %rbp L$xts_enc_epilogue: .byte 0xf3,0xc3 @@ -1571,7 +1798,11 @@ L$xts_enc_epilogue: .p2align 4 _aesni_xts_decrypt: - leaq -104(%rsp),%rsp + leaq (%rsp),%rax + pushq %rbp + subq $112,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp movups (%r9),%xmm15 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -1592,228 +1823,266 @@ L$oop_enc1_11: shlq $4,%rax subq %rax,%rdx + movups (%rcx),%xmm0 movq %rcx,%r11 movl %r10d,%eax + shll $4,%r10d movq %rdx,%r9 andq $-16,%rdx + movups 16(%rcx,%r10,1),%xmm1 + movl %eax,%r10d + movdqa L$xts_magic(%rip),%xmm8 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pshufd $95,%xmm15,%xmm9 + pxor %xmm0,%xmm1 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm10 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm10 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm11 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm11 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm12 + psrad $31,%xmm14 paddq %xmm15,%xmm15 - pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 - pxor %xmm9,%xmm15 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm12 + pxor %xmm14,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 movdqa %xmm15,%xmm13 + psrad $31,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 + pxor %xmm0,%xmm13 + pxor %xmm14,%xmm15 + movdqa %xmm15,%xmm14 + psrad $31,%xmm9 paddq %xmm15,%xmm15 pand %xmm8,%xmm9 - pcmpgtd %xmm15,%xmm14 + pxor %xmm0,%xmm14 pxor %xmm9,%xmm15 + movaps %xmm1,96(%rsp) + subq $96,%rdx jc L$xts_dec_short shrl $1,%eax - subl $1,%eax + subl $3,%eax + movups 16(%r11),%xmm1 movl %eax,%r10d + leaq L$xts_magic(%rip),%r8 jmp L$xts_dec_grandloop -.p2align 4 +.p2align 5 L$xts_dec_grandloop: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu 0(%rdi),%xmm2 - pand %xmm8,%xmm9 + movdqa %xmm0,%xmm8 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 - movdqu 48(%rdi),%xmm5 + movdqu 32(%rdi),%xmm4 pxor %xmm11,%xmm3 - movdqu 64(%rdi),%xmm6 +.byte 102,15,56,222,209 + movdqu 48(%rdi),%xmm5 pxor %xmm12,%xmm4 - movdqu 80(%rdi),%xmm7 - leaq 96(%rdi),%rdi +.byte 102,15,56,222,217 + movdqu 64(%rdi),%xmm6 pxor %xmm13,%xmm5 - movups (%r11),%xmm0 +.byte 102,15,56,222,225 + movdqu 80(%rdi),%xmm7 + pxor %xmm15,%xmm8 + movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 - pxor %xmm15,%xmm7 - - +.byte 102,15,56,222,233 + movups 32(%r11),%xmm0 + leaq 96(%rdi),%rdi + pxor %xmm8,%xmm7 - movups 16(%r11),%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm0,%xmm3 + pxor %xmm9,%xmm10 +.byte 102,15,56,222,241 + pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) -.byte 102,15,56,222,209 - leaq 32(%r11),%rcx - pxor %xmm0,%xmm4 +.byte 102,15,56,222,249 + movups 48(%r11),%xmm1 + +.byte 102,15,56,222,208 + pxor %xmm9,%xmm12 movdqa %xmm11,16(%rsp) -.byte 102,15,56,222,217 - pxor %xmm0,%xmm5 +.byte 102,15,56,222,216 + pxor %xmm9,%xmm13 movdqa %xmm12,32(%rsp) -.byte 102,15,56,222,225 - pxor %xmm0,%xmm6 - movdqa %xmm13,48(%rsp) -.byte 102,15,56,222,233 - pxor %xmm0,%xmm7 - movups (%rcx),%xmm0 - decl %eax +.byte 102,15,56,222,224 + pxor %xmm9,%xmm14 +.byte 102,15,56,222,232 + pxor %xmm9,%xmm8 movdqa %xmm14,64(%rsp) -.byte 102,15,56,222,241 - movdqa %xmm15,80(%rsp) -.byte 102,15,56,222,249 - pxor %xmm14,%xmm14 - pcmpgtd %xmm15,%xmm14 - jmp L$xts_dec_loop6_enter - -.p2align 4 +.byte 102,15,56,222,240 + movdqa %xmm8,80(%rsp) +.byte 102,15,56,222,248 + movups 64(%r11),%xmm0 + leaq 64(%r11),%rcx + pshufd $95,%xmm15,%xmm9 + jmp L$xts_dec_loop6 +.p2align 5 L$xts_dec_loop6: .byte 102,15,56,222,209 .byte 102,15,56,222,217 - decl %eax .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -L$xts_dec_loop6_enter: movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + .byte 102,15,56,222,208 .byte 102,15,56,222,216 - leaq 32(%rcx),%rcx .byte 102,15,56,222,224 .byte 102,15,56,222,232 .byte 102,15,56,222,240 .byte 102,15,56,222,248 movups (%rcx),%xmm0 + decl %eax jnz L$xts_dec_loop6 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - paddq %xmm15,%xmm15 + movdqa (%r8),%xmm8 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,222,209 - pand %xmm8,%xmm9 + paddq %xmm15,%xmm15 + psrad $31,%xmm14 .byte 102,15,56,222,217 - pcmpgtd %xmm15,%xmm14 + pand %xmm8,%xmm14 + movups (%r11),%xmm10 .byte 102,15,56,222,225 - pxor %xmm9,%xmm15 .byte 102,15,56,222,233 + pxor %xmm14,%xmm15 .byte 102,15,56,222,241 + movaps %xmm10,%xmm11 .byte 102,15,56,222,249 movups 16(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm10 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,222,208 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm10 + psrad $31,%xmm14 .byte 102,15,56,222,216 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,222,224 - pxor %xmm9,%xmm15 .byte 102,15,56,222,232 + pxor %xmm14,%xmm15 .byte 102,15,56,222,240 + movaps %xmm11,%xmm12 .byte 102,15,56,222,248 movups 32(%rcx),%xmm0 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm11 - paddq %xmm15,%xmm15 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 .byte 102,15,56,222,209 - pand %xmm8,%xmm9 + pxor %xmm15,%xmm11 + psrad $31,%xmm14 .byte 102,15,56,222,217 - pcmpgtd %xmm15,%xmm14 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm14 .byte 102,15,56,222,225 - pxor %xmm9,%xmm15 + movdqa %xmm13,48(%rsp) .byte 102,15,56,222,233 + pxor %xmm14,%xmm15 .byte 102,15,56,222,241 + movaps %xmm12,%xmm13 .byte 102,15,56,222,249 + movups 48(%rcx),%xmm1 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm12 + movdqa %xmm9,%xmm14 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,208 + pxor %xmm15,%xmm12 + psrad $31,%xmm14 +.byte 102,15,56,222,216 paddq %xmm15,%xmm15 -.byte 102,15,56,223,208 - pand %xmm8,%xmm9 -.byte 102,15,56,223,216 - pcmpgtd %xmm15,%xmm14 -.byte 102,15,56,223,224 - pxor %xmm9,%xmm15 -.byte 102,15,56,223,232 -.byte 102,15,56,223,240 -.byte 102,15,56,223,248 + pand %xmm8,%xmm14 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + pxor %xmm14,%xmm15 +.byte 102,15,56,222,240 + movaps %xmm13,%xmm14 +.byte 102,15,56,222,248 - pshufd $19,%xmm14,%xmm9 - pxor %xmm14,%xmm14 - movdqa %xmm15,%xmm13 + movdqa %xmm9,%xmm0 + paddd %xmm9,%xmm9 +.byte 102,15,56,222,209 + pxor %xmm15,%xmm13 + psrad $31,%xmm0 +.byte 102,15,56,222,217 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm0 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm15 + movups (%r11),%xmm0 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups 16(%r11),%xmm1 + + pxor %xmm15,%xmm14 + psrad $31,%xmm9 +.byte 102,15,56,223,84,36,0 paddq %xmm15,%xmm15 - xorps 0(%rsp),%xmm2 pand %xmm8,%xmm9 - xorps 16(%rsp),%xmm3 - pcmpgtd %xmm15,%xmm14 +.byte 102,15,56,223,92,36,16 +.byte 102,15,56,223,100,36,32 pxor %xmm9,%xmm15 - - xorps 32(%rsp),%xmm4 - movups %xmm2,0(%rsi) - xorps 48(%rsp),%xmm5 - movups %xmm3,16(%rsi) - xorps 64(%rsp),%xmm6 - movups %xmm4,32(%rsi) - xorps 80(%rsp),%xmm7 - movups %xmm5,48(%rsi) +.byte 102,15,56,223,108,36,48 +.byte 102,15,56,223,116,36,64 +.byte 102,15,56,223,124,36,80 movl %r10d,%eax - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + movups %xmm2,-96(%rsi) + movups %xmm3,-80(%rsi) + movups %xmm4,-64(%rsi) + movups %xmm5,-48(%rsi) + movups %xmm6,-32(%rsi) + movups %xmm7,-16(%rsi) subq $96,%rdx jnc L$xts_dec_grandloop - leal 3(%rax,%rax,1),%eax + leal 7(%rax,%rax,1),%eax movq %r11,%rcx movl %eax,%r10d L$xts_dec_short: + pxor %xmm0,%xmm10 + pxor %xmm0,%xmm11 addq $96,%rdx jz L$xts_dec_done + pxor %xmm0,%xmm12 cmpq $32,%rdx jb L$xts_dec_one + pxor %xmm0,%xmm13 je L$xts_dec_two + pxor %xmm0,%xmm14 cmpq $64,%rdx jb L$xts_dec_three je L$xts_dec_four - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movdqu (%rdi),%xmm2 - pand %xmm8,%xmm9 movdqu 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movdqu 32(%rdi),%xmm4 pxor %xmm10,%xmm2 movdqu 48(%rdi),%xmm5 @@ -1906,7 +2175,7 @@ L$xts_dec_three: xorps %xmm10,%xmm2 movdqa %xmm13,%xmm10 xorps %xmm11,%xmm3 - movdqa %xmm15,%xmm11 + movdqa %xmm14,%xmm11 xorps %xmm12,%xmm4 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) @@ -1916,14 +2185,8 @@ L$xts_dec_three: .p2align 4 L$xts_dec_four: - pshufd $19,%xmm14,%xmm9 - movdqa %xmm15,%xmm14 - paddq %xmm15,%xmm15 movups (%rdi),%xmm2 - pand %xmm8,%xmm9 movups 16(%rdi),%xmm3 - pxor %xmm9,%xmm15 - movups 32(%rdi),%xmm4 xorps %xmm10,%xmm2 movups 48(%rdi),%xmm5 @@ -1934,16 +2197,16 @@ L$xts_dec_four: call _aesni_decrypt4 - xorps %xmm10,%xmm2 + pxor %xmm10,%xmm2 movdqa %xmm14,%xmm10 - xorps %xmm11,%xmm3 + pxor %xmm11,%xmm3 movdqa %xmm15,%xmm11 - xorps %xmm12,%xmm4 - movups %xmm2,(%rsi) - xorps %xmm13,%xmm5 - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm2,(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm3,16(%rsi) + movdqu %xmm4,32(%rsi) + movdqu %xmm5,48(%rsi) leaq 64(%rsi),%rsi jmp L$xts_dec_done @@ -2003,7 +2266,8 @@ L$oop_dec1_14: movups %xmm2,(%rsi) L$xts_dec_ret: - leaq 104(%rsp),%rsp + leaq (%rbp),%rsp + popq %rbp L$xts_dec_epilogue: .byte 0xf3,0xc3 @@ -2070,149 +2334,324 @@ L$cbc_enc_tail: .p2align 4 L$cbc_decrypt: - movups (%r8),%xmm9 + leaq (%rsp),%rax + pushq %rbp + subq $16,%rsp + andq $-16,%rsp + leaq -8(%rax),%rbp + movups (%r8),%xmm10 movl %r10d,%eax - cmpq $112,%rdx + cmpq $80,%rdx jbe L$cbc_dec_tail - shrl $1,%r10d + + movups (%rcx),%xmm0 + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 + cmpq $112,%rdx + jbe L$cbc_dec_six_or_seven + subq $112,%rdx - movl %r10d,%eax - movaps %xmm9,-24(%rsp) + leaq 112(%rcx),%rcx jmp L$cbc_dec_loop8_enter .p2align 4 L$cbc_dec_loop8: - movaps %xmm0,-24(%rsp) movups %xmm9,(%rsi) leaq 16(%rsi),%rsi L$cbc_dec_loop8_enter: - movups (%rcx),%xmm0 - movups (%rdi),%xmm2 - movups 16(%rdi),%xmm3 - movups 16(%rcx),%xmm1 + movdqu 96(%rdi),%xmm8 + pxor %xmm0,%xmm2 + movdqu 112(%rdi),%xmm9 + pxor %xmm0,%xmm3 + movups 16-112(%rcx),%xmm1 + pxor %xmm0,%xmm4 + xorq %r11,%r11 + cmpq $112,%rdx + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 - leaq 32(%rcx),%rcx - movdqu 32(%rdi),%xmm4 - xorps %xmm0,%xmm2 - movdqu 48(%rdi),%xmm5 - xorps %xmm0,%xmm3 - movdqu 64(%rdi),%xmm6 .byte 102,15,56,222,209 - pxor %xmm0,%xmm4 - movdqu 80(%rdi),%xmm7 + pxor %xmm0,%xmm9 + movups 32-112(%rcx),%xmm0 .byte 102,15,56,222,217 - pxor %xmm0,%xmm5 - movdqu 96(%rdi),%xmm8 .byte 102,15,56,222,225 - pxor %xmm0,%xmm6 - movdqu 112(%rdi),%xmm9 .byte 102,15,56,222,233 - pxor %xmm0,%xmm7 - decl %eax .byte 102,15,56,222,241 - pxor %xmm0,%xmm8 .byte 102,15,56,222,249 - pxor %xmm0,%xmm9 - movups (%rcx),%xmm0 + setnc %r11b .byte 102,68,15,56,222,193 + shlq $7,%r11 .byte 102,68,15,56,222,201 - movups 16(%rcx),%xmm1 - - call L$dec_loop8_enter + addq %rdi,%r11 + movups 48-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 64-112(%rcx),%xmm0 +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 80-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 96-112(%rcx),%xmm0 +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 112-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 128-112(%rcx),%xmm0 +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 144-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 160-112(%rcx),%xmm0 + cmpl $11,%eax + jb L$cbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 176-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 192-112(%rcx),%xmm0 + je L$cbc_dec_done +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movups 208-112(%rcx),%xmm1 +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 +.byte 102,68,15,56,222,192 +.byte 102,68,15,56,222,200 + movups 224-112(%rcx),%xmm0 +L$cbc_dec_done: +.byte 102,15,56,222,209 + pxor %xmm0,%xmm10 +.byte 102,15,56,222,217 + pxor %xmm0,%xmm11 +.byte 102,15,56,222,225 + pxor %xmm0,%xmm12 +.byte 102,15,56,222,233 + pxor %xmm0,%xmm13 +.byte 102,15,56,222,241 + pxor %xmm0,%xmm14 +.byte 102,15,56,222,249 + pxor %xmm0,%xmm15 +.byte 102,68,15,56,222,193 +.byte 102,68,15,56,222,201 + movdqu 80(%rdi),%xmm1 + +.byte 102,65,15,56,223,210 + movdqu 96(%rdi),%xmm10 + pxor %xmm0,%xmm1 +.byte 102,65,15,56,223,219 + pxor %xmm0,%xmm10 + movdqu 112(%rdi),%xmm0 + leaq 128(%rdi),%rdi +.byte 102,65,15,56,223,228 + movdqu 0(%r11),%xmm11 +.byte 102,65,15,56,223,237 + movdqu 16(%r11),%xmm12 +.byte 102,65,15,56,223,246 + movdqu 32(%r11),%xmm13 +.byte 102,65,15,56,223,255 + movdqu 48(%r11),%xmm14 +.byte 102,68,15,56,223,193 + movdqu 64(%r11),%xmm15 +.byte 102,69,15,56,223,202 + movdqa %xmm0,%xmm10 + movdqu 80(%r11),%xmm1 + movups -112(%rcx),%xmm0 - movups (%rdi),%xmm1 - movups 16(%rdi),%xmm0 - xorps -24(%rsp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%rdi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%rdi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%rdi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%rdi),%xmm0 - xorps %xmm1,%xmm7 - movups 96(%rdi),%xmm1 - xorps %xmm0,%xmm8 - movups 112(%rdi),%xmm0 - xorps %xmm1,%xmm9 movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 movups %xmm5,48(%rsi) - movl %r10d,%eax + movdqa %xmm14,%xmm5 movups %xmm6,64(%rsi) - movq %r11,%rcx + movdqa %xmm15,%xmm6 movups %xmm7,80(%rsi) - leaq 128(%rdi),%rdi + movdqa %xmm1,%xmm7 movups %xmm8,96(%rsi) leaq 112(%rsi),%rsi + subq $128,%rdx ja L$cbc_dec_loop8 movaps %xmm9,%xmm2 - movaps %xmm0,%xmm9 + leaq -112(%rcx),%rcx addq $112,%rdx jle L$cbc_dec_tail_collected - movups %xmm2,(%rsi) - leal 1(%r10,%r10,1),%eax + movups %xmm9,(%rsi) leaq 16(%rsi),%rsi + cmpq $80,%rdx + jbe L$cbc_dec_tail + + movaps %xmm11,%xmm2 +L$cbc_dec_six_or_seven: + cmpq $96,%rdx + ja L$cbc_dec_seven + + movaps %xmm7,%xmm8 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm8,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + leaq 80(%rsi),%rsi + movdqa %xmm7,%xmm2 + jmp L$cbc_dec_tail_collected + +.p2align 4 +L$cbc_dec_seven: + movups 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 + call _aesni_decrypt8 + movups 80(%rdi),%xmm9 + pxor %xmm10,%xmm2 + movups 96(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + pxor %xmm9,%xmm8 + movdqu %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + movdqa %xmm8,%xmm2 + jmp L$cbc_dec_tail_collected + L$cbc_dec_tail: movups (%rdi),%xmm2 - movaps %xmm2,%xmm8 - cmpq $16,%rdx + subq $16,%rdx jbe L$cbc_dec_one movups 16(%rdi),%xmm3 - movaps %xmm3,%xmm7 - cmpq $32,%rdx + movaps %xmm2,%xmm11 + subq $16,%rdx jbe L$cbc_dec_two movups 32(%rdi),%xmm4 - movaps %xmm4,%xmm6 - cmpq $48,%rdx + movaps %xmm3,%xmm12 + subq $16,%rdx jbe L$cbc_dec_three movups 48(%rdi),%xmm5 - cmpq $64,%rdx + movaps %xmm4,%xmm13 + subq $16,%rdx jbe L$cbc_dec_four movups 64(%rdi),%xmm6 - cmpq $80,%rdx - jbe L$cbc_dec_five - - movups 80(%rdi),%xmm7 - cmpq $96,%rdx - jbe L$cbc_dec_six - - movups 96(%rdi),%xmm8 - movaps %xmm9,-24(%rsp) - call _aesni_decrypt8 - movups (%rdi),%xmm1 - movups 16(%rdi),%xmm0 - xorps -24(%rsp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%rdi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%rdi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%rdi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%rdi),%xmm0 - xorps %xmm1,%xmm7 - movups 96(%rdi),%xmm9 - xorps %xmm0,%xmm8 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - leaq 96(%rsi),%rsi - movaps %xmm8,%xmm2 - subq $112,%rdx + movaps %xmm5,%xmm14 + movaps %xmm6,%xmm15 + xorps %xmm7,%xmm7 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm15,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + leaq 64(%rsi),%rsi + movdqa %xmm6,%xmm2 + subq $16,%rdx jmp L$cbc_dec_tail_collected + .p2align 4 L$cbc_dec_one: + movaps %xmm2,%xmm11 movups (%rcx),%xmm0 movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx @@ -2224,111 +2663,69 @@ L$oop_dec1_16: leaq 16(%rcx),%rcx jnz L$oop_dec1_16 .byte 102,15,56,223,209 - xorps %xmm9,%xmm2 - movaps %xmm8,%xmm9 - subq $16,%rdx + xorps %xmm10,%xmm2 + movaps %xmm11,%xmm10 jmp L$cbc_dec_tail_collected .p2align 4 L$cbc_dec_two: + movaps %xmm3,%xmm12 xorps %xmm4,%xmm4 call _aesni_decrypt3 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - movaps %xmm7,%xmm9 - movaps %xmm3,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm12,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + movdqa %xmm3,%xmm2 leaq 16(%rsi),%rsi - subq $32,%rdx jmp L$cbc_dec_tail_collected .p2align 4 L$cbc_dec_three: + movaps %xmm4,%xmm13 call _aesni_decrypt3 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%rsi) - movaps %xmm6,%xmm9 - movaps %xmm4,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm13,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + movdqa %xmm4,%xmm2 leaq 32(%rsi),%rsi - subq $48,%rdx jmp L$cbc_dec_tail_collected .p2align 4 L$cbc_dec_four: + movaps %xmm5,%xmm14 call _aesni_decrypt4 - xorps %xmm9,%xmm2 - movups 48(%rdi),%xmm9 - xorps %xmm8,%xmm3 - movups %xmm2,(%rsi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%rsi) - xorps %xmm6,%xmm5 - movups %xmm4,32(%rsi) - movaps %xmm5,%xmm2 + pxor %xmm10,%xmm2 + movaps %xmm14,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + movdqa %xmm5,%xmm2 leaq 48(%rsi),%rsi - subq $64,%rdx - jmp L$cbc_dec_tail_collected -.p2align 4 -L$cbc_dec_five: - xorps %xmm7,%xmm7 - call _aesni_decrypt6 - movups 16(%rdi),%xmm1 - movups 32(%rdi),%xmm0 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - xorps %xmm1,%xmm4 - movups 48(%rdi),%xmm1 - xorps %xmm0,%xmm5 - movups 64(%rdi),%xmm9 - xorps %xmm1,%xmm6 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - leaq 64(%rsi),%rsi - movaps %xmm6,%xmm2 - subq $80,%rdx - jmp L$cbc_dec_tail_collected -.p2align 4 -L$cbc_dec_six: - call _aesni_decrypt6 - movups 16(%rdi),%xmm1 - movups 32(%rdi),%xmm0 - xorps %xmm9,%xmm2 - xorps %xmm8,%xmm3 - xorps %xmm1,%xmm4 - movups 48(%rdi),%xmm1 - xorps %xmm0,%xmm5 - movups 64(%rdi),%xmm0 - xorps %xmm1,%xmm6 - movups 80(%rdi),%xmm9 - xorps %xmm0,%xmm7 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - movaps %xmm7,%xmm2 - subq $96,%rdx jmp L$cbc_dec_tail_collected + .p2align 4 L$cbc_dec_tail_collected: + movups %xmm10,(%r8) andq $15,%rdx - movups %xmm9,(%r8) jnz L$cbc_dec_tail_partial movups %xmm2,(%rsi) jmp L$cbc_dec_ret .p2align 4 L$cbc_dec_tail_partial: - movaps %xmm2,-24(%rsp) + movaps %xmm2,(%rsp) movq $16,%rcx movq %rsi,%rdi subq %rdx,%rcx - leaq -24(%rsp),%rsi + leaq (%rsp),%rsi .long 0x9066A4F3 L$cbc_dec_ret: + leaq (%rbp),%rsp + popq %rbp L$cbc_ret: .byte 0xf3,0xc3 @@ -2571,6 +2968,8 @@ L$increment64: .long 1,0,0,0 L$xts_magic: .long 0x87,0,1,0 +L$increment1: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 diff --git a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s index b9ec30c..1327e82 100644 --- a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s +++ b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s @@ -597,6 +597,468 @@ L$cbc_abort: popq %rbp .byte 0xf3,0xc3 +.globl _padlock_cfb_encrypt + +.p2align 4 +_padlock_cfb_encrypt: + pushq %rbp + pushq %rbx + + xorl %eax,%eax + testq $15,%rdx + jnz L$cfb_abort + testq $15,%rcx + jnz L$cfb_abort + leaq L$padlock_saved_context(%rip),%rax + pushf + cld + call _padlock_verify_ctx + leaq 16(%rdx),%rdx + xorl %eax,%eax + xorl %ebx,%ebx + testl $32,(%rdx) + jnz L$cfb_aligned + testq $15,%rdi + setz %al + testq $15,%rsi + setz %bl + testl %ebx,%eax + jnz L$cfb_aligned + negq %rax + movq $512,%rbx + notq %rax + leaq (%rsp),%rbp + cmpq %rbx,%rcx + cmovcq %rcx,%rbx + andq %rbx,%rax + movq %rcx,%rbx + negq %rax + andq $512-1,%rbx + leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + jmp L$cfb_loop +.p2align 4 +L$cfb_loop: + cmpq %rcx,%rbx + cmovaq %rcx,%rbx + movq %rdi,%r8 + movq %rsi,%r9 + movq %rcx,%r10 + movq %rbx,%rcx + movq %rbx,%r11 + testq $15,%rdi + cmovnzq %rsp,%rdi + testq $15,%rsi + jz L$cfb_inp_aligned + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi + movq %rbx,%rcx + movq %rdi,%rsi +L$cfb_inp_aligned: + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,224 + movdqa (%rax),%xmm0 + movdqa %xmm0,-16(%rdx) + movq %r8,%rdi + movq %r11,%rbx + testq $15,%rdi + jz L$cfb_out_aligned + movq %rbx,%rcx + leaq (%rsp),%rsi + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi +L$cfb_out_aligned: + movq %r9,%rsi + movq %r10,%rcx + addq %rbx,%rdi + addq %rbx,%rsi + subq %rbx,%rcx + movq $512,%rbx + jnz L$cfb_loop + cmpq %rbp,%rsp + je L$cfb_done + + pxor %xmm0,%xmm0 + leaq (%rsp),%rax +L$cfb_bzero: + movaps %xmm0,(%rax) + leaq 16(%rax),%rax + cmpq %rax,%rbp + ja L$cfb_bzero + +L$cfb_done: + leaq (%rbp),%rsp + jmp L$cfb_exit + +.p2align 4 +L$cfb_aligned: + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,224 + movdqa (%rax),%xmm0 + movdqa %xmm0,-16(%rdx) +L$cfb_exit: + movl $1,%eax + leaq 8(%rsp),%rsp +L$cfb_abort: + popq %rbx + popq %rbp + .byte 0xf3,0xc3 + +.globl _padlock_ofb_encrypt + +.p2align 4 +_padlock_ofb_encrypt: + pushq %rbp + pushq %rbx + + xorl %eax,%eax + testq $15,%rdx + jnz L$ofb_abort + testq $15,%rcx + jnz L$ofb_abort + leaq L$padlock_saved_context(%rip),%rax + pushf + cld + call _padlock_verify_ctx + leaq 16(%rdx),%rdx + xorl %eax,%eax + xorl %ebx,%ebx + testl $32,(%rdx) + jnz L$ofb_aligned + testq $15,%rdi + setz %al + testq $15,%rsi + setz %bl + testl %ebx,%eax + jnz L$ofb_aligned + negq %rax + movq $512,%rbx + notq %rax + leaq (%rsp),%rbp + cmpq %rbx,%rcx + cmovcq %rcx,%rbx + andq %rbx,%rax + movq %rcx,%rbx + negq %rax + andq $512-1,%rbx + leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx + jmp L$ofb_loop +.p2align 4 +L$ofb_loop: + cmpq %rcx,%rbx + cmovaq %rcx,%rbx + movq %rdi,%r8 + movq %rsi,%r9 + movq %rcx,%r10 + movq %rbx,%rcx + movq %rbx,%r11 + testq $15,%rdi + cmovnzq %rsp,%rdi + testq $15,%rsi + jz L$ofb_inp_aligned + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi + movq %rbx,%rcx + movq %rdi,%rsi +L$ofb_inp_aligned: + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,232 + movdqa (%rax),%xmm0 + movdqa %xmm0,-16(%rdx) + movq %r8,%rdi + movq %r11,%rbx + testq $15,%rdi + jz L$ofb_out_aligned + movq %rbx,%rcx + leaq (%rsp),%rsi + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi +L$ofb_out_aligned: + movq %r9,%rsi + movq %r10,%rcx + addq %rbx,%rdi + addq %rbx,%rsi + subq %rbx,%rcx + movq $512,%rbx + jnz L$ofb_loop + cmpq %rbp,%rsp + je L$ofb_done + + pxor %xmm0,%xmm0 + leaq (%rsp),%rax +L$ofb_bzero: + movaps %xmm0,(%rax) + leaq 16(%rax),%rax + cmpq %rax,%rbp + ja L$ofb_bzero + +L$ofb_done: + leaq (%rbp),%rsp + jmp L$ofb_exit + +.p2align 4 +L$ofb_aligned: + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,232 + movdqa (%rax),%xmm0 + movdqa %xmm0,-16(%rdx) +L$ofb_exit: + movl $1,%eax + leaq 8(%rsp),%rsp +L$ofb_abort: + popq %rbx + popq %rbp + .byte 0xf3,0xc3 + +.globl _padlock_ctr32_encrypt + +.p2align 4 +_padlock_ctr32_encrypt: + pushq %rbp + pushq %rbx + + xorl %eax,%eax + testq $15,%rdx + jnz L$ctr32_abort + testq $15,%rcx + jnz L$ctr32_abort + leaq L$padlock_saved_context(%rip),%rax + pushf + cld + call _padlock_verify_ctx + leaq 16(%rdx),%rdx + xorl %eax,%eax + xorl %ebx,%ebx + testl $32,(%rdx) + jnz L$ctr32_aligned + testq $15,%rdi + setz %al + testq $15,%rsi + setz %bl + testl %ebx,%eax + jnz L$ctr32_aligned + negq %rax + movq $512,%rbx + notq %rax + leaq (%rsp),%rbp + cmpq %rbx,%rcx + cmovcq %rcx,%rbx + andq %rbx,%rax + movq %rcx,%rbx + negq %rax + andq $512-1,%rbx + leaq (%rax,%rbp,1),%rsp + movq $512,%rax + cmovzq %rax,%rbx +L$ctr32_reenter: + movl -4(%rdx),%eax + bswapl %eax + negl %eax + andl $31,%eax + movq $512,%rbx + shll $4,%eax + cmovzq %rbx,%rax + cmpq %rax,%rcx + cmovaq %rax,%rbx + cmovbeq %rcx,%rbx + cmpq %rbx,%rcx + ja L$ctr32_loop + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $32,%rax + movq $-32,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jz L$ctr32_unaligned_tail + jmp L$ctr32_loop +.p2align 4 +L$ctr32_loop: + cmpq %rcx,%rbx + cmovaq %rcx,%rbx + movq %rdi,%r8 + movq %rsi,%r9 + movq %rcx,%r10 + movq %rbx,%rcx + movq %rbx,%r11 + testq $15,%rdi + cmovnzq %rsp,%rdi + testq $15,%rsi + jz L$ctr32_inp_aligned + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi + movq %rbx,%rcx + movq %rdi,%rsi +L$ctr32_inp_aligned: + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,216 + movl -4(%rdx),%eax + testl $4294901760,%eax + jnz L$ctr32_no_carry + bswapl %eax + addl $65536,%eax + bswapl %eax + movl %eax,-4(%rdx) +L$ctr32_no_carry: + movq %r8,%rdi + movq %r11,%rbx + testq $15,%rdi + jz L$ctr32_out_aligned + movq %rbx,%rcx + leaq (%rsp),%rsi + shrq $3,%rcx +.byte 0xf3,0x48,0xa5 + subq %rbx,%rdi +L$ctr32_out_aligned: + movq %r9,%rsi + movq %r10,%rcx + addq %rbx,%rdi + addq %rbx,%rsi + subq %rbx,%rcx + movq $512,%rbx + jz L$ctr32_break + cmpq %rbx,%rcx + jae L$ctr32_loop + movq %rcx,%rbx + movq %rsi,%rax + cmpq %rsp,%rbp + cmoveq %rdi,%rax + addq %rcx,%rax + negq %rax + andq $4095,%rax + cmpq $32,%rax + movq $-32,%rax + cmovaeq %rbx,%rax + andq %rax,%rbx + jnz L$ctr32_loop +L$ctr32_unaligned_tail: + xorl %eax,%eax + cmpq %rsp,%rbp + cmoveq %rcx,%rax + movq %rdi,%r8 + movq %rcx,%rbx + subq %rax,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + movq %rsp,%rsi + movq %r8,%rdi + movq %rbx,%rcx + jmp L$ctr32_loop +.p2align 4 +L$ctr32_break: + cmpq %rbp,%rsp + je L$ctr32_done + + pxor %xmm0,%xmm0 + leaq (%rsp),%rax +L$ctr32_bzero: + movaps %xmm0,(%rax) + leaq 16(%rax),%rax + cmpq %rax,%rbp + ja L$ctr32_bzero + +L$ctr32_done: + leaq (%rbp),%rsp + jmp L$ctr32_exit + +.p2align 4 +L$ctr32_aligned: + movl -4(%rdx),%eax + bswapl %eax + negl %eax + andl $65535,%eax + movq $1048576,%rbx + shll $4,%eax + cmovzq %rbx,%rax + cmpq %rax,%rcx + cmovaq %rax,%rbx + cmovbeq %rcx,%rbx + jbe L$ctr32_aligned_skip + +L$ctr32_aligned_loop: + movq %rcx,%r10 + movq %rbx,%rcx + movq %rbx,%r11 + + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,216 + + movl -4(%rdx),%eax + bswapl %eax + addl $65536,%eax + bswapl %eax + movl %eax,-4(%rdx) + + movq %r10,%rcx + subq %r11,%rcx + movq $1048576,%rbx + jz L$ctr32_exit + cmpq %rbx,%rcx + jae L$ctr32_aligned_loop + +L$ctr32_aligned_skip: + leaq (%rsi,%rcx,1),%rbp + negq %rbp + andq $4095,%rbp + xorl %eax,%eax + cmpq $32,%rbp + movq $32-1,%rbp + cmovaeq %rax,%rbp + andq %rcx,%rbp + subq %rbp,%rcx + jz L$ctr32_aligned_tail + leaq -16(%rdx),%rax + leaq 16(%rdx),%rbx + shrq $4,%rcx +.byte 0xf3,0x0f,0xa7,216 + testq %rbp,%rbp + jz L$ctr32_exit + +L$ctr32_aligned_tail: + movq %rdi,%r8 + movq %rbp,%rbx + movq %rbp,%rcx + leaq (%rsp),%rbp + subq %rcx,%rsp + shrq $3,%rcx + leaq (%rsp),%rdi +.byte 0xf3,0x48,0xa5 + leaq (%r8),%rdi + leaq (%rsp),%rsi + movq %rbx,%rcx + jmp L$ctr32_loop +L$ctr32_exit: + movl $1,%eax + leaq 8(%rsp),%rsp +L$ctr32_abort: + popq %rbx + popq %rbp + .byte 0xf3,0xc3 + .byte 86,73,65,32,80,97,100,108,111,99,107,32,120,56,54,95,54,52,32,109,111,100,117,108,101,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 4 .data diff --git a/lib/accelerated/x86/macosx/padlock-x86-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-macosx.s index 7a38b7c..1a2fa92 100644 --- a/lib/accelerated/x86/macosx/padlock-x86-macosx.s +++ b/lib/accelerated/x86/macosx/padlock-x86-macosx.s @@ -510,6 +510,351 @@ L016cbc_abort: popl %ebx popl %ebp ret +.globl _padlock_cfb_encrypt +.align 4 +_padlock_cfb_encrypt: +L_padlock_cfb_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + testl $15,%edx + jnz L028cfb_abort + testl $15,%ecx + jnz L028cfb_abort + leal Lpadlock_saved_context-L029cfb_pic_point,%eax + pushfl + cld + call __padlock_verify_ctx +L029cfb_pic_point: + leal 16(%edx),%edx + xorl %eax,%eax + xorl %ebx,%ebx + testl $32,(%edx) + jnz L030cfb_aligned + testl $15,%edi + setz %al + testl $15,%esi + setz %bl + testl %ebx,%eax + jnz L030cfb_aligned + negl %eax + movl $512,%ebx + notl %eax + leal -24(%esp),%ebp + cmpl %ebx,%ecx + cmovcl %ecx,%ebx + andl %ebx,%eax + movl %ecx,%ebx + negl %eax + andl $511,%ebx + leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + jmp L031cfb_loop +.align 4,0x90 +L031cfb_loop: + movl %edi,(%ebp) + movl %esi,4(%ebp) + movl %ecx,8(%ebp) + movl %ebx,%ecx + movl %ebx,12(%ebp) + testl $15,%edi + cmovnzl %esp,%edi + testl $15,%esi + jz L032cfb_inp_aligned + shrl $2,%ecx +.byte 243,165 + subl %ebx,%edi + movl %ebx,%ecx + movl %edi,%esi +L032cfb_inp_aligned: + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +.byte 243,15,167,224 + movaps (%eax),%xmm0 + movaps %xmm0,-16(%edx) + movl (%ebp),%edi + movl 12(%ebp),%ebx + testl $15,%edi + jz L033cfb_out_aligned + movl %ebx,%ecx + leal (%esp),%esi + shrl $2,%ecx +.byte 243,165 + subl %ebx,%edi +L033cfb_out_aligned: + movl 4(%ebp),%esi + movl 8(%ebp),%ecx + addl %ebx,%edi + addl %ebx,%esi + subl %ebx,%ecx + movl $512,%ebx + jnz L031cfb_loop + cmpl %ebp,%esp + je L034cfb_done + pxor %xmm0,%xmm0 + leal (%esp),%eax +L035cfb_bzero: + movaps %xmm0,(%eax) + leal 16(%eax),%eax + cmpl %eax,%ebp + ja L035cfb_bzero +L034cfb_done: + movl 16(%ebp),%ebp + leal 24(%ebp),%esp + jmp L036cfb_exit +.align 4,0x90 +L030cfb_aligned: + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +.byte 243,15,167,224 + movaps (%eax),%xmm0 + movaps %xmm0,-16(%edx) +L036cfb_exit: + movl $1,%eax + leal 4(%esp),%esp +L028cfb_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _padlock_ofb_encrypt +.align 4 +_padlock_ofb_encrypt: +L_padlock_ofb_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + testl $15,%edx + jnz L037ofb_abort + testl $15,%ecx + jnz L037ofb_abort + leal Lpadlock_saved_context-L038ofb_pic_point,%eax + pushfl + cld + call __padlock_verify_ctx +L038ofb_pic_point: + leal 16(%edx),%edx + xorl %eax,%eax + xorl %ebx,%ebx + testl $32,(%edx) + jnz L039ofb_aligned + testl $15,%edi + setz %al + testl $15,%esi + setz %bl + testl %ebx,%eax + jnz L039ofb_aligned + negl %eax + movl $512,%ebx + notl %eax + leal -24(%esp),%ebp + cmpl %ebx,%ecx + cmovcl %ecx,%ebx + andl %ebx,%eax + movl %ecx,%ebx + negl %eax + andl $511,%ebx + leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + jmp L040ofb_loop +.align 4,0x90 +L040ofb_loop: + movl %edi,(%ebp) + movl %esi,4(%ebp) + movl %ecx,8(%ebp) + movl %ebx,%ecx + movl %ebx,12(%ebp) + testl $15,%edi + cmovnzl %esp,%edi + testl $15,%esi + jz L041ofb_inp_aligned + shrl $2,%ecx +.byte 243,165 + subl %ebx,%edi + movl %ebx,%ecx + movl %edi,%esi +L041ofb_inp_aligned: + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +.byte 243,15,167,232 + movaps (%eax),%xmm0 + movaps %xmm0,-16(%edx) + movl (%ebp),%edi + movl 12(%ebp),%ebx + testl $15,%edi + jz L042ofb_out_aligned + movl %ebx,%ecx + leal (%esp),%esi + shrl $2,%ecx +.byte 243,165 + subl %ebx,%edi +L042ofb_out_aligned: + movl 4(%ebp),%esi + movl 8(%ebp),%ecx + addl %ebx,%edi + addl %ebx,%esi + subl %ebx,%ecx + movl $512,%ebx + jnz L040ofb_loop + cmpl %ebp,%esp + je L043ofb_done + pxor %xmm0,%xmm0 + leal (%esp),%eax +L044ofb_bzero: + movaps %xmm0,(%eax) + leal 16(%eax),%eax + cmpl %eax,%ebp + ja L044ofb_bzero +L043ofb_done: + movl 16(%ebp),%ebp + leal 24(%ebp),%esp + jmp L045ofb_exit +.align 4,0x90 +L039ofb_aligned: + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +.byte 243,15,167,232 + movaps (%eax),%xmm0 + movaps %xmm0,-16(%edx) +L045ofb_exit: + movl $1,%eax + leal 4(%esp),%esp +L037ofb_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _padlock_ctr32_encrypt +.align 4 +_padlock_ctr32_encrypt: +L_padlock_ctr32_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + testl $15,%edx + jnz L046ctr32_abort + testl $15,%ecx + jnz L046ctr32_abort + leal Lpadlock_saved_context-L047ctr32_pic_point,%eax + pushfl + cld + call __padlock_verify_ctx +L047ctr32_pic_point: + leal 16(%edx),%edx + xorl %eax,%eax + movq -16(%edx),%mm0 + movl $512,%ebx + notl %eax + leal -24(%esp),%ebp + cmpl %ebx,%ecx + cmovcl %ecx,%ebx + andl %ebx,%eax + movl %ecx,%ebx + negl %eax + andl $511,%ebx + leal (%eax,%ebp,1),%esp + movl $512,%eax + cmovzl %eax,%ebx + movl %ebp,%eax + andl $-16,%ebp + andl $-16,%esp + movl %eax,16(%ebp) + jmp L048ctr32_loop +.align 4,0x90 +L048ctr32_loop: + movl %edi,(%ebp) + movl %esi,4(%ebp) + movl %ecx,8(%ebp) + movl %ebx,%ecx + movl %ebx,12(%ebp) + movl -4(%edx),%ecx + xorl %edi,%edi + movl -8(%edx),%eax +L049ctr32_prepare: + movl %ecx,12(%esp,%edi,1) + bswap %ecx + movq %mm0,(%esp,%edi,1) + incl %ecx + movl %eax,8(%esp,%edi,1) + bswap %ecx + leal 16(%edi),%edi + cmpl %ebx,%edi + jb L049ctr32_prepare + movl %ecx,-4(%edx) + leal (%esp),%esi + leal (%esp),%edi + movl %ebx,%ecx + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +.byte 243,15,167,200 + movl (%ebp),%edi + movl 12(%ebp),%ebx + movl 4(%ebp),%esi + xorl %ecx,%ecx +L050ctr32_xor: + movups (%esi,%ecx,1),%xmm1 + leal 16(%ecx),%ecx + pxor -16(%esp,%ecx,1),%xmm1 + movups %xmm1,-16(%edi,%ecx,1) + cmpl %ebx,%ecx + jb L050ctr32_xor + movl 8(%ebp),%ecx + addl %ebx,%edi + addl %ebx,%esi + subl %ebx,%ecx + movl $512,%ebx + jnz L048ctr32_loop + pxor %xmm0,%xmm0 + leal (%esp),%eax +L051ctr32_bzero: + movaps %xmm0,(%eax) + leal 16(%eax),%eax + cmpl %eax,%ebp + ja L051ctr32_bzero +L052ctr32_done: + movl 16(%ebp),%ebp + leal 24(%ebp),%esp + movl $1,%eax + leal 4(%esp),%esp + emms +L046ctr32_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret .globl _padlock_xstore .align 4 _padlock_xstore: @@ -526,10 +871,10 @@ __win32_segv_handler: movl 4(%esp),%edx movl 12(%esp),%ecx cmpl $3221225477,(%edx) - jne L028ret + jne L053ret addl $4,184(%ecx) movl $0,%eax -L028ret: +L053ret: ret .globl _padlock_sha1_oneshot .align 4 -- 1.8.4.2