From 302b2107bf1fa1afead856823da70ac65ff8d362 Mon Sep 17 00:00:00 2001 From: Nikos Mavrogiannopoulos Date: Wed, 4 Dec 2013 18:19:03 +0100 Subject: [PATCH] Added fix for #973210 --- gnutls-3.2.7-asm.patch | 11608 +++++++++++++++++++++++++++++++++++++++ gnutls.spec | 7 +- 2 files changed, 11613 insertions(+), 2 deletions(-) create mode 100644 gnutls-3.2.7-asm.patch diff --git a/gnutls-3.2.7-asm.patch b/gnutls-3.2.7-asm.patch new file mode 100644 index 0000000..511773e --- /dev/null +++ b/gnutls-3.2.7-asm.patch @@ -0,0 +1,11608 @@ +From 8a7565113ab937cc99f8f4c929bde2ee08fc498c Mon Sep 17 00:00:00 2001 +From: Nikos Mavrogiannopoulos +Date: Tue, 26 Nov 2013 23:19:45 +0100 +Subject: [PATCH 1/2] updated auto-generated asm files. This fixes a valgrind + complaint when AES-NI is in use. + +--- + .../x86/coff/appro-aes-gcm-x86-64-coff.s | 574 ++++-- + lib/accelerated/x86/coff/appro-aes-x86-64-coff.s | 1826 ++++++++++++-------- + lib/accelerated/x86/coff/padlock-x86-64-coff.s | 495 ++++++ + lib/accelerated/x86/coff/padlock-x86-coff.s | 352 +++- + lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s | 515 ++++-- + lib/accelerated/x86/elf/appro-aes-x86-64.s | 1609 ++++++++++------- + lib/accelerated/x86/elf/padlock-x86-64.s | 462 +++++ + lib/accelerated/x86/elf/padlock-x86.s | 575 +++++- + .../x86/macosx/appro-aes-gcm-x86-64-macosx.s | 515 ++++-- + .../x86/macosx/appro-aes-x86-64-macosx.s | 1609 ++++++++++------- + lib/accelerated/x86/macosx/padlock-x86-64-macosx.s | 462 +++++ + lib/accelerated/x86/macosx/padlock-x86-macosx.s | 349 +++- + 12 files changed, 6978 insertions(+), 2365 deletions(-) + +diff --git a/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s b/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s +index fa449d6..ceb9108 100644 +--- a/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s ++++ b/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s +@@ -717,6 +717,11 @@ gcm_ghash_4bit: + .def gcm_init_clmul; .scl 2; .type 32; .endef + .p2align 4 + gcm_init_clmul: ++.L_init_clmul: ++.LSEH_begin_gcm_init_clmul: ++ ++.byte 0x48,0x83,0xec,0x18 ++.byte 0x0f,0x29,0x34,0x24 + movdqu (%rdx),%xmm2 + pshufd $78,%xmm2,%xmm2 + +@@ -735,15 +740,15 @@ gcm_init_clmul: + pxor %xmm5,%xmm2 + + ++ pshufd $78,%xmm2,%xmm6 + movdqa %xmm2,%xmm0 ++ pxor %xmm2,%xmm6 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 +- pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 +- pxor %xmm2,%xmm4 + .byte 102,15,58,68,194,0 + .byte 102,15,58,68,202,17 +-.byte 102,15,58,68,220,0 ++.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + +@@ -753,44 +758,137 @@ gcm_init_clmul: + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + ++ movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 ++ psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 ++ psllq $57,%xmm0 ++ movdqa %xmm0,%xmm3 ++ pslldq $8,%xmm0 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ ++ ++ movdqa %xmm0,%xmm4 ++ psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 ++ pxor %xmm4,%xmm0 ++ psrlq $1,%xmm0 ++ pxor %xmm1,%xmm0 ++ pshufd $78,%xmm2,%xmm3 ++ pshufd $78,%xmm0,%xmm4 ++ pxor %xmm2,%xmm3 ++ movdqu %xmm2,0(%rcx) ++ pxor %xmm0,%xmm4 ++ movdqu %xmm0,16(%rcx) ++.byte 102,15,58,15,227,8 ++ movdqu %xmm4,32(%rcx) ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm3 ++ pxor %xmm0,%xmm3 ++.byte 102,15,58,68,194,0 ++.byte 102,15,58,68,202,17 ++.byte 102,15,58,68,222,0 ++ pxor %xmm0,%xmm3 ++ pxor %xmm1,%xmm3 ++ ++ movdqa %xmm3,%xmm4 ++ psrldq $8,%xmm3 ++ pslldq $8,%xmm4 ++ pxor %xmm3,%xmm1 ++ pxor %xmm4,%xmm0 ++ ++ movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + 
psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 +- movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 +- psrldq $8,%xmm4 +- pxor %xmm3,%xmm0 +- pxor %xmm4,%xmm1 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 ++ psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 ++ pxor %xmm1,%xmm0 ++ movdqa %xmm0,%xmm5 ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm3 ++ pxor %xmm0,%xmm3 ++.byte 102,15,58,68,194,0 ++.byte 102,15,58,68,202,17 ++.byte 102,15,58,68,222,0 ++ pxor %xmm0,%xmm3 ++ pxor %xmm1,%xmm3 ++ ++ movdqa %xmm3,%xmm4 ++ psrldq $8,%xmm3 ++ pslldq $8,%xmm4 ++ pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 +- pxor %xmm1,%xmm4 ++ ++ movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 ++ psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 ++ pxor %xmm3,%xmm0 ++ psllq $57,%xmm0 ++ movdqa %xmm0,%xmm3 ++ pslldq $8,%xmm0 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ ++ ++ movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 + pxor %xmm4,%xmm0 +- movdqu %xmm2,(%rcx) +- movdqu %xmm0,16(%rcx) ++ psrlq $1,%xmm0 ++ pxor %xmm1,%xmm0 ++ pshufd $78,%xmm5,%xmm3 ++ pshufd $78,%xmm0,%xmm4 ++ pxor %xmm5,%xmm3 ++ movdqu %xmm5,48(%rcx) ++ pxor %xmm0,%xmm4 ++ movdqu %xmm0,64(%rcx) ++.byte 102,15,58,15,227,8 ++ movdqu %xmm4,80(%rcx) ++ movaps (%rsp),%xmm6 ++ leaq 24(%rsp),%rsp ++.LSEH_end_gcm_init_clmul: + .byte 0xf3,0xc3 + + .globl gcm_gmult_clmul + .def gcm_gmult_clmul; .scl 2; .type 32; .endef + .p2align 4 + gcm_gmult_clmul: ++.L_gmult_clmul: + movdqu (%rcx),%xmm0 + movdqa .Lbswap_mask(%rip),%xmm5 + movdqu (%rdx),%xmm2 ++ movdqu 32(%rdx),%xmm4 + .byte 102,15,56,0,197 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 +- pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 +- pxor %xmm2,%xmm4 + .byte 102,15,58,68,194,0 + .byte 102,15,58,68,202,17 + .byte 102,15,58,68,220,0 +@@ -803,194 +901,372 @@ gcm_gmult_clmul: + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + ++ movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 +- psllq $1,%xmm0 +- pxor %xmm3,%xmm0 + psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 +- movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 +- psrldq $8,%xmm4 +- pxor %xmm3,%xmm0 +- pxor %xmm4,%xmm1 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 +- psrlq $5,%xmm0 +- pxor %xmm4,%xmm0 + psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 + pxor %xmm4,%xmm0 +- pxor %xmm1,%xmm4 + psrlq $1,%xmm0 +- pxor %xmm4,%xmm0 ++ pxor %xmm1,%xmm0 + .byte 102,15,56,0,197 + movdqu %xmm0,(%rcx) + .byte 0xf3,0xc3 + + .globl gcm_ghash_clmul + .def gcm_ghash_clmul; .scl 2; .type 32; .endef +-.p2align 4 ++.p2align 5 + gcm_ghash_clmul: ++.L_ghash_clmul: ++ leaq -136(%rsp),%rax + .LSEH_begin_gcm_ghash_clmul: + +-.byte 0x48,0x83,0xec,0x58 +-.byte 0x0f,0x29,0x34,0x24 +-.byte 0x0f,0x29,0x7c,0x24,0x10 +-.byte 0x44,0x0f,0x29,0x44,0x24,0x20 +-.byte 0x44,0x0f,0x29,0x4c,0x24,0x30 +-.byte 0x44,0x0f,0x29,0x54,0x24,0x40 ++.byte 0x48,0x8d,0x60,0xe0 ++.byte 0x0f,0x29,0x70,0xe0 ++.byte 0x0f,0x29,0x78,0xf0 ++.byte 0x44,0x0f,0x29,0x00 ++.byte 0x44,0x0f,0x29,0x48,0x10 ++.byte 0x44,0x0f,0x29,0x50,0x20 ++.byte 0x44,0x0f,0x29,0x58,0x30 ++.byte 0x44,0x0f,0x29,0x60,0x40 ++.byte 0x44,0x0f,0x29,0x68,0x50 ++.byte 0x44,0x0f,0x29,0x70,0x60 ++.byte 0x44,0x0f,0x29,0x78,0x70 + movdqa .Lbswap_mask(%rip),%xmm5 ++ movq $11547335547999543296,%rax + + movdqu (%rcx),%xmm0 + movdqu (%rdx),%xmm2 ++ movdqu 
32(%rdx),%xmm10 + .byte 102,15,56,0,197 + + subq $16,%r9 + jz .Lodd_tail + +- movdqu 16(%rdx),%xmm8 ++ movdqu 16(%rdx),%xmm9 ++ cmpq $48,%r9 ++ jb .Lskip4x + ++ subq $48,%r9 ++ movdqu 48(%rdx),%xmm14 ++ movdqu 64(%rdx),%xmm15 + + + + +- movdqu (%r8),%xmm3 +- movdqu 16(%r8),%xmm6 +-.byte 102,15,56,0,221 ++ movdqu 48(%r8),%xmm6 ++ movdqu 32(%r8),%xmm11 + .byte 102,15,56,0,245 +- pxor %xmm3,%xmm0 +- movdqa %xmm6,%xmm7 +- pshufd $78,%xmm6,%xmm3 +- pshufd $78,%xmm2,%xmm4 +- pxor %xmm6,%xmm3 +- pxor %xmm2,%xmm4 ++.byte 102,68,15,56,0,221 ++ movdqa %xmm6,%xmm8 ++ pshufd $78,%xmm6,%xmm7 ++ pxor %xmm6,%xmm7 + .byte 102,15,58,68,242,0 +-.byte 102,15,58,68,250,17 +-.byte 102,15,58,68,220,0 +- pxor %xmm6,%xmm3 +- pxor %xmm7,%xmm3 ++.byte 102,68,15,58,68,194,17 ++.byte 102,65,15,58,68,250,0 ++ ++ movdqa %xmm11,%xmm13 ++ pshufd $78,%xmm11,%xmm12 ++ pxor %xmm11,%xmm12 ++.byte 102,69,15,58,68,217,0 ++.byte 102,69,15,58,68,233,17 ++ xorps %xmm11,%xmm6 ++.byte 102,69,15,58,68,226,16 ++ xorps %xmm13,%xmm8 ++ movups 80(%rdx),%xmm10 ++ xorps %xmm12,%xmm7 ++ ++ movdqu 16(%r8),%xmm11 ++ movdqu 0(%r8),%xmm3 ++.byte 102,68,15,56,0,221 ++.byte 102,15,56,0,221 ++ movdqa %xmm11,%xmm13 ++ pshufd $78,%xmm11,%xmm12 ++ pxor %xmm3,%xmm0 ++ pxor %xmm11,%xmm12 ++.byte 102,69,15,58,68,222,0 ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm3 ++ pxor %xmm0,%xmm3 ++.byte 102,69,15,58,68,238,17 ++ xorps %xmm11,%xmm6 ++.byte 102,69,15,58,68,226,0 ++ xorps %xmm13,%xmm8 ++ ++ leaq 64(%r8),%r8 ++ subq $64,%r9 ++ jc .Ltail4x ++ ++ jmp .Lmod4_loop ++.p2align 5 ++.Lmod4_loop: ++.byte 102,65,15,58,68,199,0 ++ xorps %xmm12,%xmm7 ++ movdqu 48(%r8),%xmm11 ++.byte 102,68,15,56,0,221 ++.byte 102,65,15,58,68,207,17 ++ xorps %xmm6,%xmm0 ++ movdqu 32(%r8),%xmm6 ++ movdqa %xmm11,%xmm13 ++ pshufd $78,%xmm11,%xmm12 ++.byte 102,65,15,58,68,218,16 ++ xorps %xmm8,%xmm1 ++ pxor %xmm11,%xmm12 ++.byte 102,15,56,0,245 ++ movups 32(%rdx),%xmm10 ++.byte 102,68,15,58,68,218,0 ++ xorps %xmm7,%xmm3 ++ movdqa %xmm6,%xmm8 ++ pshufd $78,%xmm6,%xmm7 + ++ pxor %xmm0,%xmm3 ++ pxor %xmm6,%xmm7 ++ pxor %xmm1,%xmm3 + movdqa %xmm3,%xmm4 +- psrldq $8,%xmm3 ++ pslldq $8,%xmm3 ++.byte 102,68,15,58,68,234,17 ++ psrldq $8,%xmm4 ++ pxor %xmm3,%xmm0 ++ movdqa .L7_mask(%rip),%xmm3 ++ pxor %xmm4,%xmm1 ++.byte 102,72,15,110,224 ++ ++ pand %xmm0,%xmm3 ++.byte 102,15,56,0,227 ++.byte 102,69,15,58,68,226,0 ++ pxor %xmm0,%xmm4 ++ psllq $57,%xmm4 ++ movdqa %xmm4,%xmm3 + pslldq $8,%xmm4 +- pxor %xmm3,%xmm7 +- pxor %xmm4,%xmm6 ++.byte 102,65,15,58,68,241,0 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ movdqu 0(%r8),%xmm3 ++ ++ movdqa %xmm0,%xmm4 ++ psrlq $1,%xmm0 ++.byte 102,69,15,58,68,193,17 ++ xorps %xmm11,%xmm6 ++ movdqu 16(%r8),%xmm11 ++.byte 102,68,15,56,0,221 ++.byte 102,65,15,58,68,250,16 ++ xorps %xmm13,%xmm8 ++ movups 80(%rdx),%xmm10 ++.byte 102,15,56,0,221 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 ++ ++ movdqa %xmm11,%xmm13 ++ pxor %xmm12,%xmm7 ++ pshufd $78,%xmm11,%xmm12 ++ pxor %xmm11,%xmm12 ++.byte 102,69,15,58,68,222,0 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ psrlq $1,%xmm0 ++.byte 102,69,15,58,68,238,17 ++ xorps %xmm11,%xmm6 ++ pxor %xmm1,%xmm0 ++ ++.byte 102,69,15,58,68,226,0 ++ xorps %xmm13,%xmm8 ++ + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 +- pshufd $78,%xmm8,%xmm4 + pxor %xmm0,%xmm3 +- pxor %xmm8,%xmm4 + +- leaq 32(%r8),%r8 +- subq $32,%r9 +- jbe .Leven_tail ++ leaq 64(%r8),%r8 ++ subq $64,%r9 ++ jnc .Lmod4_loop ++ ++.Ltail4x: ++.byte 102,65,15,58,68,199,0 ++ xorps %xmm12,%xmm7 ++.byte 102,65,15,58,68,207,17 ++ xorps %xmm6,%xmm0 
++.byte 102,65,15,58,68,218,16 ++ xorps %xmm8,%xmm1 ++ pxor %xmm0,%xmm1 ++ pxor %xmm7,%xmm3 + +-.Lmod_loop: +-.byte 102,65,15,58,68,192,0 +-.byte 102,65,15,58,68,200,17 +-.byte 102,15,58,68,220,0 +- pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 ++ pxor %xmm0,%xmm1 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 +- movdqu (%r8),%xmm3 +- pxor %xmm6,%xmm0 +- pxor %xmm7,%xmm1 + ++ movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 ++ psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 ++ pxor %xmm3,%xmm0 ++ psllq $57,%xmm0 ++ movdqa %xmm0,%xmm3 ++ pslldq $8,%xmm0 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ ++ ++ movdqa %xmm0,%xmm4 ++ psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 ++ pxor %xmm4,%xmm0 ++ psrlq $1,%xmm0 ++ pxor %xmm1,%xmm0 ++ addq $64,%r9 ++ jz .Ldone ++ movdqu 32(%rdx),%xmm10 ++ subq $16,%r9 ++ jz .Lodd_tail ++.Lskip4x: ++ ++ ++ ++ ++ ++ movdqu (%r8),%xmm3 + movdqu 16(%r8),%xmm6 + .byte 102,15,56,0,221 + .byte 102,15,56,0,245 ++ pxor %xmm3,%xmm0 ++ ++ movdqa %xmm6,%xmm8 ++ pshufd $78,%xmm6,%xmm3 ++ pxor %xmm6,%xmm3 ++.byte 102,15,58,68,242,0 ++.byte 102,68,15,58,68,194,17 ++.byte 102,65,15,58,68,218,0 ++ ++ leaq 32(%r8),%r8 ++ subq $32,%r9 ++ jbe .Leven_tail ++ jmp .Lmod_loop + +- movdqa %xmm6,%xmm7 +- pshufd $78,%xmm6,%xmm9 +- pshufd $78,%xmm2,%xmm10 +- pxor %xmm6,%xmm9 +- pxor %xmm2,%xmm10 ++.p2align 5 ++.Lmod_loop: ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm4 ++ pxor %xmm0,%xmm4 ++ ++.byte 102,65,15,58,68,193,0 ++.byte 102,65,15,58,68,201,17 ++.byte 102,65,15,58,68,226,16 ++ ++ pxor %xmm6,%xmm0 ++ pxor %xmm8,%xmm1 ++ movdqu (%r8),%xmm8 ++.byte 102,68,15,56,0,197 ++ movdqu 16(%r8),%xmm6 ++ ++ pxor %xmm0,%xmm3 ++ pxor %xmm1,%xmm3 ++ pxor %xmm8,%xmm1 ++ pxor %xmm3,%xmm4 ++.byte 102,15,56,0,245 ++ movdqa %xmm4,%xmm3 ++ psrldq $8,%xmm3 ++ pslldq $8,%xmm4 + pxor %xmm3,%xmm1 ++ pxor %xmm4,%xmm0 + ++ movdqa %xmm6,%xmm8 ++ ++ movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 +- psllq $1,%xmm0 +- pxor %xmm3,%xmm0 + psllq $5,%xmm0 +- pxor %xmm3,%xmm0 + .byte 102,15,58,68,242,0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 ++ pxor %xmm3,%xmm0 + psllq $57,%xmm0 +- movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 +- psrldq $8,%xmm4 +- pxor %xmm3,%xmm0 +- pxor %xmm4,%xmm1 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ pshufd $78,%xmm8,%xmm3 ++ pxor %xmm8,%xmm3 + +-.byte 102,15,58,68,250,17 ++.byte 102,68,15,58,68,194,17 + movdqa %xmm0,%xmm4 +- psrlq $5,%xmm0 +- pxor %xmm4,%xmm0 + psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 + pxor %xmm4,%xmm0 +- pxor %xmm1,%xmm4 + psrlq $1,%xmm0 +- pxor %xmm4,%xmm0 +- +-.byte 102,69,15,58,68,202,0 +- movdqa %xmm0,%xmm1 +- pshufd $78,%xmm0,%xmm3 +- pshufd $78,%xmm8,%xmm4 +- pxor %xmm0,%xmm3 +- pxor %xmm8,%xmm4 +- +- pxor %xmm6,%xmm9 +- pxor %xmm7,%xmm9 +- movdqa %xmm9,%xmm10 +- psrldq $8,%xmm9 +- pslldq $8,%xmm10 +- pxor %xmm9,%xmm7 +- pxor %xmm10,%xmm6 ++.byte 102,65,15,58,68,218,0 ++ pxor %xmm1,%xmm0 + + leaq 32(%r8),%r8 + subq $32,%r9 + ja .Lmod_loop + + .Leven_tail: +-.byte 102,65,15,58,68,192,0 +-.byte 102,65,15,58,68,200,17 +-.byte 102,15,58,68,220,0 ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm4 ++ pxor %xmm0,%xmm4 ++ ++.byte 102,65,15,58,68,193,0 ++.byte 102,65,15,58,68,201,17 ++.byte 102,65,15,58,68,226,16 ++ ++ pxor %xmm6,%xmm0 ++ pxor %xmm8,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 +- +- movdqa %xmm3,%xmm4 ++ pxor %xmm3,%xmm4 ++ movdqa %xmm4,%xmm3 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 +- pxor 
%xmm6,%xmm0 +- pxor %xmm7,%xmm1 + ++ movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 +- psllq $1,%xmm0 +- pxor %xmm3,%xmm0 + psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 +- movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 +- psrldq $8,%xmm4 +- pxor %xmm3,%xmm0 +- pxor %xmm4,%xmm1 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 +- psrlq $5,%xmm0 +- pxor %xmm4,%xmm0 + psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 + pxor %xmm4,%xmm0 +- pxor %xmm1,%xmm4 + psrlq $1,%xmm0 +- pxor %xmm4,%xmm0 ++ pxor %xmm1,%xmm0 + testq %r9,%r9 + jnz .Ldone + +@@ -1000,12 +1276,10 @@ gcm_ghash_clmul: + pxor %xmm3,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 +- pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 +- pxor %xmm2,%xmm4 + .byte 102,15,58,68,194,0 + .byte 102,15,58,68,202,17 +-.byte 102,15,58,68,220,0 ++.byte 102,65,15,58,68,218,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + +@@ -1015,27 +1289,28 @@ gcm_ghash_clmul: + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + ++ movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 +- psllq $1,%xmm0 +- pxor %xmm3,%xmm0 + psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 +- movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 +- psrldq $8,%xmm4 +- pxor %xmm3,%xmm0 +- pxor %xmm4,%xmm1 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 +- psrlq $5,%xmm0 +- pxor %xmm4,%xmm0 + psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 + pxor %xmm4,%xmm0 +- pxor %xmm1,%xmm4 + psrlq $1,%xmm0 +- pxor %xmm4,%xmm0 ++ pxor %xmm1,%xmm0 + .Ldone: + .byte 102,15,56,0,197 + movdqu %xmm0,(%rcx) +@@ -1044,15 +1319,42 @@ gcm_ghash_clmul: + movaps 32(%rsp),%xmm8 + movaps 48(%rsp),%xmm9 + movaps 64(%rsp),%xmm10 +- addq $88,%rsp +- .byte 0xf3,0xc3 ++ movaps 80(%rsp),%xmm11 ++ movaps 96(%rsp),%xmm12 ++ movaps 112(%rsp),%xmm13 ++ movaps 128(%rsp),%xmm14 ++ movaps 144(%rsp),%xmm15 ++ leaq 168(%rsp),%rsp + .LSEH_end_gcm_ghash_clmul: ++ .byte 0xf3,0xc3 ++ ++.globl gcm_init_avx ++.def gcm_init_avx; .scl 2; .type 32; .endef ++.p2align 5 ++gcm_init_avx: ++ jmp .L_init_clmul ++ ++.globl gcm_gmult_avx ++.def gcm_gmult_avx; .scl 2; .type 32; .endef ++.p2align 5 ++gcm_gmult_avx: ++ jmp .L_gmult_clmul ++ ++.globl gcm_ghash_avx ++.def gcm_ghash_avx; .scl 2; .type 32; .endef ++.p2align 5 ++gcm_ghash_avx: ++ jmp .L_ghash_clmul + + .p2align 6 + .Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 + .L0x1c2_polynomial: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 ++.L7_mask: ++.long 7,0,7,0 ++.L7_mask_poly: ++.long 7,0,450,0 + .p2align 6 + + .Lrem_4bit: +@@ -1189,10 +1491,13 @@ se_handler: + .rva .LSEH_end_gcm_ghash_4bit + .rva .LSEH_info_gcm_ghash_4bit + ++.rva .LSEH_begin_gcm_init_clmul ++.rva .LSEH_end_gcm_init_clmul ++.rva .LSEH_info_gcm_init_clmul ++ + .rva .LSEH_begin_gcm_ghash_clmul + .rva .LSEH_end_gcm_ghash_clmul + .rva .LSEH_info_gcm_ghash_clmul +- + .section .xdata + .p2align 3 + .LSEH_info_gcm_gmult_4bit: +@@ -1203,11 +1508,20 @@ se_handler: + .byte 9,0,0,0 + .rva se_handler + .rva .Lghash_prologue,.Lghash_epilogue ++.LSEH_info_gcm_init_clmul: ++.byte 0x01,0x08,0x03,0x00 ++.byte 0x08,0x68,0x00,0x00 ++.byte 0x04,0x22,0x00,0x00 + .LSEH_info_gcm_ghash_clmul: +-.byte 0x01,0x1f,0x0b,0x00 +-.byte 0x1f,0xa8,0x04,0x00 +-.byte 0x19,0x98,0x03,0x00 +-.byte 0x13,0x88,0x02,0x00 +-.byte 0x0d,0x78,0x01,0x00 ++.byte 0x01,0x33,0x16,0x00 ++.byte 0x33,0xf8,0x09,0x00 ++.byte 0x2e,0xe8,0x08,0x00 ++.byte 0x29,0xd8,0x07,0x00 ++.byte 
0x24,0xc8,0x06,0x00 ++.byte 0x1f,0xb8,0x05,0x00 ++.byte 0x1a,0xa8,0x04,0x00 ++.byte 0x15,0x98,0x03,0x00 ++.byte 0x10,0x88,0x02,0x00 ++.byte 0x0c,0x78,0x01,0x00 + .byte 0x08,0x68,0x00,0x00 +-.byte 0x04,0xa2,0x00,0x00 ++.byte 0x04,0x01,0x15,0x00 +diff --git a/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s b/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s +index 7bd9665..224a226 100644 +--- a/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s ++++ b/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s +@@ -997,211 +997,423 @@ aesni_ctr32_encrypt_blocks: + movq %r9,%rcx + movq 40(%rsp),%r8 + +- leaq -200(%rsp),%rsp +- movaps %xmm6,32(%rsp) +- movaps %xmm7,48(%rsp) +- movaps %xmm8,64(%rsp) +- movaps %xmm9,80(%rsp) +- movaps %xmm10,96(%rsp) +- movaps %xmm11,112(%rsp) +- movaps %xmm12,128(%rsp) +- movaps %xmm13,144(%rsp) +- movaps %xmm14,160(%rsp) +- movaps %xmm15,176(%rsp) ++ leaq (%rsp),%rax ++ pushq %rbp ++ subq $288,%rsp ++ andq $-16,%rsp ++ movaps %xmm6,-168(%rax) ++ movaps %xmm7,-152(%rax) ++ movaps %xmm8,-136(%rax) ++ movaps %xmm9,-120(%rax) ++ movaps %xmm10,-104(%rax) ++ movaps %xmm11,-88(%rax) ++ movaps %xmm12,-72(%rax) ++ movaps %xmm13,-56(%rax) ++ movaps %xmm14,-40(%rax) ++ movaps %xmm15,-24(%rax) + .Lctr32_body: ++ leaq -8(%rax),%rbp ++ + cmpq $1,%rdx + je .Lctr32_one_shortcut + +- movdqu (%r8),%xmm14 +- movdqa .Lbswap_mask(%rip),%xmm15 +- xorl %eax,%eax +-.byte 102,69,15,58,22,242,3 +-.byte 102,68,15,58,34,240,3 ++ movdqu (%r8),%xmm2 ++ movdqu (%rcx),%xmm0 ++ movl 12(%r8),%r8d ++ pxor %xmm0,%xmm2 ++ movl 12(%rcx),%r11d ++ movdqa %xmm2,0(%rsp) ++ bswapl %r8d ++ movdqa %xmm2,%xmm3 ++ movdqa %xmm2,%xmm4 ++ movdqa %xmm2,%xmm5 ++ movdqa %xmm2,64(%rsp) ++ movdqa %xmm2,80(%rsp) ++ movdqa %xmm2,96(%rsp) ++ movdqa %xmm2,112(%rsp) + + movl 240(%rcx),%eax ++ ++ leaq 1(%r8),%r9 ++ leaq 2(%r8),%r10 ++ bswapl %r9d + bswapl %r10d +- pxor %xmm12,%xmm12 +- pxor %xmm13,%xmm13 +-.byte 102,69,15,58,34,226,0 +- leaq 3(%r10),%r11 +-.byte 102,69,15,58,34,235,0 +- incl %r10d +-.byte 102,69,15,58,34,226,1 +- incq %r11 +-.byte 102,69,15,58,34,235,1 +- incl %r10d +-.byte 102,69,15,58,34,226,2 +- incq %r11 +-.byte 102,69,15,58,34,235,2 +- movdqa %xmm12,0(%rsp) +-.byte 102,69,15,56,0,231 +- movdqa %xmm13,16(%rsp) +-.byte 102,69,15,56,0,239 +- +- pshufd $192,%xmm12,%xmm2 +- pshufd $128,%xmm12,%xmm3 +- pshufd $64,%xmm12,%xmm4 +- cmpq $6,%rdx +- jb .Lctr32_tail +- shrl $1,%eax +- movq %rcx,%r11 +- movl %eax,%r10d +- subq $6,%rdx +- jmp .Lctr32_loop6 ++ xorl %r11d,%r9d ++ xorl %r11d,%r10d ++.byte 102,65,15,58,34,217,3 ++ leaq 3(%r8),%r9 ++ movdqa %xmm3,16(%rsp) ++.byte 102,65,15,58,34,226,3 ++ bswapl %r9d ++ leaq 4(%r8),%r10 ++ movdqa %xmm4,32(%rsp) ++ xorl %r11d,%r9d ++ bswapl %r10d ++.byte 102,65,15,58,34,233,3 ++ xorl %r11d,%r10d ++ movdqa %xmm5,48(%rsp) ++ leaq 5(%r8),%r9 ++ movl %r10d,64+12(%rsp) ++ bswapl %r9d ++ leaq 6(%r8),%r10 ++ xorl %r11d,%r9d ++ bswapl %r10d ++ movl %r9d,80+12(%rsp) ++ xorl %r11d,%r10d ++ leaq 7(%r8),%r9 ++ movl %r10d,96+12(%rsp) ++ bswapl %r9d ++ xorl %r11d,%r9d ++ movl %r9d,112+12(%rsp) + +-.p2align 4 +-.Lctr32_loop6: +- pshufd $192,%xmm13,%xmm5 +- por %xmm14,%xmm2 +- movups (%r11),%xmm0 +- pshufd $128,%xmm13,%xmm6 +- por %xmm14,%xmm3 +- movups 16(%r11),%xmm1 +- pshufd $64,%xmm13,%xmm7 +- por %xmm14,%xmm4 +- por %xmm14,%xmm5 +- xorps %xmm0,%xmm2 +- por %xmm14,%xmm6 +- por %xmm14,%xmm7 ++ movups 16(%rcx),%xmm1 + ++ movdqa 64(%rsp),%xmm6 ++ movdqa 80(%rsp),%xmm7 + ++ cmpq $8,%rdx ++ jb .Lctr32_tail + ++ leaq 128(%rcx),%rcx ++ subq $8,%rdx ++ jmp .Lctr32_loop8 + +- pxor %xmm0,%xmm3 
++.p2align 5 ++.Lctr32_loop8: ++ addl $8,%r8d ++ movdqa 96(%rsp),%xmm8 + .byte 102,15,56,220,209 +- leaq 32(%r11),%rcx +- pxor %xmm0,%xmm4 ++ movl %r8d,%r9d ++ movdqa 112(%rsp),%xmm9 + .byte 102,15,56,220,217 +- movdqa .Lincrement32(%rip),%xmm13 +- pxor %xmm0,%xmm5 ++ bswapl %r9d ++ movups 32-128(%rcx),%xmm0 + .byte 102,15,56,220,225 +- movdqa 0(%rsp),%xmm12 +- pxor %xmm0,%xmm6 ++ xorl %r11d,%r9d + .byte 102,15,56,220,233 +- pxor %xmm0,%xmm7 +- movups (%rcx),%xmm0 +- decl %eax ++ movl %r9d,0+12(%rsp) ++ leaq 1(%r8),%r9 + .byte 102,15,56,220,241 + .byte 102,15,56,220,249 +- jmp .Lctr32_enc_loop6_enter +-.p2align 4 +-.Lctr32_enc_loop6: ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 48-128(%rcx),%xmm1 ++.byte 102,15,56,220,208 ++.byte 102,15,56,220,216 ++ bswapl %r9d ++.byte 102,15,56,220,224 ++ xorl %r11d,%r9d ++.byte 102,15,56,220,232 ++ movl %r9d,16+12(%rsp) ++ leaq 2(%r8),%r9 ++.byte 102,15,56,220,240 ++.byte 102,15,56,220,248 ++.byte 102,68,15,56,220,192 ++.byte 102,68,15,56,220,200 ++ movups 64-128(%rcx),%xmm0 + .byte 102,15,56,220,209 + .byte 102,15,56,220,217 +- decl %eax ++ bswapl %r9d + .byte 102,15,56,220,225 ++ xorl %r11d,%r9d + .byte 102,15,56,220,233 ++ movl %r9d,32+12(%rsp) ++ leaq 3(%r8),%r9 + .byte 102,15,56,220,241 + .byte 102,15,56,220,249 +-.Lctr32_enc_loop6_enter: +- movups 16(%rcx),%xmm1 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 80-128(%rcx),%xmm1 + .byte 102,15,56,220,208 + .byte 102,15,56,220,216 +- leaq 32(%rcx),%rcx ++ bswapl %r9d + .byte 102,15,56,220,224 ++ xorl %r11d,%r9d + .byte 102,15,56,220,232 ++ movl %r9d,48+12(%rsp) ++ leaq 4(%r8),%r9 + .byte 102,15,56,220,240 + .byte 102,15,56,220,248 +- movups (%rcx),%xmm0 +- jnz .Lctr32_enc_loop6 ++.byte 102,68,15,56,220,192 ++.byte 102,68,15,56,220,200 ++ movups 96-128(%rcx),%xmm0 ++.byte 102,15,56,220,209 ++.byte 102,15,56,220,217 ++ bswapl %r9d ++.byte 102,15,56,220,225 ++ xorl %r11d,%r9d ++.byte 102,15,56,220,233 ++ movl %r9d,64+12(%rsp) ++ leaq 5(%r8),%r9 ++.byte 102,15,56,220,241 ++.byte 102,15,56,220,249 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 112-128(%rcx),%xmm1 ++.byte 102,15,56,220,208 ++.byte 102,15,56,220,216 ++ bswapl %r9d ++.byte 102,15,56,220,224 ++ xorl %r11d,%r9d ++.byte 102,15,56,220,232 ++ movl %r9d,80+12(%rsp) ++ leaq 6(%r8),%r9 ++.byte 102,15,56,220,240 ++.byte 102,15,56,220,248 ++.byte 102,68,15,56,220,192 ++.byte 102,68,15,56,220,200 ++ movups 128-128(%rcx),%xmm0 ++.byte 102,15,56,220,209 ++.byte 102,15,56,220,217 ++ bswapl %r9d ++.byte 102,15,56,220,225 ++ xorl %r11d,%r9d ++.byte 102,15,56,220,233 ++ movl %r9d,96+12(%rsp) ++ leaq 7(%r8),%r9 ++.byte 102,15,56,220,241 ++.byte 102,15,56,220,249 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 144-128(%rcx),%xmm1 ++.byte 102,15,56,220,208 ++.byte 102,15,56,220,216 ++ bswapl %r9d ++.byte 102,15,56,220,224 ++ xorl %r11d,%r9d ++.byte 102,15,56,220,232 ++ movl %r9d,112+12(%rsp) ++.byte 102,15,56,220,240 ++.byte 102,15,56,220,248 ++.byte 102,68,15,56,220,192 ++ movdqu 0(%rdi),%xmm10 ++.byte 102,68,15,56,220,200 ++ movups 160-128(%rcx),%xmm0 ++ ++ cmpl $11,%eax ++ jb .Lctr32_enc_done + + .byte 102,15,56,220,209 +- paddd %xmm13,%xmm12 + .byte 102,15,56,220,217 +- paddd 16(%rsp),%xmm13 + .byte 102,15,56,220,225 +- movdqa %xmm12,0(%rsp) + .byte 102,15,56,220,233 +- movdqa %xmm13,16(%rsp) + .byte 102,15,56,220,241 +-.byte 102,69,15,56,0,231 + .byte 102,15,56,220,249 +-.byte 102,69,15,56,0,239 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 
176-128(%rcx),%xmm1 + +-.byte 102,15,56,221,208 +- movups (%rdi),%xmm8 +-.byte 102,15,56,221,216 +- movups 16(%rdi),%xmm9 +-.byte 102,15,56,221,224 +- movups 32(%rdi),%xmm10 +-.byte 102,15,56,221,232 +- movups 48(%rdi),%xmm11 +-.byte 102,15,56,221,240 +- movups 64(%rdi),%xmm1 +-.byte 102,15,56,221,248 +- movups 80(%rdi),%xmm0 +- leaq 96(%rdi),%rdi ++.byte 102,15,56,220,208 ++.byte 102,15,56,220,216 ++.byte 102,15,56,220,224 ++.byte 102,15,56,220,232 ++.byte 102,15,56,220,240 ++.byte 102,15,56,220,248 ++.byte 102,68,15,56,220,192 ++.byte 102,68,15,56,220,200 ++ movups 192-128(%rcx),%xmm0 ++ je .Lctr32_enc_done + +- xorps %xmm2,%xmm8 +- pshufd $192,%xmm12,%xmm2 +- xorps %xmm3,%xmm9 +- pshufd $128,%xmm12,%xmm3 +- movups %xmm8,(%rsi) +- xorps %xmm4,%xmm10 +- pshufd $64,%xmm12,%xmm4 +- movups %xmm9,16(%rsi) +- xorps %xmm5,%xmm11 +- movups %xmm10,32(%rsi) +- xorps %xmm6,%xmm1 +- movups %xmm11,48(%rsi) +- xorps %xmm7,%xmm0 +- movups %xmm1,64(%rsi) +- movups %xmm0,80(%rsi) +- leaq 96(%rsi),%rsi +- movl %r10d,%eax +- subq $6,%rdx +- jnc .Lctr32_loop6 ++.byte 102,15,56,220,209 ++.byte 102,15,56,220,217 ++.byte 102,15,56,220,225 ++.byte 102,15,56,220,233 ++.byte 102,15,56,220,241 ++.byte 102,15,56,220,249 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 208-128(%rcx),%xmm1 + +- addq $6,%rdx ++.byte 102,15,56,220,208 ++.byte 102,15,56,220,216 ++.byte 102,15,56,220,224 ++.byte 102,15,56,220,232 ++.byte 102,15,56,220,240 ++.byte 102,15,56,220,248 ++.byte 102,68,15,56,220,192 ++.byte 102,68,15,56,220,200 ++ movups 224-128(%rcx),%xmm0 ++ ++.Lctr32_enc_done: ++ movdqu 16(%rdi),%xmm11 ++ pxor %xmm0,%xmm10 ++ movdqu 32(%rdi),%xmm12 ++ pxor %xmm0,%xmm11 ++ movdqu 48(%rdi),%xmm13 ++ pxor %xmm0,%xmm12 ++ movdqu 64(%rdi),%xmm14 ++ pxor %xmm0,%xmm13 ++ movdqu 80(%rdi),%xmm15 ++ pxor %xmm0,%xmm14 ++.byte 102,15,56,220,209 ++ pxor %xmm0,%xmm15 ++.byte 102,15,56,220,217 ++.byte 102,15,56,220,225 ++.byte 102,15,56,220,233 ++.byte 102,15,56,220,241 ++.byte 102,15,56,220,249 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movdqu 96(%rdi),%xmm1 ++ ++.byte 102,65,15,56,221,210 ++ pxor %xmm0,%xmm1 ++ movdqu 112(%rdi),%xmm10 ++ leaq 128(%rdi),%rdi ++.byte 102,65,15,56,221,219 ++ pxor %xmm0,%xmm10 ++ movdqa 0(%rsp),%xmm11 ++.byte 102,65,15,56,221,228 ++ movdqa 16(%rsp),%xmm12 ++.byte 102,65,15,56,221,237 ++ movdqa 32(%rsp),%xmm13 ++.byte 102,65,15,56,221,246 ++ movdqa 48(%rsp),%xmm14 ++.byte 102,65,15,56,221,255 ++ movdqa 64(%rsp),%xmm15 ++.byte 102,68,15,56,221,193 ++ movdqa 80(%rsp),%xmm0 ++.byte 102,69,15,56,221,202 ++ movups 16-128(%rcx),%xmm1 ++ ++ movups %xmm2,(%rsi) ++ movdqa %xmm11,%xmm2 ++ movups %xmm3,16(%rsi) ++ movdqa %xmm12,%xmm3 ++ movups %xmm4,32(%rsi) ++ movdqa %xmm13,%xmm4 ++ movups %xmm5,48(%rsi) ++ movdqa %xmm14,%xmm5 ++ movups %xmm6,64(%rsi) ++ movdqa %xmm15,%xmm6 ++ movups %xmm7,80(%rsi) ++ movdqa %xmm0,%xmm7 ++ movups %xmm8,96(%rsi) ++ movups %xmm9,112(%rsi) ++ leaq 128(%rsi),%rsi ++ ++ subq $8,%rdx ++ jnc .Lctr32_loop8 ++ ++ addq $8,%rdx + jz .Lctr32_done +- movq %r11,%rcx +- leal 1(%rax,%rax,1),%eax ++ leaq -128(%rcx),%rcx + + .Lctr32_tail: +- por %xmm14,%xmm2 +- movups (%rdi),%xmm8 +- cmpq $2,%rdx +- jb .Lctr32_one ++ leaq 16(%rcx),%rcx ++ cmpq $4,%rdx ++ jb .Lctr32_loop3 ++ je .Lctr32_loop4 + +- por %xmm14,%xmm3 +- movups 16(%rdi),%xmm9 +- je .Lctr32_two ++ movdqa 96(%rsp),%xmm8 ++ pxor %xmm9,%xmm9 + +- pshufd $192,%xmm13,%xmm5 +- por %xmm14,%xmm4 +- movups 32(%rdi),%xmm10 +- cmpq $4,%rdx +- jb .Lctr32_three ++ movups 16(%rcx),%xmm0 ++.byte 102,15,56,220,209 ++ leaq 
16(%rcx),%rcx ++.byte 102,15,56,220,217 ++ shrl $1,%eax ++.byte 102,15,56,220,225 ++ decl %eax ++.byte 102,15,56,220,233 ++ movups (%rdi),%xmm10 ++.byte 102,15,56,220,241 ++ movups 16(%rdi),%xmm11 ++.byte 102,15,56,220,249 ++ movups 32(%rdi),%xmm12 ++.byte 102,68,15,56,220,193 ++ movups 16(%rcx),%xmm1 + +- pshufd $128,%xmm13,%xmm6 +- por %xmm14,%xmm5 +- movups 48(%rdi),%xmm11 +- je .Lctr32_four ++ call .Lenc_loop8_enter + +- por %xmm14,%xmm6 +- xorps %xmm7,%xmm7 ++ movdqu 48(%rdi),%xmm13 ++ pxor %xmm10,%xmm2 ++ movdqu 64(%rdi),%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm4,32(%rsi) ++ pxor %xmm10,%xmm6 ++ movdqu %xmm5,48(%rsi) ++ movdqu %xmm6,64(%rsi) ++ cmpq $6,%rdx ++ jb .Lctr32_done + +- call _aesni_encrypt6 ++ movups 80(%rdi),%xmm11 ++ xorps %xmm11,%xmm7 ++ movups %xmm7,80(%rsi) ++ je .Lctr32_done + +- movups 64(%rdi),%xmm1 +- xorps %xmm2,%xmm8 +- xorps %xmm3,%xmm9 +- movups %xmm8,(%rsi) +- xorps %xmm4,%xmm10 +- movups %xmm9,16(%rsi) +- xorps %xmm5,%xmm11 +- movups %xmm10,32(%rsi) +- xorps %xmm6,%xmm1 +- movups %xmm11,48(%rsi) +- movups %xmm1,64(%rsi) ++ movups 96(%rdi),%xmm12 ++ xorps %xmm12,%xmm8 ++ movups %xmm8,96(%rsi) ++ jmp .Lctr32_done ++ ++.p2align 5 ++.Lctr32_loop4: ++.byte 102,15,56,220,209 ++ leaq 16(%rcx),%rcx ++.byte 102,15,56,220,217 ++.byte 102,15,56,220,225 ++.byte 102,15,56,220,233 ++ movups (%rcx),%xmm1 ++ decl %eax ++ jnz .Lctr32_loop4 ++.byte 102,15,56,221,209 ++ movups (%rdi),%xmm10 ++.byte 102,15,56,221,217 ++ movups 16(%rdi),%xmm11 ++.byte 102,15,56,221,225 ++ movups 32(%rdi),%xmm12 ++.byte 102,15,56,221,233 ++ movups 48(%rdi),%xmm13 ++ ++ xorps %xmm10,%xmm2 ++ movups %xmm2,(%rsi) ++ xorps %xmm11,%xmm3 ++ movups %xmm3,16(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm4,32(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm5,48(%rsi) ++ jmp .Lctr32_done ++ ++.p2align 5 ++.Lctr32_loop3: ++.byte 102,15,56,220,209 ++ leaq 16(%rcx),%rcx ++.byte 102,15,56,220,217 ++.byte 102,15,56,220,225 ++ movups (%rcx),%xmm1 ++ decl %eax ++ jnz .Lctr32_loop3 ++.byte 102,15,56,221,209 ++.byte 102,15,56,221,217 ++.byte 102,15,56,221,225 ++ ++ movups (%rdi),%xmm10 ++ xorps %xmm10,%xmm2 ++ movups %xmm2,(%rsi) ++ cmpq $2,%rdx ++ jb .Lctr32_done ++ ++ movups 16(%rdi),%xmm11 ++ xorps %xmm11,%xmm3 ++ movups %xmm3,16(%rsi) ++ je .Lctr32_done ++ ++ movups 32(%rdi),%xmm12 ++ xorps %xmm12,%xmm4 ++ movups %xmm4,32(%rsi) + jmp .Lctr32_done + + .p2align 4 + .Lctr32_one_shortcut: + movups (%r8),%xmm2 +- movups (%rdi),%xmm8 ++ movups (%rdi),%xmm10 + movl 240(%rcx),%eax +-.Lctr32_one: + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx +@@ -1213,56 +1425,25 @@ aesni_ctr32_encrypt_blocks: + leaq 16(%rcx),%rcx + jnz .Loop_enc1_7 + .byte 102,15,56,221,209 +- xorps %xmm2,%xmm8 +- movups %xmm8,(%rsi) +- jmp .Lctr32_done +- +-.p2align 4 +-.Lctr32_two: +- xorps %xmm4,%xmm4 +- call _aesni_encrypt3 +- xorps %xmm2,%xmm8 +- xorps %xmm3,%xmm9 +- movups %xmm8,(%rsi) +- movups %xmm9,16(%rsi) +- jmp .Lctr32_done +- +-.p2align 4 +-.Lctr32_three: +- call _aesni_encrypt3 +- xorps %xmm2,%xmm8 +- xorps %xmm3,%xmm9 +- movups %xmm8,(%rsi) +- xorps %xmm4,%xmm10 +- movups %xmm9,16(%rsi) +- movups %xmm10,32(%rsi) ++ xorps %xmm10,%xmm2 ++ movups %xmm2,(%rsi) + jmp .Lctr32_done + + .p2align 4 +-.Lctr32_four: +- call _aesni_encrypt4 +- xorps %xmm2,%xmm8 +- xorps %xmm3,%xmm9 +- movups %xmm8,(%rsi) +- xorps %xmm4,%xmm10 +- movups %xmm9,16(%rsi) +- xorps %xmm5,%xmm11 +- movups %xmm10,32(%rsi) +- movups %xmm11,48(%rsi) +- + .Lctr32_done: +- movaps 
32(%rsp),%xmm6 +- movaps 48(%rsp),%xmm7 +- movaps 64(%rsp),%xmm8 +- movaps 80(%rsp),%xmm9 +- movaps 96(%rsp),%xmm10 +- movaps 112(%rsp),%xmm11 +- movaps 128(%rsp),%xmm12 +- movaps 144(%rsp),%xmm13 +- movaps 160(%rsp),%xmm14 +- movaps 176(%rsp),%xmm15 +- leaq 200(%rsp),%rsp +-.Lctr32_ret: ++ movaps -160(%rbp),%xmm6 ++ movaps -144(%rbp),%xmm7 ++ movaps -128(%rbp),%xmm8 ++ movaps -112(%rbp),%xmm9 ++ movaps -96(%rbp),%xmm10 ++ movaps -80(%rbp),%xmm11 ++ movaps -64(%rbp),%xmm12 ++ movaps -48(%rbp),%xmm13 ++ movaps -32(%rbp),%xmm14 ++ movaps -16(%rbp),%xmm15 ++ leaq (%rbp),%rsp ++ popq %rbp ++.Lctr32_epilogue: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +@@ -1282,18 +1463,22 @@ aesni_xts_encrypt: + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + +- leaq -264(%rsp),%rsp +- movaps %xmm6,96(%rsp) +- movaps %xmm7,112(%rsp) +- movaps %xmm8,128(%rsp) +- movaps %xmm9,144(%rsp) +- movaps %xmm10,160(%rsp) +- movaps %xmm11,176(%rsp) +- movaps %xmm12,192(%rsp) +- movaps %xmm13,208(%rsp) +- movaps %xmm14,224(%rsp) +- movaps %xmm15,240(%rsp) ++ leaq (%rsp),%rax ++ pushq %rbp ++ subq $272,%rsp ++ andq $-16,%rsp ++ movaps %xmm6,-168(%rax) ++ movaps %xmm7,-152(%rax) ++ movaps %xmm8,-136(%rax) ++ movaps %xmm9,-120(%rax) ++ movaps %xmm10,-104(%rax) ++ movaps %xmm11,-88(%rax) ++ movaps %xmm12,-72(%rax) ++ movaps %xmm13,-56(%rax) ++ movaps %xmm14,-40(%rax) ++ movaps %xmm15,-24(%rax) + .Lxts_enc_body: ++ leaq -8(%rax),%rbp + movups (%r9),%xmm15 + movl 240(%r8),%eax + movl 240(%rcx),%r10d +@@ -1308,228 +1493,266 @@ aesni_xts_encrypt: + leaq 16(%r8),%r8 + jnz .Loop_enc1_8 + .byte 102,68,15,56,221,249 ++ movups (%rcx),%xmm0 + movq %rcx,%r11 + movl %r10d,%eax ++ shll $4,%r10d + movq %rdx,%r9 + andq $-16,%rdx + ++ movups 16(%rcx,%r10,1),%xmm1 ++ movl %eax,%r10d ++ + movdqa .Lxts_magic(%rip),%xmm8 +- pxor %xmm14,%xmm14 +- pcmpgtd %xmm15,%xmm14 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pshufd $95,%xmm15,%xmm9 ++ pxor %xmm0,%xmm1 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm10 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm10 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm11 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm11 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm12 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm12 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm13 ++ psrad $31,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm13 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm15,%xmm14 ++ psrad $31,%xmm9 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 ++ pxor %xmm0,%xmm14 + pxor %xmm9,%xmm15 ++ movaps %xmm1,96(%rsp) ++ + subq $96,%rdx + jc .Lxts_enc_short + + shrl $1,%eax +- subl $1,%eax ++ subl $3,%eax ++ movups 16(%r11),%xmm1 + movl %eax,%r10d ++ leaq .Lxts_magic(%rip),%r8 + jmp .Lxts_enc_grandloop + +-.p2align 4 ++.p2align 5 + .Lxts_enc_grandloop: +- pshufd $19,%xmm14,%xmm9 +- movdqa %xmm15,%xmm14 +- paddq %xmm15,%xmm15 + movdqu 0(%rdi),%xmm2 +- pand %xmm8,%xmm9 ++ movdqa %xmm0,%xmm8 + movdqu 
16(%rdi),%xmm3 +- pxor %xmm9,%xmm15 +- +- movdqu 32(%rdi),%xmm4 + pxor %xmm10,%xmm2 +- movdqu 48(%rdi),%xmm5 ++ movdqu 32(%rdi),%xmm4 + pxor %xmm11,%xmm3 +- movdqu 64(%rdi),%xmm6 ++.byte 102,15,56,220,209 ++ movdqu 48(%rdi),%xmm5 + pxor %xmm12,%xmm4 +- movdqu 80(%rdi),%xmm7 +- leaq 96(%rdi),%rdi ++.byte 102,15,56,220,217 ++ movdqu 64(%rdi),%xmm6 + pxor %xmm13,%xmm5 +- movups (%r11),%xmm0 ++.byte 102,15,56,220,225 ++ movdqu 80(%rdi),%xmm7 ++ pxor %xmm15,%xmm8 ++ movdqa 96(%rsp),%xmm9 + pxor %xmm14,%xmm6 +- pxor %xmm15,%xmm7 +- +- ++.byte 102,15,56,220,233 ++ movups 32(%r11),%xmm0 ++ leaq 96(%rdi),%rdi ++ pxor %xmm8,%xmm7 + +- movups 16(%r11),%xmm1 +- pxor %xmm0,%xmm2 +- pxor %xmm0,%xmm3 ++ pxor %xmm9,%xmm10 ++.byte 102,15,56,220,241 ++ pxor %xmm9,%xmm11 + movdqa %xmm10,0(%rsp) +-.byte 102,15,56,220,209 +- leaq 32(%r11),%rcx +- pxor %xmm0,%xmm4 ++.byte 102,15,56,220,249 ++ movups 48(%r11),%xmm1 ++ ++.byte 102,15,56,220,208 ++ pxor %xmm9,%xmm12 + movdqa %xmm11,16(%rsp) +-.byte 102,15,56,220,217 +- pxor %xmm0,%xmm5 ++.byte 102,15,56,220,216 ++ pxor %xmm9,%xmm13 + movdqa %xmm12,32(%rsp) +-.byte 102,15,56,220,225 +- pxor %xmm0,%xmm6 +- movdqa %xmm13,48(%rsp) +-.byte 102,15,56,220,233 +- pxor %xmm0,%xmm7 +- movups (%rcx),%xmm0 +- decl %eax ++.byte 102,15,56,220,224 ++ pxor %xmm9,%xmm14 ++.byte 102,15,56,220,232 ++ pxor %xmm9,%xmm8 + movdqa %xmm14,64(%rsp) +-.byte 102,15,56,220,241 +- movdqa %xmm15,80(%rsp) +-.byte 102,15,56,220,249 +- pxor %xmm14,%xmm14 +- pcmpgtd %xmm15,%xmm14 +- jmp .Lxts_enc_loop6_enter +- +-.p2align 4 ++.byte 102,15,56,220,240 ++ movdqa %xmm8,80(%rsp) ++.byte 102,15,56,220,248 ++ movups 64(%r11),%xmm0 ++ leaq 64(%r11),%rcx ++ pshufd $95,%xmm15,%xmm9 ++ jmp .Lxts_enc_loop6 ++.p2align 5 + .Lxts_enc_loop6: + .byte 102,15,56,220,209 + .byte 102,15,56,220,217 +- decl %eax + .byte 102,15,56,220,225 + .byte 102,15,56,220,233 + .byte 102,15,56,220,241 + .byte 102,15,56,220,249 +-.Lxts_enc_loop6_enter: + movups 16(%rcx),%xmm1 ++ leaq 32(%rcx),%rcx ++ + .byte 102,15,56,220,208 + .byte 102,15,56,220,216 +- leaq 32(%rcx),%rcx + .byte 102,15,56,220,224 + .byte 102,15,56,220,232 + .byte 102,15,56,220,240 + .byte 102,15,56,220,248 + movups (%rcx),%xmm0 ++ decl %eax + jnz .Lxts_enc_loop6 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- paddq %xmm15,%xmm15 ++ movdqa (%r8),%xmm8 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,220,209 +- pand %xmm8,%xmm9 ++ paddq %xmm15,%xmm15 ++ psrad $31,%xmm14 + .byte 102,15,56,220,217 +- pcmpgtd %xmm15,%xmm14 ++ pand %xmm8,%xmm14 ++ movups (%r11),%xmm10 + .byte 102,15,56,220,225 +- pxor %xmm9,%xmm15 + .byte 102,15,56,220,233 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,220,241 ++ movaps %xmm10,%xmm11 + .byte 102,15,56,220,249 + movups 16(%rcx),%xmm1 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm10 +- paddq %xmm15,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,220,208 +- pand %xmm8,%xmm9 ++ pxor %xmm15,%xmm10 ++ psrad $31,%xmm14 + .byte 102,15,56,220,216 +- pcmpgtd %xmm15,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 + .byte 102,15,56,220,224 +- pxor %xmm9,%xmm15 + .byte 102,15,56,220,232 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,220,240 ++ movaps %xmm11,%xmm12 + .byte 102,15,56,220,248 + movups 32(%rcx),%xmm0 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm11 +- paddq %xmm15,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,220,209 +- pand %xmm8,%xmm9 ++ pxor %xmm15,%xmm11 ++ psrad $31,%xmm14 + .byte 102,15,56,220,217 +- pcmpgtd %xmm15,%xmm14 ++ paddq 
%xmm15,%xmm15 ++ pand %xmm8,%xmm14 + .byte 102,15,56,220,225 +- pxor %xmm9,%xmm15 ++ movdqa %xmm13,48(%rsp) + .byte 102,15,56,220,233 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,220,241 ++ movaps %xmm12,%xmm13 + .byte 102,15,56,220,249 ++ movups 48(%rcx),%xmm1 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm12 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 ++.byte 102,15,56,220,208 ++ pxor %xmm15,%xmm12 ++ psrad $31,%xmm14 ++.byte 102,15,56,220,216 + paddq %xmm15,%xmm15 +-.byte 102,15,56,221,208 +- pand %xmm8,%xmm9 +-.byte 102,15,56,221,216 +- pcmpgtd %xmm15,%xmm14 +-.byte 102,15,56,221,224 +- pxor %xmm9,%xmm15 +-.byte 102,15,56,221,232 +-.byte 102,15,56,221,240 +-.byte 102,15,56,221,248 ++ pand %xmm8,%xmm14 ++.byte 102,15,56,220,224 ++.byte 102,15,56,220,232 ++ pxor %xmm14,%xmm15 ++.byte 102,15,56,220,240 ++ movaps %xmm13,%xmm14 ++.byte 102,15,56,220,248 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm13 ++ movdqa %xmm9,%xmm0 ++ paddd %xmm9,%xmm9 ++.byte 102,15,56,220,209 ++ pxor %xmm15,%xmm13 ++ psrad $31,%xmm0 ++.byte 102,15,56,220,217 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm0 ++.byte 102,15,56,220,225 ++.byte 102,15,56,220,233 ++ pxor %xmm0,%xmm15 ++ movups (%r11),%xmm0 ++.byte 102,15,56,220,241 ++.byte 102,15,56,220,249 ++ movups 16(%r11),%xmm1 ++ ++ pxor %xmm15,%xmm14 ++ psrad $31,%xmm9 ++.byte 102,15,56,221,84,36,0 + paddq %xmm15,%xmm15 +- xorps 0(%rsp),%xmm2 + pand %xmm8,%xmm9 +- xorps 16(%rsp),%xmm3 +- pcmpgtd %xmm15,%xmm14 ++.byte 102,15,56,221,92,36,16 ++.byte 102,15,56,221,100,36,32 + pxor %xmm9,%xmm15 +- +- xorps 32(%rsp),%xmm4 +- movups %xmm2,0(%rsi) +- xorps 48(%rsp),%xmm5 +- movups %xmm3,16(%rsi) +- xorps 64(%rsp),%xmm6 +- movups %xmm4,32(%rsi) +- xorps 80(%rsp),%xmm7 +- movups %xmm5,48(%rsi) ++.byte 102,15,56,221,108,36,48 ++.byte 102,15,56,221,116,36,64 ++.byte 102,15,56,221,124,36,80 + movl %r10d,%eax +- movups %xmm6,64(%rsi) +- movups %xmm7,80(%rsi) ++ + leaq 96(%rsi),%rsi ++ movups %xmm2,-96(%rsi) ++ movups %xmm3,-80(%rsi) ++ movups %xmm4,-64(%rsi) ++ movups %xmm5,-48(%rsi) ++ movups %xmm6,-32(%rsi) ++ movups %xmm7,-16(%rsi) + subq $96,%rdx + jnc .Lxts_enc_grandloop + +- leal 3(%rax,%rax,1),%eax ++ leal 7(%rax,%rax,1),%eax + movq %r11,%rcx + movl %eax,%r10d + + .Lxts_enc_short: ++ pxor %xmm0,%xmm10 + addq $96,%rdx + jz .Lxts_enc_done + ++ pxor %xmm0,%xmm11 + cmpq $32,%rdx + jb .Lxts_enc_one ++ pxor %xmm0,%xmm12 + je .Lxts_enc_two + ++ pxor %xmm0,%xmm13 + cmpq $64,%rdx + jb .Lxts_enc_three ++ pxor %xmm0,%xmm14 + je .Lxts_enc_four + +- pshufd $19,%xmm14,%xmm9 +- movdqa %xmm15,%xmm14 +- paddq %xmm15,%xmm15 + movdqu (%rdi),%xmm2 +- pand %xmm8,%xmm9 + movdqu 16(%rdi),%xmm3 +- pxor %xmm9,%xmm15 +- + movdqu 32(%rdi),%xmm4 + pxor %xmm10,%xmm2 + movdqu 48(%rdi),%xmm5 +@@ -1632,15 +1855,15 @@ aesni_xts_encrypt: + + call _aesni_encrypt4 + +- xorps %xmm10,%xmm2 +- movdqa %xmm15,%xmm10 +- xorps %xmm11,%xmm3 +- xorps %xmm12,%xmm4 +- movups %xmm2,(%rsi) +- xorps %xmm13,%xmm5 +- movups %xmm3,16(%rsi) +- movups %xmm4,32(%rsi) +- movups %xmm5,48(%rsi) ++ pxor %xmm10,%xmm2 ++ movdqa %xmm14,%xmm10 ++ pxor %xmm11,%xmm3 ++ pxor %xmm12,%xmm4 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm3,16(%rsi) ++ movdqu %xmm4,32(%rsi) ++ movdqu %xmm5,48(%rsi) + leaq 64(%rsi),%rsi + jmp .Lxts_enc_done + +@@ -1681,17 +1904,18 @@ aesni_xts_encrypt: + movups %xmm2,-16(%rsi) + + .Lxts_enc_ret: +- movaps 96(%rsp),%xmm6 +- movaps 112(%rsp),%xmm7 +- movaps 128(%rsp),%xmm8 +- movaps 144(%rsp),%xmm9 +- movaps 160(%rsp),%xmm10 +- movaps 176(%rsp),%xmm11 +- 
movaps 192(%rsp),%xmm12 +- movaps 208(%rsp),%xmm13 +- movaps 224(%rsp),%xmm14 +- movaps 240(%rsp),%xmm15 +- leaq 264(%rsp),%rsp ++ movaps -160(%rbp),%xmm6 ++ movaps -144(%rbp),%xmm7 ++ movaps -128(%rbp),%xmm8 ++ movaps -112(%rbp),%xmm9 ++ movaps -96(%rbp),%xmm10 ++ movaps -80(%rbp),%xmm11 ++ movaps -64(%rbp),%xmm12 ++ movaps -48(%rbp),%xmm13 ++ movaps -32(%rbp),%xmm14 ++ movaps -16(%rbp),%xmm15 ++ leaq (%rbp),%rsp ++ popq %rbp + .Lxts_enc_epilogue: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi +@@ -1712,18 +1936,22 @@ aesni_xts_decrypt: + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + +- leaq -264(%rsp),%rsp +- movaps %xmm6,96(%rsp) +- movaps %xmm7,112(%rsp) +- movaps %xmm8,128(%rsp) +- movaps %xmm9,144(%rsp) +- movaps %xmm10,160(%rsp) +- movaps %xmm11,176(%rsp) +- movaps %xmm12,192(%rsp) +- movaps %xmm13,208(%rsp) +- movaps %xmm14,224(%rsp) +- movaps %xmm15,240(%rsp) ++ leaq (%rsp),%rax ++ pushq %rbp ++ subq $272,%rsp ++ andq $-16,%rsp ++ movaps %xmm6,-168(%rax) ++ movaps %xmm7,-152(%rax) ++ movaps %xmm8,-136(%rax) ++ movaps %xmm9,-120(%rax) ++ movaps %xmm10,-104(%rax) ++ movaps %xmm11,-88(%rax) ++ movaps %xmm12,-72(%rax) ++ movaps %xmm13,-56(%rax) ++ movaps %xmm14,-40(%rax) ++ movaps %xmm15,-24(%rax) + .Lxts_dec_body: ++ leaq -8(%rax),%rbp + movups (%r9),%xmm15 + movl 240(%r8),%eax + movl 240(%rcx),%r10d +@@ -1744,228 +1972,266 @@ aesni_xts_decrypt: + shlq $4,%rax + subq %rax,%rdx + ++ movups (%rcx),%xmm0 + movq %rcx,%r11 + movl %r10d,%eax ++ shll $4,%r10d + movq %rdx,%r9 + andq $-16,%rdx + ++ movups 16(%rcx,%r10,1),%xmm1 ++ movl %eax,%r10d ++ + movdqa .Lxts_magic(%rip),%xmm8 +- pxor %xmm14,%xmm14 +- pcmpgtd %xmm15,%xmm14 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pshufd $95,%xmm15,%xmm9 ++ pxor %xmm0,%xmm1 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm10 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm10 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm11 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm11 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm12 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm12 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm13 ++ psrad $31,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm13 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm15,%xmm14 ++ psrad $31,%xmm9 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 ++ pxor %xmm0,%xmm14 + pxor %xmm9,%xmm15 ++ movaps %xmm1,96(%rsp) ++ + subq $96,%rdx + jc .Lxts_dec_short + + shrl $1,%eax +- subl $1,%eax ++ subl $3,%eax ++ movups 16(%r11),%xmm1 + movl %eax,%r10d ++ leaq .Lxts_magic(%rip),%r8 + jmp .Lxts_dec_grandloop + +-.p2align 4 ++.p2align 5 + .Lxts_dec_grandloop: +- pshufd $19,%xmm14,%xmm9 +- movdqa %xmm15,%xmm14 +- paddq %xmm15,%xmm15 + movdqu 0(%rdi),%xmm2 +- pand %xmm8,%xmm9 ++ movdqa %xmm0,%xmm8 + movdqu 16(%rdi),%xmm3 +- pxor %xmm9,%xmm15 +- +- movdqu 32(%rdi),%xmm4 + pxor %xmm10,%xmm2 +- movdqu 48(%rdi),%xmm5 ++ movdqu 32(%rdi),%xmm4 + pxor %xmm11,%xmm3 +- movdqu 64(%rdi),%xmm6 ++.byte 102,15,56,222,209 ++ movdqu 
48(%rdi),%xmm5 + pxor %xmm12,%xmm4 +- movdqu 80(%rdi),%xmm7 +- leaq 96(%rdi),%rdi ++.byte 102,15,56,222,217 ++ movdqu 64(%rdi),%xmm6 + pxor %xmm13,%xmm5 +- movups (%r11),%xmm0 ++.byte 102,15,56,222,225 ++ movdqu 80(%rdi),%xmm7 ++ pxor %xmm15,%xmm8 ++ movdqa 96(%rsp),%xmm9 + pxor %xmm14,%xmm6 +- pxor %xmm15,%xmm7 +- +- ++.byte 102,15,56,222,233 ++ movups 32(%r11),%xmm0 ++ leaq 96(%rdi),%rdi ++ pxor %xmm8,%xmm7 + +- movups 16(%r11),%xmm1 +- pxor %xmm0,%xmm2 +- pxor %xmm0,%xmm3 ++ pxor %xmm9,%xmm10 ++.byte 102,15,56,222,241 ++ pxor %xmm9,%xmm11 + movdqa %xmm10,0(%rsp) +-.byte 102,15,56,222,209 +- leaq 32(%r11),%rcx +- pxor %xmm0,%xmm4 ++.byte 102,15,56,222,249 ++ movups 48(%r11),%xmm1 ++ ++.byte 102,15,56,222,208 ++ pxor %xmm9,%xmm12 + movdqa %xmm11,16(%rsp) +-.byte 102,15,56,222,217 +- pxor %xmm0,%xmm5 ++.byte 102,15,56,222,216 ++ pxor %xmm9,%xmm13 + movdqa %xmm12,32(%rsp) +-.byte 102,15,56,222,225 +- pxor %xmm0,%xmm6 +- movdqa %xmm13,48(%rsp) +-.byte 102,15,56,222,233 +- pxor %xmm0,%xmm7 +- movups (%rcx),%xmm0 +- decl %eax ++.byte 102,15,56,222,224 ++ pxor %xmm9,%xmm14 ++.byte 102,15,56,222,232 ++ pxor %xmm9,%xmm8 + movdqa %xmm14,64(%rsp) +-.byte 102,15,56,222,241 +- movdqa %xmm15,80(%rsp) +-.byte 102,15,56,222,249 +- pxor %xmm14,%xmm14 +- pcmpgtd %xmm15,%xmm14 +- jmp .Lxts_dec_loop6_enter +- +-.p2align 4 ++.byte 102,15,56,222,240 ++ movdqa %xmm8,80(%rsp) ++.byte 102,15,56,222,248 ++ movups 64(%r11),%xmm0 ++ leaq 64(%r11),%rcx ++ pshufd $95,%xmm15,%xmm9 ++ jmp .Lxts_dec_loop6 ++.p2align 5 + .Lxts_dec_loop6: + .byte 102,15,56,222,209 + .byte 102,15,56,222,217 +- decl %eax + .byte 102,15,56,222,225 + .byte 102,15,56,222,233 + .byte 102,15,56,222,241 + .byte 102,15,56,222,249 +-.Lxts_dec_loop6_enter: + movups 16(%rcx),%xmm1 ++ leaq 32(%rcx),%rcx ++ + .byte 102,15,56,222,208 + .byte 102,15,56,222,216 +- leaq 32(%rcx),%rcx + .byte 102,15,56,222,224 + .byte 102,15,56,222,232 + .byte 102,15,56,222,240 + .byte 102,15,56,222,248 + movups (%rcx),%xmm0 ++ decl %eax + jnz .Lxts_dec_loop6 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- paddq %xmm15,%xmm15 ++ movdqa (%r8),%xmm8 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,222,209 +- pand %xmm8,%xmm9 ++ paddq %xmm15,%xmm15 ++ psrad $31,%xmm14 + .byte 102,15,56,222,217 +- pcmpgtd %xmm15,%xmm14 ++ pand %xmm8,%xmm14 ++ movups (%r11),%xmm10 + .byte 102,15,56,222,225 +- pxor %xmm9,%xmm15 + .byte 102,15,56,222,233 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,222,241 ++ movaps %xmm10,%xmm11 + .byte 102,15,56,222,249 + movups 16(%rcx),%xmm1 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm10 +- paddq %xmm15,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,222,208 +- pand %xmm8,%xmm9 ++ pxor %xmm15,%xmm10 ++ psrad $31,%xmm14 + .byte 102,15,56,222,216 +- pcmpgtd %xmm15,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 + .byte 102,15,56,222,224 +- pxor %xmm9,%xmm15 + .byte 102,15,56,222,232 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,222,240 ++ movaps %xmm11,%xmm12 + .byte 102,15,56,222,248 + movups 32(%rcx),%xmm0 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm11 +- paddq %xmm15,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,222,209 +- pand %xmm8,%xmm9 ++ pxor %xmm15,%xmm11 ++ psrad $31,%xmm14 + .byte 102,15,56,222,217 +- pcmpgtd %xmm15,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 + .byte 102,15,56,222,225 +- pxor %xmm9,%xmm15 ++ movdqa %xmm13,48(%rsp) + .byte 102,15,56,222,233 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,222,241 ++ movaps %xmm12,%xmm13 + .byte 
102,15,56,222,249 ++ movups 48(%rcx),%xmm1 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm12 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 ++.byte 102,15,56,222,208 ++ pxor %xmm15,%xmm12 ++ psrad $31,%xmm14 ++.byte 102,15,56,222,216 + paddq %xmm15,%xmm15 +-.byte 102,15,56,223,208 +- pand %xmm8,%xmm9 +-.byte 102,15,56,223,216 +- pcmpgtd %xmm15,%xmm14 +-.byte 102,15,56,223,224 +- pxor %xmm9,%xmm15 +-.byte 102,15,56,223,232 +-.byte 102,15,56,223,240 +-.byte 102,15,56,223,248 ++ pand %xmm8,%xmm14 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++ pxor %xmm14,%xmm15 ++.byte 102,15,56,222,240 ++ movaps %xmm13,%xmm14 ++.byte 102,15,56,222,248 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm13 ++ movdqa %xmm9,%xmm0 ++ paddd %xmm9,%xmm9 ++.byte 102,15,56,222,209 ++ pxor %xmm15,%xmm13 ++ psrad $31,%xmm0 ++.byte 102,15,56,222,217 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm0 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++ pxor %xmm0,%xmm15 ++ movups (%r11),%xmm0 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++ movups 16(%r11),%xmm1 ++ ++ pxor %xmm15,%xmm14 ++ psrad $31,%xmm9 ++.byte 102,15,56,223,84,36,0 + paddq %xmm15,%xmm15 +- xorps 0(%rsp),%xmm2 + pand %xmm8,%xmm9 +- xorps 16(%rsp),%xmm3 +- pcmpgtd %xmm15,%xmm14 ++.byte 102,15,56,223,92,36,16 ++.byte 102,15,56,223,100,36,32 + pxor %xmm9,%xmm15 +- +- xorps 32(%rsp),%xmm4 +- movups %xmm2,0(%rsi) +- xorps 48(%rsp),%xmm5 +- movups %xmm3,16(%rsi) +- xorps 64(%rsp),%xmm6 +- movups %xmm4,32(%rsi) +- xorps 80(%rsp),%xmm7 +- movups %xmm5,48(%rsi) ++.byte 102,15,56,223,108,36,48 ++.byte 102,15,56,223,116,36,64 ++.byte 102,15,56,223,124,36,80 + movl %r10d,%eax +- movups %xmm6,64(%rsi) +- movups %xmm7,80(%rsi) ++ + leaq 96(%rsi),%rsi ++ movups %xmm2,-96(%rsi) ++ movups %xmm3,-80(%rsi) ++ movups %xmm4,-64(%rsi) ++ movups %xmm5,-48(%rsi) ++ movups %xmm6,-32(%rsi) ++ movups %xmm7,-16(%rsi) + subq $96,%rdx + jnc .Lxts_dec_grandloop + +- leal 3(%rax,%rax,1),%eax ++ leal 7(%rax,%rax,1),%eax + movq %r11,%rcx + movl %eax,%r10d + + .Lxts_dec_short: ++ pxor %xmm0,%xmm10 ++ pxor %xmm0,%xmm11 + addq $96,%rdx + jz .Lxts_dec_done + ++ pxor %xmm0,%xmm12 + cmpq $32,%rdx + jb .Lxts_dec_one ++ pxor %xmm0,%xmm13 + je .Lxts_dec_two + ++ pxor %xmm0,%xmm14 + cmpq $64,%rdx + jb .Lxts_dec_three + je .Lxts_dec_four + +- pshufd $19,%xmm14,%xmm9 +- movdqa %xmm15,%xmm14 +- paddq %xmm15,%xmm15 + movdqu (%rdi),%xmm2 +- pand %xmm8,%xmm9 + movdqu 16(%rdi),%xmm3 +- pxor %xmm9,%xmm15 +- + movdqu 32(%rdi),%xmm4 + pxor %xmm10,%xmm2 + movdqu 48(%rdi),%xmm5 +@@ -2058,7 +2324,7 @@ aesni_xts_decrypt: + xorps %xmm10,%xmm2 + movdqa %xmm13,%xmm10 + xorps %xmm11,%xmm3 +- movdqa %xmm15,%xmm11 ++ movdqa %xmm14,%xmm11 + xorps %xmm12,%xmm4 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) +@@ -2068,14 +2334,8 @@ aesni_xts_decrypt: + + .p2align 4 + .Lxts_dec_four: +- pshufd $19,%xmm14,%xmm9 +- movdqa %xmm15,%xmm14 +- paddq %xmm15,%xmm15 + movups (%rdi),%xmm2 +- pand %xmm8,%xmm9 + movups 16(%rdi),%xmm3 +- pxor %xmm9,%xmm15 +- + movups 32(%rdi),%xmm4 + xorps %xmm10,%xmm2 + movups 48(%rdi),%xmm5 +@@ -2086,16 +2346,16 @@ aesni_xts_decrypt: + + call _aesni_decrypt4 + +- xorps %xmm10,%xmm2 ++ pxor %xmm10,%xmm2 + movdqa %xmm14,%xmm10 +- xorps %xmm11,%xmm3 ++ pxor %xmm11,%xmm3 + movdqa %xmm15,%xmm11 +- xorps %xmm12,%xmm4 +- movups %xmm2,(%rsi) +- xorps %xmm13,%xmm5 +- movups %xmm3,16(%rsi) +- movups %xmm4,32(%rsi) +- movups %xmm5,48(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm3,16(%rsi) ++ movdqu %xmm4,32(%rsi) ++ 
movdqu %xmm5,48(%rsi) + leaq 64(%rsi),%rsi + jmp .Lxts_dec_done + +@@ -2155,17 +2415,18 @@ aesni_xts_decrypt: + movups %xmm2,(%rsi) + + .Lxts_dec_ret: +- movaps 96(%rsp),%xmm6 +- movaps 112(%rsp),%xmm7 +- movaps 128(%rsp),%xmm8 +- movaps 144(%rsp),%xmm9 +- movaps 160(%rsp),%xmm10 +- movaps 176(%rsp),%xmm11 +- movaps 192(%rsp),%xmm12 +- movaps 208(%rsp),%xmm13 +- movaps 224(%rsp),%xmm14 +- movaps 240(%rsp),%xmm15 +- leaq 264(%rsp),%rsp ++ movaps -160(%rbp),%xmm6 ++ movaps -144(%rbp),%xmm7 ++ movaps -128(%rbp),%xmm8 ++ movaps -112(%rbp),%xmm9 ++ movaps -96(%rbp),%xmm10 ++ movaps -80(%rbp),%xmm11 ++ movaps -64(%rbp),%xmm12 ++ movaps -48(%rbp),%xmm13 ++ movaps -32(%rbp),%xmm14 ++ movaps -16(%rbp),%xmm15 ++ leaq (%rbp),%rsp ++ popq %rbp + .Lxts_dec_epilogue: + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi +@@ -2245,155 +2506,335 @@ aesni_cbc_encrypt: + + .p2align 4 + .Lcbc_decrypt: +- leaq -88(%rsp),%rsp +- movaps %xmm6,(%rsp) +- movaps %xmm7,16(%rsp) +- movaps %xmm8,32(%rsp) +- movaps %xmm9,48(%rsp) ++ leaq (%rsp),%rax ++ pushq %rbp ++ subq $176,%rsp ++ andq $-16,%rsp ++ movaps %xmm6,16(%rsp) ++ movaps %xmm7,32(%rsp) ++ movaps %xmm8,48(%rsp) ++ movaps %xmm9,64(%rsp) ++ movaps %xmm10,80(%rsp) ++ movaps %xmm11,96(%rsp) ++ movaps %xmm12,112(%rsp) ++ movaps %xmm13,128(%rsp) ++ movaps %xmm14,144(%rsp) ++ movaps %xmm15,160(%rsp) + .Lcbc_decrypt_body: +- movups (%r8),%xmm9 ++ leaq -8(%rax),%rbp ++ movups (%r8),%xmm10 + movl %r10d,%eax +- cmpq $112,%rdx ++ cmpq $80,%rdx + jbe .Lcbc_dec_tail +- shrl $1,%r10d ++ ++ movups (%rcx),%xmm0 ++ movdqu 0(%rdi),%xmm2 ++ movdqu 16(%rdi),%xmm3 ++ movdqa %xmm2,%xmm11 ++ movdqu 32(%rdi),%xmm4 ++ movdqa %xmm3,%xmm12 ++ movdqu 48(%rdi),%xmm5 ++ movdqa %xmm4,%xmm13 ++ movdqu 64(%rdi),%xmm6 ++ movdqa %xmm5,%xmm14 ++ movdqu 80(%rdi),%xmm7 ++ movdqa %xmm6,%xmm15 ++ cmpq $112,%rdx ++ jbe .Lcbc_dec_six_or_seven ++ + subq $112,%rdx +- movl %r10d,%eax +- movaps %xmm9,64(%rsp) ++ leaq 112(%rcx),%rcx + jmp .Lcbc_dec_loop8_enter + .p2align 4 + .Lcbc_dec_loop8: +- movaps %xmm0,64(%rsp) + movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi + .Lcbc_dec_loop8_enter: +- movups (%rcx),%xmm0 +- movups (%rdi),%xmm2 +- movups 16(%rdi),%xmm3 +- movups 16(%rcx),%xmm1 ++ movdqu 96(%rdi),%xmm8 ++ pxor %xmm0,%xmm2 ++ movdqu 112(%rdi),%xmm9 ++ pxor %xmm0,%xmm3 ++ movups 16-112(%rcx),%xmm1 ++ pxor %xmm0,%xmm4 ++ xorq %r11,%r11 ++ cmpq $112,%rdx ++ pxor %xmm0,%xmm5 ++ pxor %xmm0,%xmm6 ++ pxor %xmm0,%xmm7 ++ pxor %xmm0,%xmm8 + +- leaq 32(%rcx),%rcx +- movdqu 32(%rdi),%xmm4 +- xorps %xmm0,%xmm2 +- movdqu 48(%rdi),%xmm5 +- xorps %xmm0,%xmm3 +- movdqu 64(%rdi),%xmm6 + .byte 102,15,56,222,209 +- pxor %xmm0,%xmm4 +- movdqu 80(%rdi),%xmm7 ++ pxor %xmm0,%xmm9 ++ movups 32-112(%rcx),%xmm0 ++.byte 102,15,56,222,217 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++ setnc %r11b ++.byte 102,68,15,56,222,193 ++ shlq $7,%r11 ++.byte 102,68,15,56,222,201 ++ addq %rdi,%r11 ++ movups 48-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 64-112(%rcx),%xmm0 ++.byte 102,15,56,222,209 ++.byte 102,15,56,222,217 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movups 80-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 
102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 96-112(%rcx),%xmm0 ++.byte 102,15,56,222,209 + .byte 102,15,56,222,217 +- pxor %xmm0,%xmm5 +- movdqu 96(%rdi),%xmm8 + .byte 102,15,56,222,225 +- pxor %xmm0,%xmm6 +- movdqu 112(%rdi),%xmm9 + .byte 102,15,56,222,233 +- pxor %xmm0,%xmm7 +- decl %eax + .byte 102,15,56,222,241 +- pxor %xmm0,%xmm8 + .byte 102,15,56,222,249 +- pxor %xmm0,%xmm9 +- movups (%rcx),%xmm0 + .byte 102,68,15,56,222,193 + .byte 102,68,15,56,222,201 +- movups 16(%rcx),%xmm1 +- +- call .Ldec_loop8_enter ++ movups 112-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 128-112(%rcx),%xmm0 ++.byte 102,15,56,222,209 ++.byte 102,15,56,222,217 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movups 144-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 160-112(%rcx),%xmm0 ++ cmpl $11,%eax ++ jb .Lcbc_dec_done ++.byte 102,15,56,222,209 ++.byte 102,15,56,222,217 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movups 176-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 192-112(%rcx),%xmm0 ++ je .Lcbc_dec_done ++.byte 102,15,56,222,209 ++.byte 102,15,56,222,217 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movups 208-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 224-112(%rcx),%xmm0 ++.Lcbc_dec_done: ++.byte 102,15,56,222,209 ++ pxor %xmm0,%xmm10 ++.byte 102,15,56,222,217 ++ pxor %xmm0,%xmm11 ++.byte 102,15,56,222,225 ++ pxor %xmm0,%xmm12 ++.byte 102,15,56,222,233 ++ pxor %xmm0,%xmm13 ++.byte 102,15,56,222,241 ++ pxor %xmm0,%xmm14 ++.byte 102,15,56,222,249 ++ pxor %xmm0,%xmm15 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movdqu 80(%rdi),%xmm1 ++ ++.byte 102,65,15,56,223,210 ++ movdqu 96(%rdi),%xmm10 ++ pxor %xmm0,%xmm1 ++.byte 102,65,15,56,223,219 ++ pxor %xmm0,%xmm10 ++ movdqu 112(%rdi),%xmm0 ++ leaq 128(%rdi),%rdi ++.byte 102,65,15,56,223,228 ++ movdqu 0(%r11),%xmm11 ++.byte 102,65,15,56,223,237 ++ movdqu 16(%r11),%xmm12 ++.byte 102,65,15,56,223,246 ++ movdqu 32(%r11),%xmm13 ++.byte 102,65,15,56,223,255 ++ movdqu 48(%r11),%xmm14 ++.byte 102,68,15,56,223,193 ++ movdqu 64(%r11),%xmm15 ++.byte 102,69,15,56,223,202 ++ movdqa %xmm0,%xmm10 ++ movdqu 80(%r11),%xmm1 ++ movups -112(%rcx),%xmm0 + +- movups (%rdi),%xmm1 +- movups 16(%rdi),%xmm0 +- xorps 64(%rsp),%xmm2 +- xorps %xmm1,%xmm3 +- movups 32(%rdi),%xmm1 +- xorps %xmm0,%xmm4 +- movups 48(%rdi),%xmm0 +- xorps %xmm1,%xmm5 +- movups 
64(%rdi),%xmm1 +- xorps %xmm0,%xmm6 +- movups 80(%rdi),%xmm0 +- xorps %xmm1,%xmm7 +- movups 96(%rdi),%xmm1 +- xorps %xmm0,%xmm8 +- movups 112(%rdi),%xmm0 +- xorps %xmm1,%xmm9 + movups %xmm2,(%rsi) ++ movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) ++ movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) ++ movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) +- movl %r10d,%eax ++ movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) +- movq %r11,%rcx ++ movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) +- leaq 128(%rdi),%rdi ++ movdqa %xmm1,%xmm7 + movups %xmm8,96(%rsi) + leaq 112(%rsi),%rsi ++ + subq $128,%rdx + ja .Lcbc_dec_loop8 + + movaps %xmm9,%xmm2 +- movaps %xmm0,%xmm9 ++ leaq -112(%rcx),%rcx + addq $112,%rdx + jle .Lcbc_dec_tail_collected +- movups %xmm2,(%rsi) +- leal 1(%r10,%r10,1),%eax ++ movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi ++ cmpq $80,%rdx ++ jbe .Lcbc_dec_tail ++ ++ movaps %xmm11,%xmm2 ++.Lcbc_dec_six_or_seven: ++ cmpq $96,%rdx ++ ja .Lcbc_dec_seven ++ ++ movaps %xmm7,%xmm8 ++ call _aesni_decrypt6 ++ pxor %xmm10,%xmm2 ++ movaps %xmm8,%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm4,32(%rsi) ++ pxor %xmm14,%xmm6 ++ movdqu %xmm5,48(%rsi) ++ pxor %xmm15,%xmm7 ++ movdqu %xmm6,64(%rsi) ++ leaq 80(%rsi),%rsi ++ movdqa %xmm7,%xmm2 ++ jmp .Lcbc_dec_tail_collected ++ ++.p2align 4 ++.Lcbc_dec_seven: ++ movups 96(%rdi),%xmm8 ++ xorps %xmm9,%xmm9 ++ call _aesni_decrypt8 ++ movups 80(%rdi),%xmm9 ++ pxor %xmm10,%xmm2 ++ movups 96(%rdi),%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm4,32(%rsi) ++ pxor %xmm14,%xmm6 ++ movdqu %xmm5,48(%rsi) ++ pxor %xmm15,%xmm7 ++ movdqu %xmm6,64(%rsi) ++ pxor %xmm9,%xmm8 ++ movdqu %xmm7,80(%rsi) ++ leaq 96(%rsi),%rsi ++ movdqa %xmm8,%xmm2 ++ jmp .Lcbc_dec_tail_collected ++ + .Lcbc_dec_tail: + movups (%rdi),%xmm2 +- movaps %xmm2,%xmm8 +- cmpq $16,%rdx ++ subq $16,%rdx + jbe .Lcbc_dec_one + + movups 16(%rdi),%xmm3 +- movaps %xmm3,%xmm7 +- cmpq $32,%rdx ++ movaps %xmm2,%xmm11 ++ subq $16,%rdx + jbe .Lcbc_dec_two + + movups 32(%rdi),%xmm4 +- movaps %xmm4,%xmm6 +- cmpq $48,%rdx ++ movaps %xmm3,%xmm12 ++ subq $16,%rdx + jbe .Lcbc_dec_three + + movups 48(%rdi),%xmm5 +- cmpq $64,%rdx ++ movaps %xmm4,%xmm13 ++ subq $16,%rdx + jbe .Lcbc_dec_four + + movups 64(%rdi),%xmm6 +- cmpq $80,%rdx +- jbe .Lcbc_dec_five +- +- movups 80(%rdi),%xmm7 +- cmpq $96,%rdx +- jbe .Lcbc_dec_six +- +- movups 96(%rdi),%xmm8 +- movaps %xmm9,64(%rsp) +- call _aesni_decrypt8 +- movups (%rdi),%xmm1 +- movups 16(%rdi),%xmm0 +- xorps 64(%rsp),%xmm2 +- xorps %xmm1,%xmm3 +- movups 32(%rdi),%xmm1 +- xorps %xmm0,%xmm4 +- movups 48(%rdi),%xmm0 +- xorps %xmm1,%xmm5 +- movups 64(%rdi),%xmm1 +- xorps %xmm0,%xmm6 +- movups 80(%rdi),%xmm0 +- xorps %xmm1,%xmm7 +- movups 96(%rdi),%xmm9 +- xorps %xmm0,%xmm8 +- movups %xmm2,(%rsi) +- movups %xmm3,16(%rsi) +- movups %xmm4,32(%rsi) +- movups %xmm5,48(%rsi) +- movups %xmm6,64(%rsi) +- movups %xmm7,80(%rsi) +- leaq 96(%rsi),%rsi +- movaps %xmm8,%xmm2 +- subq $112,%rdx ++ movaps %xmm5,%xmm14 ++ movaps %xmm6,%xmm15 ++ xorps %xmm7,%xmm7 ++ call _aesni_decrypt6 ++ pxor %xmm10,%xmm2 ++ movaps %xmm15,%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm4,32(%rsi) ++ pxor %xmm14,%xmm6 ++ movdqu %xmm5,48(%rsi) ++ leaq 64(%rsi),%rsi ++ movdqa %xmm6,%xmm2 ++ subq $16,%rdx + jmp .Lcbc_dec_tail_collected ++ + .p2align 4 + .Lcbc_dec_one: ++ movaps %xmm2,%xmm11 + 
movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx +@@ -2405,116 +2846,79 @@ aesni_cbc_encrypt: + leaq 16(%rcx),%rcx + jnz .Loop_dec1_16 + .byte 102,15,56,223,209 +- xorps %xmm9,%xmm2 +- movaps %xmm8,%xmm9 +- subq $16,%rdx ++ xorps %xmm10,%xmm2 ++ movaps %xmm11,%xmm10 + jmp .Lcbc_dec_tail_collected + .p2align 4 + .Lcbc_dec_two: ++ movaps %xmm3,%xmm12 + xorps %xmm4,%xmm4 + call _aesni_decrypt3 +- xorps %xmm9,%xmm2 +- xorps %xmm8,%xmm3 +- movups %xmm2,(%rsi) +- movaps %xmm7,%xmm9 +- movaps %xmm3,%xmm2 ++ pxor %xmm10,%xmm2 ++ movaps %xmm12,%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ movdqa %xmm3,%xmm2 + leaq 16(%rsi),%rsi +- subq $32,%rdx + jmp .Lcbc_dec_tail_collected + .p2align 4 + .Lcbc_dec_three: ++ movaps %xmm4,%xmm13 + call _aesni_decrypt3 +- xorps %xmm9,%xmm2 +- xorps %xmm8,%xmm3 +- movups %xmm2,(%rsi) +- xorps %xmm7,%xmm4 +- movups %xmm3,16(%rsi) +- movaps %xmm6,%xmm9 +- movaps %xmm4,%xmm2 ++ pxor %xmm10,%xmm2 ++ movaps %xmm13,%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ movdqa %xmm4,%xmm2 + leaq 32(%rsi),%rsi +- subq $48,%rdx + jmp .Lcbc_dec_tail_collected + .p2align 4 + .Lcbc_dec_four: ++ movaps %xmm5,%xmm14 + call _aesni_decrypt4 +- xorps %xmm9,%xmm2 +- movups 48(%rdi),%xmm9 +- xorps %xmm8,%xmm3 +- movups %xmm2,(%rsi) +- xorps %xmm7,%xmm4 +- movups %xmm3,16(%rsi) +- xorps %xmm6,%xmm5 +- movups %xmm4,32(%rsi) +- movaps %xmm5,%xmm2 ++ pxor %xmm10,%xmm2 ++ movaps %xmm14,%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm4,32(%rsi) ++ movdqa %xmm5,%xmm2 + leaq 48(%rsi),%rsi +- subq $64,%rdx +- jmp .Lcbc_dec_tail_collected +-.p2align 4 +-.Lcbc_dec_five: +- xorps %xmm7,%xmm7 +- call _aesni_decrypt6 +- movups 16(%rdi),%xmm1 +- movups 32(%rdi),%xmm0 +- xorps %xmm9,%xmm2 +- xorps %xmm8,%xmm3 +- xorps %xmm1,%xmm4 +- movups 48(%rdi),%xmm1 +- xorps %xmm0,%xmm5 +- movups 64(%rdi),%xmm9 +- xorps %xmm1,%xmm6 +- movups %xmm2,(%rsi) +- movups %xmm3,16(%rsi) +- movups %xmm4,32(%rsi) +- movups %xmm5,48(%rsi) +- leaq 64(%rsi),%rsi +- movaps %xmm6,%xmm2 +- subq $80,%rdx +- jmp .Lcbc_dec_tail_collected +-.p2align 4 +-.Lcbc_dec_six: +- call _aesni_decrypt6 +- movups 16(%rdi),%xmm1 +- movups 32(%rdi),%xmm0 +- xorps %xmm9,%xmm2 +- xorps %xmm8,%xmm3 +- xorps %xmm1,%xmm4 +- movups 48(%rdi),%xmm1 +- xorps %xmm0,%xmm5 +- movups 64(%rdi),%xmm0 +- xorps %xmm1,%xmm6 +- movups 80(%rdi),%xmm9 +- xorps %xmm0,%xmm7 +- movups %xmm2,(%rsi) +- movups %xmm3,16(%rsi) +- movups %xmm4,32(%rsi) +- movups %xmm5,48(%rsi) +- movups %xmm6,64(%rsi) +- leaq 80(%rsi),%rsi +- movaps %xmm7,%xmm2 +- subq $96,%rdx + jmp .Lcbc_dec_tail_collected ++ + .p2align 4 + .Lcbc_dec_tail_collected: ++ movups %xmm10,(%r8) + andq $15,%rdx +- movups %xmm9,(%r8) + jnz .Lcbc_dec_tail_partial + movups %xmm2,(%rsi) + jmp .Lcbc_dec_ret + .p2align 4 + .Lcbc_dec_tail_partial: +- movaps %xmm2,64(%rsp) ++ movaps %xmm2,(%rsp) + movq $16,%rcx + movq %rsi,%rdi + subq %rdx,%rcx +- leaq 64(%rsp),%rsi ++ leaq (%rsp),%rsi + .long 0x9066A4F3 + + .Lcbc_dec_ret: +- movaps (%rsp),%xmm6 +- movaps 16(%rsp),%xmm7 +- movaps 32(%rsp),%xmm8 +- movaps 48(%rsp),%xmm9 +- leaq 88(%rsp),%rsp ++ movaps 16(%rsp),%xmm6 ++ movaps 32(%rsp),%xmm7 ++ movaps 48(%rsp),%xmm8 ++ movaps 64(%rsp),%xmm9 ++ movaps 80(%rsp),%xmm10 ++ movaps 96(%rsp),%xmm11 ++ movaps 112(%rsp),%xmm12 ++ movaps 128(%rsp),%xmm13 ++ movaps 144(%rsp),%xmm14 ++ movaps 160(%rsp),%xmm15 ++ leaq (%rbp),%rsp ++ popq %rbp + .Lcbc_ret: + movq 8(%rsp),%rdi + movq 
16(%rsp),%rsi +@@ -2759,6 +3163,8 @@ __aesni_set_encrypt_key: + .long 1,0,0,0 + .Lxts_magic: + .long 0x87,0,1,0 ++.Lincrement1: ++.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 + + .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + .p2align 6 +@@ -2823,45 +3229,9 @@ ccm64_se_handler: + jmp .Lcommon_seh_tail + + +-.def ctr32_se_handler; .scl 3; .type 32; .endef +-.p2align 4 +-ctr32_se_handler: +- pushq %rsi +- pushq %rdi +- pushq %rbx +- pushq %rbp +- pushq %r12 +- pushq %r13 +- pushq %r14 +- pushq %r15 +- pushfq +- subq $64,%rsp +- +- movq 120(%r8),%rax +- movq 248(%r8),%rbx +- +- leaq .Lctr32_body(%rip),%r10 +- cmpq %r10,%rbx +- jb .Lcommon_seh_tail +- +- movq 152(%r8),%rax +- +- leaq .Lctr32_ret(%rip),%r10 +- cmpq %r10,%rbx +- jae .Lcommon_seh_tail +- +- leaq 32(%rax),%rsi +- leaq 512(%r8),%rdi +- movl $20,%ecx +-.long 0xa548f3fc +- leaq 200(%rax),%rax +- +- jmp .Lcommon_seh_tail +- +- +-.def xts_se_handler; .scl 3; .type 32; .endef ++.def ctr_xts_se_handler; .scl 3; .type 32; .endef + .p2align 4 +-xts_se_handler: ++ctr_xts_se_handler: + pushq %rsi + pushq %rdi + pushq %rbx +@@ -2891,13 +3261,13 @@ xts_se_handler: + cmpq %r10,%rbx + jae .Lcommon_seh_tail + +- leaq 96(%rax),%rsi ++ movq 160(%r8),%rax ++ leaq -160(%rax),%rsi + leaq 512(%r8),%rdi + movl $20,%ecx + .long 0xa548f3fc +- leaq 104+160(%rax),%rax + +- jmp .Lcommon_seh_tail ++ jmp .Lcommon_rbp_tail + + .def cbc_se_handler; .scl 3; .type 32; .endef + .p2align 4 +@@ -2928,11 +3298,16 @@ cbc_se_handler: + cmpq %r10,%rbx + jae .Lcommon_seh_tail + +- leaq 0(%rax),%rsi ++ leaq 16(%rax),%rsi + leaq 512(%r8),%rdi +- movl $8,%ecx ++ movl $20,%ecx + .long 0xa548f3fc +- leaq 88(%rax),%rax ++ ++.Lcommon_rbp_tail: ++ movq 160(%r8),%rax ++ movq (%rax),%rbp ++ leaq 8(%rax),%rax ++ movq %rbp,160(%r8) + jmp .Lcommon_seh_tail + + .Lrestore_cbc_rax: +@@ -3029,14 +3404,15 @@ cbc_se_handler: + .rva .Lccm64_dec_body,.Lccm64_dec_ret + .LSEH_info_ctr32: + .byte 9,0,0,0 +-.rva ctr32_se_handler ++.rva ctr_xts_se_handler ++.rva .Lctr32_body,.Lctr32_epilogue + .LSEH_info_xts_enc: + .byte 9,0,0,0 +-.rva xts_se_handler ++.rva ctr_xts_se_handler + .rva .Lxts_enc_body,.Lxts_enc_epilogue + .LSEH_info_xts_dec: + .byte 9,0,0,0 +-.rva xts_se_handler ++.rva ctr_xts_se_handler + .rva .Lxts_dec_body,.Lxts_dec_epilogue + .LSEH_info_cbc: + .byte 9,0,0,0 +diff --git a/lib/accelerated/x86/coff/padlock-x86-64-coff.s b/lib/accelerated/x86/coff/padlock-x86-64-coff.s +index 9f658ee..a3a0e30 100644 +--- a/lib/accelerated/x86/coff/padlock-x86-64-coff.s ++++ b/lib/accelerated/x86/coff/padlock-x86-64-coff.s +@@ -686,6 +686,501 @@ padlock_cbc_encrypt: + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 + .LSEH_end_padlock_cbc_encrypt: ++.globl padlock_cfb_encrypt ++.def padlock_cfb_encrypt; .scl 2; .type 32; .endef ++.p2align 4 ++padlock_cfb_encrypt: ++ movq %rdi,8(%rsp) ++ movq %rsi,16(%rsp) ++ movq %rsp,%rax ++.LSEH_begin_padlock_cfb_encrypt: ++ movq %rcx,%rdi ++ movq %rdx,%rsi ++ movq %r8,%rdx ++ movq %r9,%rcx ++ ++ pushq %rbp ++ pushq %rbx ++ ++ xorl %eax,%eax ++ testq $15,%rdx ++ jnz .Lcfb_abort ++ testq $15,%rcx ++ jnz .Lcfb_abort ++ leaq .Lpadlock_saved_context(%rip),%rax ++ pushf ++ cld ++ call _padlock_verify_ctx ++ leaq 16(%rdx),%rdx ++ xorl %eax,%eax ++ xorl %ebx,%ebx ++ testl $32,(%rdx) ++ jnz .Lcfb_aligned ++ testq $15,%rdi ++ setz %al ++ testq $15,%rsi ++ setz %bl ++ testl %ebx,%eax ++ jnz .Lcfb_aligned ++ negq %rax ++ movq $512,%rbx ++ notq %rax 
++ leaq (%rsp),%rbp ++ cmpq %rbx,%rcx ++ cmovcq %rcx,%rbx ++ andq %rbx,%rax ++ movq %rcx,%rbx ++ negq %rax ++ andq $512-1,%rbx ++ leaq (%rax,%rbp,1),%rsp ++ movq $512,%rax ++ cmovzq %rax,%rbx ++ jmp .Lcfb_loop ++.p2align 4 ++.Lcfb_loop: ++ cmpq %rcx,%rbx ++ cmovaq %rcx,%rbx ++ movq %rdi,%r8 ++ movq %rsi,%r9 ++ movq %rcx,%r10 ++ movq %rbx,%rcx ++ movq %rbx,%r11 ++ testq $15,%rdi ++ cmovnzq %rsp,%rdi ++ testq $15,%rsi ++ jz .Lcfb_inp_aligned ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++ movq %rbx,%rcx ++ movq %rdi,%rsi ++.Lcfb_inp_aligned: ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,224 ++ movdqa (%rax),%xmm0 ++ movdqa %xmm0,-16(%rdx) ++ movq %r8,%rdi ++ movq %r11,%rbx ++ testq $15,%rdi ++ jz .Lcfb_out_aligned ++ movq %rbx,%rcx ++ leaq (%rsp),%rsi ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++.Lcfb_out_aligned: ++ movq %r9,%rsi ++ movq %r10,%rcx ++ addq %rbx,%rdi ++ addq %rbx,%rsi ++ subq %rbx,%rcx ++ movq $512,%rbx ++ jnz .Lcfb_loop ++ cmpq %rbp,%rsp ++ je .Lcfb_done ++ ++ pxor %xmm0,%xmm0 ++ leaq (%rsp),%rax ++.Lcfb_bzero: ++ movaps %xmm0,(%rax) ++ leaq 16(%rax),%rax ++ cmpq %rax,%rbp ++ ja .Lcfb_bzero ++ ++.Lcfb_done: ++ leaq (%rbp),%rsp ++ jmp .Lcfb_exit ++ ++.p2align 4 ++.Lcfb_aligned: ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,224 ++ movdqa (%rax),%xmm0 ++ movdqa %xmm0,-16(%rdx) ++.Lcfb_exit: ++ movl $1,%eax ++ leaq 8(%rsp),%rsp ++.Lcfb_abort: ++ popq %rbx ++ popq %rbp ++ movq 8(%rsp),%rdi ++ movq 16(%rsp),%rsi ++ .byte 0xf3,0xc3 ++.LSEH_end_padlock_cfb_encrypt: ++.globl padlock_ofb_encrypt ++.def padlock_ofb_encrypt; .scl 2; .type 32; .endef ++.p2align 4 ++padlock_ofb_encrypt: ++ movq %rdi,8(%rsp) ++ movq %rsi,16(%rsp) ++ movq %rsp,%rax ++.LSEH_begin_padlock_ofb_encrypt: ++ movq %rcx,%rdi ++ movq %rdx,%rsi ++ movq %r8,%rdx ++ movq %r9,%rcx ++ ++ pushq %rbp ++ pushq %rbx ++ ++ xorl %eax,%eax ++ testq $15,%rdx ++ jnz .Lofb_abort ++ testq $15,%rcx ++ jnz .Lofb_abort ++ leaq .Lpadlock_saved_context(%rip),%rax ++ pushf ++ cld ++ call _padlock_verify_ctx ++ leaq 16(%rdx),%rdx ++ xorl %eax,%eax ++ xorl %ebx,%ebx ++ testl $32,(%rdx) ++ jnz .Lofb_aligned ++ testq $15,%rdi ++ setz %al ++ testq $15,%rsi ++ setz %bl ++ testl %ebx,%eax ++ jnz .Lofb_aligned ++ negq %rax ++ movq $512,%rbx ++ notq %rax ++ leaq (%rsp),%rbp ++ cmpq %rbx,%rcx ++ cmovcq %rcx,%rbx ++ andq %rbx,%rax ++ movq %rcx,%rbx ++ negq %rax ++ andq $512-1,%rbx ++ leaq (%rax,%rbp,1),%rsp ++ movq $512,%rax ++ cmovzq %rax,%rbx ++ jmp .Lofb_loop ++.p2align 4 ++.Lofb_loop: ++ cmpq %rcx,%rbx ++ cmovaq %rcx,%rbx ++ movq %rdi,%r8 ++ movq %rsi,%r9 ++ movq %rcx,%r10 ++ movq %rbx,%rcx ++ movq %rbx,%r11 ++ testq $15,%rdi ++ cmovnzq %rsp,%rdi ++ testq $15,%rsi ++ jz .Lofb_inp_aligned ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++ movq %rbx,%rcx ++ movq %rdi,%rsi ++.Lofb_inp_aligned: ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,232 ++ movdqa (%rax),%xmm0 ++ movdqa %xmm0,-16(%rdx) ++ movq %r8,%rdi ++ movq %r11,%rbx ++ testq $15,%rdi ++ jz .Lofb_out_aligned ++ movq %rbx,%rcx ++ leaq (%rsp),%rsi ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++.Lofb_out_aligned: ++ movq %r9,%rsi ++ movq %r10,%rcx ++ addq %rbx,%rdi ++ addq %rbx,%rsi ++ subq %rbx,%rcx ++ movq $512,%rbx ++ jnz .Lofb_loop ++ cmpq %rbp,%rsp ++ je .Lofb_done ++ ++ pxor %xmm0,%xmm0 ++ leaq (%rsp),%rax ++.Lofb_bzero: ++ movaps %xmm0,(%rax) ++ leaq 16(%rax),%rax ++ cmpq %rax,%rbp ++ ja .Lofb_bzero ++ ++.Lofb_done: ++ leaq 
(%rbp),%rsp ++ jmp .Lofb_exit ++ ++.p2align 4 ++.Lofb_aligned: ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,232 ++ movdqa (%rax),%xmm0 ++ movdqa %xmm0,-16(%rdx) ++.Lofb_exit: ++ movl $1,%eax ++ leaq 8(%rsp),%rsp ++.Lofb_abort: ++ popq %rbx ++ popq %rbp ++ movq 8(%rsp),%rdi ++ movq 16(%rsp),%rsi ++ .byte 0xf3,0xc3 ++.LSEH_end_padlock_ofb_encrypt: ++.globl padlock_ctr32_encrypt ++.def padlock_ctr32_encrypt; .scl 2; .type 32; .endef ++.p2align 4 ++padlock_ctr32_encrypt: ++ movq %rdi,8(%rsp) ++ movq %rsi,16(%rsp) ++ movq %rsp,%rax ++.LSEH_begin_padlock_ctr32_encrypt: ++ movq %rcx,%rdi ++ movq %rdx,%rsi ++ movq %r8,%rdx ++ movq %r9,%rcx ++ ++ pushq %rbp ++ pushq %rbx ++ ++ xorl %eax,%eax ++ testq $15,%rdx ++ jnz .Lctr32_abort ++ testq $15,%rcx ++ jnz .Lctr32_abort ++ leaq .Lpadlock_saved_context(%rip),%rax ++ pushf ++ cld ++ call _padlock_verify_ctx ++ leaq 16(%rdx),%rdx ++ xorl %eax,%eax ++ xorl %ebx,%ebx ++ testl $32,(%rdx) ++ jnz .Lctr32_aligned ++ testq $15,%rdi ++ setz %al ++ testq $15,%rsi ++ setz %bl ++ testl %ebx,%eax ++ jnz .Lctr32_aligned ++ negq %rax ++ movq $512,%rbx ++ notq %rax ++ leaq (%rsp),%rbp ++ cmpq %rbx,%rcx ++ cmovcq %rcx,%rbx ++ andq %rbx,%rax ++ movq %rcx,%rbx ++ negq %rax ++ andq $512-1,%rbx ++ leaq (%rax,%rbp,1),%rsp ++ movq $512,%rax ++ cmovzq %rax,%rbx ++.Lctr32_reenter: ++ movl -4(%rdx),%eax ++ bswapl %eax ++ negl %eax ++ andl $31,%eax ++ movq $512,%rbx ++ shll $4,%eax ++ cmovzq %rbx,%rax ++ cmpq %rax,%rcx ++ cmovaq %rax,%rbx ++ cmovbeq %rcx,%rbx ++ cmpq %rbx,%rcx ++ ja .Lctr32_loop ++ movq %rsi,%rax ++ cmpq %rsp,%rbp ++ cmoveq %rdi,%rax ++ addq %rcx,%rax ++ negq %rax ++ andq $4095,%rax ++ cmpq $32,%rax ++ movq $-32,%rax ++ cmovaeq %rbx,%rax ++ andq %rax,%rbx ++ jz .Lctr32_unaligned_tail ++ jmp .Lctr32_loop ++.p2align 4 ++.Lctr32_loop: ++ cmpq %rcx,%rbx ++ cmovaq %rcx,%rbx ++ movq %rdi,%r8 ++ movq %rsi,%r9 ++ movq %rcx,%r10 ++ movq %rbx,%rcx ++ movq %rbx,%r11 ++ testq $15,%rdi ++ cmovnzq %rsp,%rdi ++ testq $15,%rsi ++ jz .Lctr32_inp_aligned ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++ movq %rbx,%rcx ++ movq %rdi,%rsi ++.Lctr32_inp_aligned: ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,216 ++ movl -4(%rdx),%eax ++ testl $4294901760,%eax ++ jnz .Lctr32_no_carry ++ bswapl %eax ++ addl $65536,%eax ++ bswapl %eax ++ movl %eax,-4(%rdx) ++.Lctr32_no_carry: ++ movq %r8,%rdi ++ movq %r11,%rbx ++ testq $15,%rdi ++ jz .Lctr32_out_aligned ++ movq %rbx,%rcx ++ leaq (%rsp),%rsi ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++.Lctr32_out_aligned: ++ movq %r9,%rsi ++ movq %r10,%rcx ++ addq %rbx,%rdi ++ addq %rbx,%rsi ++ subq %rbx,%rcx ++ movq $512,%rbx ++ jz .Lctr32_break ++ cmpq %rbx,%rcx ++ jae .Lctr32_loop ++ movq %rcx,%rbx ++ movq %rsi,%rax ++ cmpq %rsp,%rbp ++ cmoveq %rdi,%rax ++ addq %rcx,%rax ++ negq %rax ++ andq $4095,%rax ++ cmpq $32,%rax ++ movq $-32,%rax ++ cmovaeq %rbx,%rax ++ andq %rax,%rbx ++ jnz .Lctr32_loop ++.Lctr32_unaligned_tail: ++ xorl %eax,%eax ++ cmpq %rsp,%rbp ++ cmoveq %rcx,%rax ++ movq %rdi,%r8 ++ movq %rcx,%rbx ++ subq %rax,%rsp ++ shrq $3,%rcx ++ leaq (%rsp),%rdi ++.byte 0xf3,0x48,0xa5 ++ movq %rsp,%rsi ++ movq %r8,%rdi ++ movq %rbx,%rcx ++ jmp .Lctr32_loop ++.p2align 4 ++.Lctr32_break: ++ cmpq %rbp,%rsp ++ je .Lctr32_done ++ ++ pxor %xmm0,%xmm0 ++ leaq (%rsp),%rax ++.Lctr32_bzero: ++ movaps %xmm0,(%rax) ++ leaq 16(%rax),%rax ++ cmpq %rax,%rbp ++ ja .Lctr32_bzero ++ ++.Lctr32_done: ++ leaq (%rbp),%rsp ++ jmp .Lctr32_exit ++ ++.p2align 4 
++.Lctr32_aligned: ++ movl -4(%rdx),%eax ++ bswapl %eax ++ negl %eax ++ andl $65535,%eax ++ movq $1048576,%rbx ++ shll $4,%eax ++ cmovzq %rbx,%rax ++ cmpq %rax,%rcx ++ cmovaq %rax,%rbx ++ cmovbeq %rcx,%rbx ++ jbe .Lctr32_aligned_skip ++ ++.Lctr32_aligned_loop: ++ movq %rcx,%r10 ++ movq %rbx,%rcx ++ movq %rbx,%r11 ++ ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,216 ++ ++ movl -4(%rdx),%eax ++ bswapl %eax ++ addl $65536,%eax ++ bswapl %eax ++ movl %eax,-4(%rdx) ++ ++ movq %r10,%rcx ++ subq %r11,%rcx ++ movq $1048576,%rbx ++ jz .Lctr32_exit ++ cmpq %rbx,%rcx ++ jae .Lctr32_aligned_loop ++ ++.Lctr32_aligned_skip: ++ leaq (%rsi,%rcx,1),%rbp ++ negq %rbp ++ andq $4095,%rbp ++ xorl %eax,%eax ++ cmpq $32,%rbp ++ movq $32-1,%rbp ++ cmovaeq %rax,%rbp ++ andq %rcx,%rbp ++ subq %rbp,%rcx ++ jz .Lctr32_aligned_tail ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,216 ++ testq %rbp,%rbp ++ jz .Lctr32_exit ++ ++.Lctr32_aligned_tail: ++ movq %rdi,%r8 ++ movq %rbp,%rbx ++ movq %rbp,%rcx ++ leaq (%rsp),%rbp ++ subq %rcx,%rsp ++ shrq $3,%rcx ++ leaq (%rsp),%rdi ++.byte 0xf3,0x48,0xa5 ++ leaq (%r8),%rdi ++ leaq (%rsp),%rsi ++ movq %rbx,%rcx ++ jmp .Lctr32_loop ++.Lctr32_exit: ++ movl $1,%eax ++ leaq 8(%rsp),%rsp ++.Lctr32_abort: ++ popq %rbx ++ popq %rbp ++ movq 8(%rsp),%rdi ++ movq 16(%rsp),%rsi ++ .byte 0xf3,0xc3 ++.LSEH_end_padlock_ctr32_encrypt: + .byte 86,73,65,32,80,97,100,108,111,99,107,32,120,56,54,95,54,52,32,109,111,100,117,108,101,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + .p2align 4 + .data +diff --git a/lib/accelerated/x86/coff/padlock-x86-coff.s b/lib/accelerated/x86/coff/padlock-x86-coff.s +index 69eb468..d969f30 100644 +--- a/lib/accelerated/x86/coff/padlock-x86-coff.s ++++ b/lib/accelerated/x86/coff/padlock-x86-coff.s +@@ -515,6 +515,354 @@ _padlock_cbc_encrypt: + popl %ebx + popl %ebp + ret ++.globl _padlock_cfb_encrypt ++.def _padlock_cfb_encrypt; .scl 2; .type 32; .endef ++.align 16 ++_padlock_cfb_encrypt: ++.L_padlock_cfb_encrypt_begin: ++ pushl %ebp ++ pushl %ebx ++ pushl %esi ++ pushl %edi ++ movl 20(%esp),%edi ++ movl 24(%esp),%esi ++ movl 28(%esp),%edx ++ movl 32(%esp),%ecx ++ testl $15,%edx ++ jnz .L028cfb_abort ++ testl $15,%ecx ++ jnz .L028cfb_abort ++ leal .Lpadlock_saved_context,%eax ++ pushfl ++ cld ++ call __padlock_verify_ctx ++.L029cfb_pic_point: ++ leal 16(%edx),%edx ++ xorl %eax,%eax ++ xorl %ebx,%ebx ++ testl $32,(%edx) ++ jnz .L030cfb_aligned ++ testl $15,%edi ++ setz %al ++ testl $15,%esi ++ setz %bl ++ testl %ebx,%eax ++ jnz .L030cfb_aligned ++ negl %eax ++ movl $512,%ebx ++ notl %eax ++ leal -24(%esp),%ebp ++ cmpl %ebx,%ecx ++ cmovcl %ecx,%ebx ++ andl %ebx,%eax ++ movl %ecx,%ebx ++ negl %eax ++ andl $511,%ebx ++ leal (%eax,%ebp,1),%esp ++ movl $512,%eax ++ cmovzl %eax,%ebx ++ movl %ebp,%eax ++ andl $-16,%ebp ++ andl $-16,%esp ++ movl %eax,16(%ebp) ++ jmp .L031cfb_loop ++.align 16 ++.L031cfb_loop: ++ movl %edi,(%ebp) ++ movl %esi,4(%ebp) ++ movl %ecx,8(%ebp) ++ movl %ebx,%ecx ++ movl %ebx,12(%ebp) ++ testl $15,%edi ++ cmovnzl %esp,%edi ++ testl $15,%esi ++ jz .L032cfb_inp_aligned ++ shrl $2,%ecx ++.byte 243,165 ++ subl %ebx,%edi ++ movl %ebx,%ecx ++ movl %edi,%esi ++.L032cfb_inp_aligned: ++ leal -16(%edx),%eax ++ leal 16(%edx),%ebx ++ shrl $4,%ecx ++.byte 243,15,167,224 ++ movaps (%eax),%xmm0 ++ movaps %xmm0,-16(%edx) ++ movl (%ebp),%edi ++ movl 12(%ebp),%ebx ++ testl $15,%edi ++ jz .L033cfb_out_aligned ++ movl 
%ebx,%ecx ++ leal (%esp),%esi ++ shrl $2,%ecx ++.byte 243,165 ++ subl %ebx,%edi ++.L033cfb_out_aligned: ++ movl 4(%ebp),%esi ++ movl 8(%ebp),%ecx ++ addl %ebx,%edi ++ addl %ebx,%esi ++ subl %ebx,%ecx ++ movl $512,%ebx ++ jnz .L031cfb_loop ++ cmpl %ebp,%esp ++ je .L034cfb_done ++ pxor %xmm0,%xmm0 ++ leal (%esp),%eax ++.L035cfb_bzero: ++ movaps %xmm0,(%eax) ++ leal 16(%eax),%eax ++ cmpl %eax,%ebp ++ ja .L035cfb_bzero ++.L034cfb_done: ++ movl 16(%ebp),%ebp ++ leal 24(%ebp),%esp ++ jmp .L036cfb_exit ++.align 16 ++.L030cfb_aligned: ++ leal -16(%edx),%eax ++ leal 16(%edx),%ebx ++ shrl $4,%ecx ++.byte 243,15,167,224 ++ movaps (%eax),%xmm0 ++ movaps %xmm0,-16(%edx) ++.L036cfb_exit: ++ movl $1,%eax ++ leal 4(%esp),%esp ++.L028cfb_abort: ++ popl %edi ++ popl %esi ++ popl %ebx ++ popl %ebp ++ ret ++.globl _padlock_ofb_encrypt ++.def _padlock_ofb_encrypt; .scl 2; .type 32; .endef ++.align 16 ++_padlock_ofb_encrypt: ++.L_padlock_ofb_encrypt_begin: ++ pushl %ebp ++ pushl %ebx ++ pushl %esi ++ pushl %edi ++ movl 20(%esp),%edi ++ movl 24(%esp),%esi ++ movl 28(%esp),%edx ++ movl 32(%esp),%ecx ++ testl $15,%edx ++ jnz .L037ofb_abort ++ testl $15,%ecx ++ jnz .L037ofb_abort ++ leal .Lpadlock_saved_context,%eax ++ pushfl ++ cld ++ call __padlock_verify_ctx ++.L038ofb_pic_point: ++ leal 16(%edx),%edx ++ xorl %eax,%eax ++ xorl %ebx,%ebx ++ testl $32,(%edx) ++ jnz .L039ofb_aligned ++ testl $15,%edi ++ setz %al ++ testl $15,%esi ++ setz %bl ++ testl %ebx,%eax ++ jnz .L039ofb_aligned ++ negl %eax ++ movl $512,%ebx ++ notl %eax ++ leal -24(%esp),%ebp ++ cmpl %ebx,%ecx ++ cmovcl %ecx,%ebx ++ andl %ebx,%eax ++ movl %ecx,%ebx ++ negl %eax ++ andl $511,%ebx ++ leal (%eax,%ebp,1),%esp ++ movl $512,%eax ++ cmovzl %eax,%ebx ++ movl %ebp,%eax ++ andl $-16,%ebp ++ andl $-16,%esp ++ movl %eax,16(%ebp) ++ jmp .L040ofb_loop ++.align 16 ++.L040ofb_loop: ++ movl %edi,(%ebp) ++ movl %esi,4(%ebp) ++ movl %ecx,8(%ebp) ++ movl %ebx,%ecx ++ movl %ebx,12(%ebp) ++ testl $15,%edi ++ cmovnzl %esp,%edi ++ testl $15,%esi ++ jz .L041ofb_inp_aligned ++ shrl $2,%ecx ++.byte 243,165 ++ subl %ebx,%edi ++ movl %ebx,%ecx ++ movl %edi,%esi ++.L041ofb_inp_aligned: ++ leal -16(%edx),%eax ++ leal 16(%edx),%ebx ++ shrl $4,%ecx ++.byte 243,15,167,232 ++ movaps (%eax),%xmm0 ++ movaps %xmm0,-16(%edx) ++ movl (%ebp),%edi ++ movl 12(%ebp),%ebx ++ testl $15,%edi ++ jz .L042ofb_out_aligned ++ movl %ebx,%ecx ++ leal (%esp),%esi ++ shrl $2,%ecx ++.byte 243,165 ++ subl %ebx,%edi ++.L042ofb_out_aligned: ++ movl 4(%ebp),%esi ++ movl 8(%ebp),%ecx ++ addl %ebx,%edi ++ addl %ebx,%esi ++ subl %ebx,%ecx ++ movl $512,%ebx ++ jnz .L040ofb_loop ++ cmpl %ebp,%esp ++ je .L043ofb_done ++ pxor %xmm0,%xmm0 ++ leal (%esp),%eax ++.L044ofb_bzero: ++ movaps %xmm0,(%eax) ++ leal 16(%eax),%eax ++ cmpl %eax,%ebp ++ ja .L044ofb_bzero ++.L043ofb_done: ++ movl 16(%ebp),%ebp ++ leal 24(%ebp),%esp ++ jmp .L045ofb_exit ++.align 16 ++.L039ofb_aligned: ++ leal -16(%edx),%eax ++ leal 16(%edx),%ebx ++ shrl $4,%ecx ++.byte 243,15,167,232 ++ movaps (%eax),%xmm0 ++ movaps %xmm0,-16(%edx) ++.L045ofb_exit: ++ movl $1,%eax ++ leal 4(%esp),%esp ++.L037ofb_abort: ++ popl %edi ++ popl %esi ++ popl %ebx ++ popl %ebp ++ ret ++.globl _padlock_ctr32_encrypt ++.def _padlock_ctr32_encrypt; .scl 2; .type 32; .endef ++.align 16 ++_padlock_ctr32_encrypt: ++.L_padlock_ctr32_encrypt_begin: ++ pushl %ebp ++ pushl %ebx ++ pushl %esi ++ pushl %edi ++ movl 20(%esp),%edi ++ movl 24(%esp),%esi ++ movl 28(%esp),%edx ++ movl 32(%esp),%ecx ++ testl $15,%edx ++ jnz .L046ctr32_abort ++ testl $15,%ecx ++ jnz .L046ctr32_abort 
++ leal .Lpadlock_saved_context,%eax ++ pushfl ++ cld ++ call __padlock_verify_ctx ++.L047ctr32_pic_point: ++ leal 16(%edx),%edx ++ xorl %eax,%eax ++ movq -16(%edx),%mm0 ++ movl $512,%ebx ++ notl %eax ++ leal -24(%esp),%ebp ++ cmpl %ebx,%ecx ++ cmovcl %ecx,%ebx ++ andl %ebx,%eax ++ movl %ecx,%ebx ++ negl %eax ++ andl $511,%ebx ++ leal (%eax,%ebp,1),%esp ++ movl $512,%eax ++ cmovzl %eax,%ebx ++ movl %ebp,%eax ++ andl $-16,%ebp ++ andl $-16,%esp ++ movl %eax,16(%ebp) ++ jmp .L048ctr32_loop ++.align 16 ++.L048ctr32_loop: ++ movl %edi,(%ebp) ++ movl %esi,4(%ebp) ++ movl %ecx,8(%ebp) ++ movl %ebx,%ecx ++ movl %ebx,12(%ebp) ++ movl -4(%edx),%ecx ++ xorl %edi,%edi ++ movl -8(%edx),%eax ++.L049ctr32_prepare: ++ movl %ecx,12(%esp,%edi,1) ++ bswap %ecx ++ movq %mm0,(%esp,%edi,1) ++ incl %ecx ++ movl %eax,8(%esp,%edi,1) ++ bswap %ecx ++ leal 16(%edi),%edi ++ cmpl %ebx,%edi ++ jb .L049ctr32_prepare ++ movl %ecx,-4(%edx) ++ leal (%esp),%esi ++ leal (%esp),%edi ++ movl %ebx,%ecx ++ leal -16(%edx),%eax ++ leal 16(%edx),%ebx ++ shrl $4,%ecx ++.byte 243,15,167,200 ++ movl (%ebp),%edi ++ movl 12(%ebp),%ebx ++ movl 4(%ebp),%esi ++ xorl %ecx,%ecx ++.L050ctr32_xor: ++ movups (%esi,%ecx,1),%xmm1 ++ leal 16(%ecx),%ecx ++ pxor -16(%esp,%ecx,1),%xmm1 ++ movups %xmm1,-16(%edi,%ecx,1) ++ cmpl %ebx,%ecx ++ jb .L050ctr32_xor ++ movl 8(%ebp),%ecx ++ addl %ebx,%edi ++ addl %ebx,%esi ++ subl %ebx,%ecx ++ movl $512,%ebx ++ jnz .L048ctr32_loop ++ pxor %xmm0,%xmm0 ++ leal (%esp),%eax ++.L051ctr32_bzero: ++ movaps %xmm0,(%eax) ++ leal 16(%eax),%eax ++ cmpl %eax,%ebp ++ ja .L051ctr32_bzero ++.L052ctr32_done: ++ movl 16(%ebp),%ebp ++ leal 24(%ebp),%esp ++ movl $1,%eax ++ leal 4(%esp),%esp ++ emms ++.L046ctr32_abort: ++ popl %edi ++ popl %esi ++ popl %ebx ++ popl %ebp ++ ret + .globl _padlock_xstore + .def _padlock_xstore; .scl 2; .type 32; .endef + .align 16 +@@ -533,10 +881,10 @@ __win32_segv_handler: + movl 4(%esp),%edx + movl 12(%esp),%ecx + cmpl $3221225477,(%edx) +- jne .L028ret ++ jne .L053ret + addl $4,184(%ecx) + movl $0,%eax +-.L028ret: ++.L053ret: + ret + .globl _padlock_sha1_oneshot + .def _padlock_sha1_oneshot; .scl 2; .type 32; .endef +diff --git a/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s b/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s +index 8f2b96f..9755951 100644 +--- a/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s ++++ b/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s +@@ -697,6 +697,7 @@ gcm_ghash_4bit: + .type gcm_init_clmul,@function + .align 16 + gcm_init_clmul: ++.L_init_clmul: + movdqu (%rsi),%xmm2 + pshufd $78,%xmm2,%xmm2 + +@@ -715,15 +716,15 @@ gcm_init_clmul: + pxor %xmm5,%xmm2 + + ++ pshufd $78,%xmm2,%xmm6 + movdqa %xmm2,%xmm0 ++ pxor %xmm2,%xmm6 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 +- pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 +- pxor %xmm2,%xmm4 + .byte 102,15,58,68,194,0 + .byte 102,15,58,68,202,17 +-.byte 102,15,58,68,220,0 ++.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + +@@ -733,44 +734,134 @@ gcm_init_clmul: + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + ++ movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 ++ psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 ++ psllq $57,%xmm0 ++ movdqa %xmm0,%xmm3 ++ pslldq $8,%xmm0 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ ++ ++ movdqa %xmm0,%xmm4 ++ psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 ++ pxor %xmm4,%xmm0 ++ psrlq $1,%xmm0 ++ pxor %xmm1,%xmm0 ++ pshufd $78,%xmm2,%xmm3 ++ pshufd $78,%xmm0,%xmm4 ++ pxor %xmm2,%xmm3 ++ movdqu %xmm2,0(%rdi) ++ pxor %xmm0,%xmm4 ++ 
movdqu %xmm0,16(%rdi) ++.byte 102,15,58,15,227,8 ++ movdqu %xmm4,32(%rdi) ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm3 ++ pxor %xmm0,%xmm3 ++.byte 102,15,58,68,194,0 ++.byte 102,15,58,68,202,17 ++.byte 102,15,58,68,222,0 ++ pxor %xmm0,%xmm3 ++ pxor %xmm1,%xmm3 ++ ++ movdqa %xmm3,%xmm4 ++ psrldq $8,%xmm3 ++ pslldq $8,%xmm4 ++ pxor %xmm3,%xmm1 ++ pxor %xmm4,%xmm0 ++ ++ movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 +- movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 +- psrldq $8,%xmm4 +- pxor %xmm3,%xmm0 +- pxor %xmm4,%xmm1 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 ++ psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 ++ pxor %xmm1,%xmm0 ++ movdqa %xmm0,%xmm5 ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm3 ++ pxor %xmm0,%xmm3 ++.byte 102,15,58,68,194,0 ++.byte 102,15,58,68,202,17 ++.byte 102,15,58,68,222,0 ++ pxor %xmm0,%xmm3 ++ pxor %xmm1,%xmm3 ++ ++ movdqa %xmm3,%xmm4 ++ psrldq $8,%xmm3 ++ pslldq $8,%xmm4 ++ pxor %xmm3,%xmm1 ++ pxor %xmm4,%xmm0 ++ ++ movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 ++ psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 ++ pxor %xmm3,%xmm0 ++ psllq $57,%xmm0 ++ movdqa %xmm0,%xmm3 ++ pslldq $8,%xmm0 ++ psrldq $8,%xmm3 + pxor %xmm4,%xmm0 +- pxor %xmm1,%xmm4 ++ pxor %xmm3,%xmm1 ++ ++ ++ movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 + pxor %xmm4,%xmm0 +- movdqu %xmm2,(%rdi) +- movdqu %xmm0,16(%rdi) ++ psrlq $1,%xmm0 ++ pxor %xmm1,%xmm0 ++ pshufd $78,%xmm5,%xmm3 ++ pshufd $78,%xmm0,%xmm4 ++ pxor %xmm5,%xmm3 ++ movdqu %xmm5,48(%rdi) ++ pxor %xmm0,%xmm4 ++ movdqu %xmm0,64(%rdi) ++.byte 102,15,58,15,227,8 ++ movdqu %xmm4,80(%rdi) + .byte 0xf3,0xc3 + .size gcm_init_clmul,.-gcm_init_clmul + .globl gcm_gmult_clmul + .type gcm_gmult_clmul,@function + .align 16 + gcm_gmult_clmul: ++.L_gmult_clmul: + movdqu (%rdi),%xmm0 + movdqa .Lbswap_mask(%rip),%xmm5 + movdqu (%rsi),%xmm2 ++ movdqu 32(%rsi),%xmm4 + .byte 102,15,56,0,197 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 +- pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 +- pxor %xmm2,%xmm4 + .byte 102,15,58,68,194,0 + .byte 102,15,58,68,202,17 + .byte 102,15,58,68,220,0 +@@ -783,186 +874,358 @@ gcm_gmult_clmul: + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + ++ movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 +- psllq $1,%xmm0 +- pxor %xmm3,%xmm0 + psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 +- movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 +- psrldq $8,%xmm4 +- pxor %xmm3,%xmm0 +- pxor %xmm4,%xmm1 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 +- psrlq $5,%xmm0 +- pxor %xmm4,%xmm0 + psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 + pxor %xmm4,%xmm0 +- pxor %xmm1,%xmm4 + psrlq $1,%xmm0 +- pxor %xmm4,%xmm0 ++ pxor %xmm1,%xmm0 + .byte 102,15,56,0,197 + movdqu %xmm0,(%rdi) + .byte 0xf3,0xc3 + .size gcm_gmult_clmul,.-gcm_gmult_clmul + .globl gcm_ghash_clmul + .type gcm_ghash_clmul,@function +-.align 16 ++.align 32 + gcm_ghash_clmul: ++.L_ghash_clmul: + movdqa .Lbswap_mask(%rip),%xmm5 ++ movq $11547335547999543296,%rax + + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm2 ++ movdqu 32(%rsi),%xmm10 + .byte 102,15,56,0,197 + + subq $16,%rcx + jz .Lodd_tail + +- movdqu 16(%rsi),%xmm8 ++ movdqu 16(%rsi),%xmm9 ++ cmpq $48,%rcx ++ jb .Lskip4x + ++ subq $48,%rcx ++ movdqu 48(%rsi),%xmm14 ++ movdqu 64(%rsi),%xmm15 
+ + + + +- movdqu (%rdx),%xmm3 +- movdqu 16(%rdx),%xmm6 +-.byte 102,15,56,0,221 ++ movdqu 48(%rdx),%xmm6 ++ movdqu 32(%rdx),%xmm11 + .byte 102,15,56,0,245 +- pxor %xmm3,%xmm0 +- movdqa %xmm6,%xmm7 +- pshufd $78,%xmm6,%xmm3 +- pshufd $78,%xmm2,%xmm4 +- pxor %xmm6,%xmm3 +- pxor %xmm2,%xmm4 ++.byte 102,68,15,56,0,221 ++ movdqa %xmm6,%xmm8 ++ pshufd $78,%xmm6,%xmm7 ++ pxor %xmm6,%xmm7 + .byte 102,15,58,68,242,0 +-.byte 102,15,58,68,250,17 +-.byte 102,15,58,68,220,0 +- pxor %xmm6,%xmm3 +- pxor %xmm7,%xmm3 ++.byte 102,68,15,58,68,194,17 ++.byte 102,65,15,58,68,250,0 + ++ movdqa %xmm11,%xmm13 ++ pshufd $78,%xmm11,%xmm12 ++ pxor %xmm11,%xmm12 ++.byte 102,69,15,58,68,217,0 ++.byte 102,69,15,58,68,233,17 ++ xorps %xmm11,%xmm6 ++.byte 102,69,15,58,68,226,16 ++ xorps %xmm13,%xmm8 ++ movups 80(%rsi),%xmm10 ++ xorps %xmm12,%xmm7 ++ ++ movdqu 16(%rdx),%xmm11 ++ movdqu 0(%rdx),%xmm3 ++.byte 102,68,15,56,0,221 ++.byte 102,15,56,0,221 ++ movdqa %xmm11,%xmm13 ++ pshufd $78,%xmm11,%xmm12 ++ pxor %xmm3,%xmm0 ++ pxor %xmm11,%xmm12 ++.byte 102,69,15,58,68,222,0 ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm3 ++ pxor %xmm0,%xmm3 ++.byte 102,69,15,58,68,238,17 ++ xorps %xmm11,%xmm6 ++.byte 102,69,15,58,68,226,0 ++ xorps %xmm13,%xmm8 ++ ++ leaq 64(%rdx),%rdx ++ subq $64,%rcx ++ jc .Ltail4x ++ ++ jmp .Lmod4_loop ++.align 32 ++.Lmod4_loop: ++.byte 102,65,15,58,68,199,0 ++ xorps %xmm12,%xmm7 ++ movdqu 48(%rdx),%xmm11 ++.byte 102,68,15,56,0,221 ++.byte 102,65,15,58,68,207,17 ++ xorps %xmm6,%xmm0 ++ movdqu 32(%rdx),%xmm6 ++ movdqa %xmm11,%xmm13 ++ pshufd $78,%xmm11,%xmm12 ++.byte 102,65,15,58,68,218,16 ++ xorps %xmm8,%xmm1 ++ pxor %xmm11,%xmm12 ++.byte 102,15,56,0,245 ++ movups 32(%rsi),%xmm10 ++.byte 102,68,15,58,68,218,0 ++ xorps %xmm7,%xmm3 ++ movdqa %xmm6,%xmm8 ++ pshufd $78,%xmm6,%xmm7 ++ ++ pxor %xmm0,%xmm3 ++ pxor %xmm6,%xmm7 ++ pxor %xmm1,%xmm3 + movdqa %xmm3,%xmm4 +- psrldq $8,%xmm3 ++ pslldq $8,%xmm3 ++.byte 102,68,15,58,68,234,17 ++ psrldq $8,%xmm4 ++ pxor %xmm3,%xmm0 ++ movdqa .L7_mask(%rip),%xmm3 ++ pxor %xmm4,%xmm1 ++.byte 102,72,15,110,224 ++ ++ pand %xmm0,%xmm3 ++.byte 102,15,56,0,227 ++.byte 102,69,15,58,68,226,0 ++ pxor %xmm0,%xmm4 ++ psllq $57,%xmm4 ++ movdqa %xmm4,%xmm3 + pslldq $8,%xmm4 +- pxor %xmm3,%xmm7 +- pxor %xmm4,%xmm6 ++.byte 102,65,15,58,68,241,0 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ movdqu 0(%rdx),%xmm3 ++ ++ movdqa %xmm0,%xmm4 ++ psrlq $1,%xmm0 ++.byte 102,69,15,58,68,193,17 ++ xorps %xmm11,%xmm6 ++ movdqu 16(%rdx),%xmm11 ++.byte 102,68,15,56,0,221 ++.byte 102,65,15,58,68,250,16 ++ xorps %xmm13,%xmm8 ++ movups 80(%rsi),%xmm10 ++.byte 102,15,56,0,221 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 ++ ++ movdqa %xmm11,%xmm13 ++ pxor %xmm12,%xmm7 ++ pshufd $78,%xmm11,%xmm12 ++ pxor %xmm11,%xmm12 ++.byte 102,69,15,58,68,222,0 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ psrlq $1,%xmm0 ++.byte 102,69,15,58,68,238,17 ++ xorps %xmm11,%xmm6 ++ pxor %xmm1,%xmm0 ++ ++.byte 102,69,15,58,68,226,0 ++ xorps %xmm13,%xmm8 ++ + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 +- pshufd $78,%xmm8,%xmm4 + pxor %xmm0,%xmm3 +- pxor %xmm8,%xmm4 + +- leaq 32(%rdx),%rdx +- subq $32,%rcx +- jbe .Leven_tail ++ leaq 64(%rdx),%rdx ++ subq $64,%rcx ++ jnc .Lmod4_loop ++ ++.Ltail4x: ++.byte 102,65,15,58,68,199,0 ++ xorps %xmm12,%xmm7 ++.byte 102,65,15,58,68,207,17 ++ xorps %xmm6,%xmm0 ++.byte 102,65,15,58,68,218,16 ++ xorps %xmm8,%xmm1 ++ pxor %xmm0,%xmm1 ++ pxor %xmm7,%xmm3 + +-.Lmod_loop: +-.byte 102,65,15,58,68,192,0 +-.byte 102,65,15,58,68,200,17 +-.byte 102,15,58,68,220,0 +- pxor 
%xmm0,%xmm3 + pxor %xmm1,%xmm3 ++ pxor %xmm0,%xmm1 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 +- movdqu (%rdx),%xmm3 +- pxor %xmm6,%xmm0 +- pxor %xmm7,%xmm1 + ++ movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 ++ psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 ++ pxor %xmm3,%xmm0 ++ psllq $57,%xmm0 ++ movdqa %xmm0,%xmm3 ++ pslldq $8,%xmm0 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ ++ ++ movdqa %xmm0,%xmm4 ++ psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 ++ pxor %xmm4,%xmm0 ++ psrlq $1,%xmm0 ++ pxor %xmm1,%xmm0 ++ addq $64,%rcx ++ jz .Ldone ++ movdqu 32(%rsi),%xmm10 ++ subq $16,%rcx ++ jz .Lodd_tail ++.Lskip4x: ++ ++ ++ ++ ++ ++ movdqu (%rdx),%xmm3 + movdqu 16(%rdx),%xmm6 + .byte 102,15,56,0,221 + .byte 102,15,56,0,245 ++ pxor %xmm3,%xmm0 ++ ++ movdqa %xmm6,%xmm8 ++ pshufd $78,%xmm6,%xmm3 ++ pxor %xmm6,%xmm3 ++.byte 102,15,58,68,242,0 ++.byte 102,68,15,58,68,194,17 ++.byte 102,65,15,58,68,218,0 ++ ++ leaq 32(%rdx),%rdx ++ subq $32,%rcx ++ jbe .Leven_tail ++ jmp .Lmod_loop + +- movdqa %xmm6,%xmm7 +- pshufd $78,%xmm6,%xmm9 +- pshufd $78,%xmm2,%xmm10 +- pxor %xmm6,%xmm9 +- pxor %xmm2,%xmm10 ++.align 32 ++.Lmod_loop: ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm4 ++ pxor %xmm0,%xmm4 ++ ++.byte 102,65,15,58,68,193,0 ++.byte 102,65,15,58,68,201,17 ++.byte 102,65,15,58,68,226,16 ++ ++ pxor %xmm6,%xmm0 ++ pxor %xmm8,%xmm1 ++ movdqu (%rdx),%xmm8 ++.byte 102,68,15,56,0,197 ++ movdqu 16(%rdx),%xmm6 ++ ++ pxor %xmm0,%xmm3 ++ pxor %xmm1,%xmm3 ++ pxor %xmm8,%xmm1 ++ pxor %xmm3,%xmm4 ++.byte 102,15,56,0,245 ++ movdqa %xmm4,%xmm3 ++ psrldq $8,%xmm3 ++ pslldq $8,%xmm4 + pxor %xmm3,%xmm1 ++ pxor %xmm4,%xmm0 + ++ movdqa %xmm6,%xmm8 ++ ++ movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 +- psllq $1,%xmm0 +- pxor %xmm3,%xmm0 + psllq $5,%xmm0 +- pxor %xmm3,%xmm0 + .byte 102,15,58,68,242,0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 ++ pxor %xmm3,%xmm0 + psllq $57,%xmm0 +- movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 +- psrldq $8,%xmm4 +- pxor %xmm3,%xmm0 +- pxor %xmm4,%xmm1 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ pshufd $78,%xmm8,%xmm3 ++ pxor %xmm8,%xmm3 + +-.byte 102,15,58,68,250,17 ++.byte 102,68,15,58,68,194,17 + movdqa %xmm0,%xmm4 +- psrlq $5,%xmm0 +- pxor %xmm4,%xmm0 + psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 + pxor %xmm4,%xmm0 +- pxor %xmm1,%xmm4 + psrlq $1,%xmm0 +- pxor %xmm4,%xmm0 +- +-.byte 102,69,15,58,68,202,0 +- movdqa %xmm0,%xmm1 +- pshufd $78,%xmm0,%xmm3 +- pshufd $78,%xmm8,%xmm4 +- pxor %xmm0,%xmm3 +- pxor %xmm8,%xmm4 +- +- pxor %xmm6,%xmm9 +- pxor %xmm7,%xmm9 +- movdqa %xmm9,%xmm10 +- psrldq $8,%xmm9 +- pslldq $8,%xmm10 +- pxor %xmm9,%xmm7 +- pxor %xmm10,%xmm6 ++.byte 102,65,15,58,68,218,0 ++ pxor %xmm1,%xmm0 + + leaq 32(%rdx),%rdx + subq $32,%rcx + ja .Lmod_loop + + .Leven_tail: +-.byte 102,65,15,58,68,192,0 +-.byte 102,65,15,58,68,200,17 +-.byte 102,15,58,68,220,0 ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm4 ++ pxor %xmm0,%xmm4 ++ ++.byte 102,65,15,58,68,193,0 ++.byte 102,65,15,58,68,201,17 ++.byte 102,65,15,58,68,226,16 ++ ++ pxor %xmm6,%xmm0 ++ pxor %xmm8,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 +- +- movdqa %xmm3,%xmm4 ++ pxor %xmm3,%xmm4 ++ movdqa %xmm4,%xmm3 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 +- pxor %xmm6,%xmm0 +- pxor %xmm7,%xmm1 + ++ movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 +- psllq $1,%xmm0 +- pxor %xmm3,%xmm0 + psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq 
$57,%xmm0 +- movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 +- psrldq $8,%xmm4 +- pxor %xmm3,%xmm0 +- pxor %xmm4,%xmm1 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 +- psrlq $5,%xmm0 +- pxor %xmm4,%xmm0 + psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 + pxor %xmm4,%xmm0 +- pxor %xmm1,%xmm4 + psrlq $1,%xmm0 +- pxor %xmm4,%xmm0 ++ pxor %xmm1,%xmm0 + testq %rcx,%rcx + jnz .Ldone + +@@ -972,12 +1235,10 @@ gcm_ghash_clmul: + pxor %xmm3,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 +- pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 +- pxor %xmm2,%xmm4 + .byte 102,15,58,68,194,0 + .byte 102,15,58,68,202,17 +-.byte 102,15,58,68,220,0 ++.byte 102,65,15,58,68,218,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + +@@ -987,38 +1248,60 @@ gcm_ghash_clmul: + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + ++ movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 +- psllq $1,%xmm0 +- pxor %xmm3,%xmm0 + psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 +- movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 +- psrldq $8,%xmm4 +- pxor %xmm3,%xmm0 +- pxor %xmm4,%xmm1 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 +- psrlq $5,%xmm0 +- pxor %xmm4,%xmm0 + psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 + pxor %xmm4,%xmm0 +- pxor %xmm1,%xmm4 + psrlq $1,%xmm0 +- pxor %xmm4,%xmm0 ++ pxor %xmm1,%xmm0 + .Ldone: + .byte 102,15,56,0,197 + movdqu %xmm0,(%rdi) + .byte 0xf3,0xc3 +-.LSEH_end_gcm_ghash_clmul: + .size gcm_ghash_clmul,.-gcm_ghash_clmul ++.globl gcm_init_avx ++.type gcm_init_avx,@function ++.align 32 ++gcm_init_avx: ++ jmp .L_init_clmul ++.size gcm_init_avx,.-gcm_init_avx ++.globl gcm_gmult_avx ++.type gcm_gmult_avx,@function ++.align 32 ++gcm_gmult_avx: ++ jmp .L_gmult_clmul ++.size gcm_gmult_avx,.-gcm_gmult_avx ++.globl gcm_ghash_avx ++.type gcm_ghash_avx,@function ++.align 32 ++gcm_ghash_avx: ++ jmp .L_ghash_clmul ++.size gcm_ghash_avx,.-gcm_ghash_avx + .align 64 + .Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 + .L0x1c2_polynomial: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 ++.L7_mask: ++.long 7,0,7,0 ++.L7_mask_poly: ++.long 7,0,450,0 + .align 64 + .type .Lrem_4bit,@object + .Lrem_4bit: +diff --git a/lib/accelerated/x86/elf/appro-aes-x86-64.s b/lib/accelerated/x86/elf/appro-aes-x86-64.s +index f48666f..d3734a6 100644 +--- a/lib/accelerated/x86/elf/appro-aes-x86-64.s ++++ b/lib/accelerated/x86/elf/appro-aes-x86-64.s +@@ -925,199 +925,412 @@ aesni_ccm64_decrypt_blocks: + .type aesni_ctr32_encrypt_blocks,@function + .align 16 + aesni_ctr32_encrypt_blocks: ++ leaq (%rsp),%rax ++ pushq %rbp ++ subq $128,%rsp ++ andq $-16,%rsp ++ leaq -8(%rax),%rbp ++ + cmpq $1,%rdx + je .Lctr32_one_shortcut + +- movdqu (%r8),%xmm14 +- movdqa .Lbswap_mask(%rip),%xmm15 +- xorl %eax,%eax +-.byte 102,69,15,58,22,242,3 +-.byte 102,68,15,58,34,240,3 ++ movdqu (%r8),%xmm2 ++ movdqu (%rcx),%xmm0 ++ movl 12(%r8),%r8d ++ pxor %xmm0,%xmm2 ++ movl 12(%rcx),%r11d ++ movdqa %xmm2,0(%rsp) ++ bswapl %r8d ++ movdqa %xmm2,%xmm3 ++ movdqa %xmm2,%xmm4 ++ movdqa %xmm2,%xmm5 ++ movdqa %xmm2,64(%rsp) ++ movdqa %xmm2,80(%rsp) ++ movdqa %xmm2,96(%rsp) ++ movdqa %xmm2,112(%rsp) + + movl 240(%rcx),%eax ++ ++ leaq 1(%r8),%r9 ++ leaq 2(%r8),%r10 ++ bswapl %r9d + bswapl %r10d +- pxor %xmm12,%xmm12 +- pxor %xmm13,%xmm13 +-.byte 102,69,15,58,34,226,0 +- leaq 3(%r10),%r11 +-.byte 102,69,15,58,34,235,0 +- incl %r10d +-.byte 102,69,15,58,34,226,1 +- incq %r11 +-.byte 102,69,15,58,34,235,1 +- incl 
%r10d +-.byte 102,69,15,58,34,226,2 +- incq %r11 +-.byte 102,69,15,58,34,235,2 +- movdqa %xmm12,-40(%rsp) +-.byte 102,69,15,56,0,231 +- movdqa %xmm13,-24(%rsp) +-.byte 102,69,15,56,0,239 +- +- pshufd $192,%xmm12,%xmm2 +- pshufd $128,%xmm12,%xmm3 +- pshufd $64,%xmm12,%xmm4 +- cmpq $6,%rdx +- jb .Lctr32_tail +- shrl $1,%eax +- movq %rcx,%r11 +- movl %eax,%r10d +- subq $6,%rdx +- jmp .Lctr32_loop6 ++ xorl %r11d,%r9d ++ xorl %r11d,%r10d ++.byte 102,65,15,58,34,217,3 ++ leaq 3(%r8),%r9 ++ movdqa %xmm3,16(%rsp) ++.byte 102,65,15,58,34,226,3 ++ bswapl %r9d ++ leaq 4(%r8),%r10 ++ movdqa %xmm4,32(%rsp) ++ xorl %r11d,%r9d ++ bswapl %r10d ++.byte 102,65,15,58,34,233,3 ++ xorl %r11d,%r10d ++ movdqa %xmm5,48(%rsp) ++ leaq 5(%r8),%r9 ++ movl %r10d,64+12(%rsp) ++ bswapl %r9d ++ leaq 6(%r8),%r10 ++ xorl %r11d,%r9d ++ bswapl %r10d ++ movl %r9d,80+12(%rsp) ++ xorl %r11d,%r10d ++ leaq 7(%r8),%r9 ++ movl %r10d,96+12(%rsp) ++ bswapl %r9d ++ xorl %r11d,%r9d ++ movl %r9d,112+12(%rsp) + +-.align 16 +-.Lctr32_loop6: +- pshufd $192,%xmm13,%xmm5 +- por %xmm14,%xmm2 +- movups (%r11),%xmm0 +- pshufd $128,%xmm13,%xmm6 +- por %xmm14,%xmm3 +- movups 16(%r11),%xmm1 +- pshufd $64,%xmm13,%xmm7 +- por %xmm14,%xmm4 +- por %xmm14,%xmm5 +- xorps %xmm0,%xmm2 +- por %xmm14,%xmm6 +- por %xmm14,%xmm7 ++ movups 16(%rcx),%xmm1 + ++ movdqa 64(%rsp),%xmm6 ++ movdqa 80(%rsp),%xmm7 + ++ cmpq $8,%rdx ++ jb .Lctr32_tail + ++ leaq 128(%rcx),%rcx ++ subq $8,%rdx ++ jmp .Lctr32_loop8 + +- pxor %xmm0,%xmm3 ++.align 32 ++.Lctr32_loop8: ++ addl $8,%r8d ++ movdqa 96(%rsp),%xmm8 + .byte 102,15,56,220,209 +- leaq 32(%r11),%rcx +- pxor %xmm0,%xmm4 ++ movl %r8d,%r9d ++ movdqa 112(%rsp),%xmm9 + .byte 102,15,56,220,217 +- movdqa .Lincrement32(%rip),%xmm13 +- pxor %xmm0,%xmm5 ++ bswapl %r9d ++ movups 32-128(%rcx),%xmm0 + .byte 102,15,56,220,225 +- movdqa -40(%rsp),%xmm12 +- pxor %xmm0,%xmm6 ++ xorl %r11d,%r9d + .byte 102,15,56,220,233 +- pxor %xmm0,%xmm7 +- movups (%rcx),%xmm0 +- decl %eax ++ movl %r9d,0+12(%rsp) ++ leaq 1(%r8),%r9 + .byte 102,15,56,220,241 + .byte 102,15,56,220,249 +- jmp .Lctr32_enc_loop6_enter +-.align 16 +-.Lctr32_enc_loop6: ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 48-128(%rcx),%xmm1 ++.byte 102,15,56,220,208 ++.byte 102,15,56,220,216 ++ bswapl %r9d ++.byte 102,15,56,220,224 ++ xorl %r11d,%r9d ++.byte 102,15,56,220,232 ++ movl %r9d,16+12(%rsp) ++ leaq 2(%r8),%r9 ++.byte 102,15,56,220,240 ++.byte 102,15,56,220,248 ++.byte 102,68,15,56,220,192 ++.byte 102,68,15,56,220,200 ++ movups 64-128(%rcx),%xmm0 + .byte 102,15,56,220,209 + .byte 102,15,56,220,217 +- decl %eax ++ bswapl %r9d + .byte 102,15,56,220,225 ++ xorl %r11d,%r9d + .byte 102,15,56,220,233 ++ movl %r9d,32+12(%rsp) ++ leaq 3(%r8),%r9 + .byte 102,15,56,220,241 + .byte 102,15,56,220,249 +-.Lctr32_enc_loop6_enter: +- movups 16(%rcx),%xmm1 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 80-128(%rcx),%xmm1 + .byte 102,15,56,220,208 + .byte 102,15,56,220,216 +- leaq 32(%rcx),%rcx ++ bswapl %r9d + .byte 102,15,56,220,224 ++ xorl %r11d,%r9d + .byte 102,15,56,220,232 ++ movl %r9d,48+12(%rsp) ++ leaq 4(%r8),%r9 + .byte 102,15,56,220,240 + .byte 102,15,56,220,248 +- movups (%rcx),%xmm0 +- jnz .Lctr32_enc_loop6 ++.byte 102,68,15,56,220,192 ++.byte 102,68,15,56,220,200 ++ movups 96-128(%rcx),%xmm0 ++.byte 102,15,56,220,209 ++.byte 102,15,56,220,217 ++ bswapl %r9d ++.byte 102,15,56,220,225 ++ xorl %r11d,%r9d ++.byte 102,15,56,220,233 ++ movl %r9d,64+12(%rsp) ++ leaq 5(%r8),%r9 ++.byte 102,15,56,220,241 ++.byte 102,15,56,220,249 ++.byte 
102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 112-128(%rcx),%xmm1 ++.byte 102,15,56,220,208 ++.byte 102,15,56,220,216 ++ bswapl %r9d ++.byte 102,15,56,220,224 ++ xorl %r11d,%r9d ++.byte 102,15,56,220,232 ++ movl %r9d,80+12(%rsp) ++ leaq 6(%r8),%r9 ++.byte 102,15,56,220,240 ++.byte 102,15,56,220,248 ++.byte 102,68,15,56,220,192 ++.byte 102,68,15,56,220,200 ++ movups 128-128(%rcx),%xmm0 ++.byte 102,15,56,220,209 ++.byte 102,15,56,220,217 ++ bswapl %r9d ++.byte 102,15,56,220,225 ++ xorl %r11d,%r9d ++.byte 102,15,56,220,233 ++ movl %r9d,96+12(%rsp) ++ leaq 7(%r8),%r9 ++.byte 102,15,56,220,241 ++.byte 102,15,56,220,249 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 144-128(%rcx),%xmm1 ++.byte 102,15,56,220,208 ++.byte 102,15,56,220,216 ++ bswapl %r9d ++.byte 102,15,56,220,224 ++ xorl %r11d,%r9d ++.byte 102,15,56,220,232 ++ movl %r9d,112+12(%rsp) ++.byte 102,15,56,220,240 ++.byte 102,15,56,220,248 ++.byte 102,68,15,56,220,192 ++ movdqu 0(%rdi),%xmm10 ++.byte 102,68,15,56,220,200 ++ movups 160-128(%rcx),%xmm0 ++ ++ cmpl $11,%eax ++ jb .Lctr32_enc_done + + .byte 102,15,56,220,209 +- paddd %xmm13,%xmm12 + .byte 102,15,56,220,217 +- paddd -24(%rsp),%xmm13 + .byte 102,15,56,220,225 +- movdqa %xmm12,-40(%rsp) + .byte 102,15,56,220,233 +- movdqa %xmm13,-24(%rsp) + .byte 102,15,56,220,241 +-.byte 102,69,15,56,0,231 + .byte 102,15,56,220,249 +-.byte 102,69,15,56,0,239 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 176-128(%rcx),%xmm1 + +-.byte 102,15,56,221,208 +- movups (%rdi),%xmm8 +-.byte 102,15,56,221,216 +- movups 16(%rdi),%xmm9 +-.byte 102,15,56,221,224 +- movups 32(%rdi),%xmm10 +-.byte 102,15,56,221,232 +- movups 48(%rdi),%xmm11 +-.byte 102,15,56,221,240 +- movups 64(%rdi),%xmm1 +-.byte 102,15,56,221,248 +- movups 80(%rdi),%xmm0 +- leaq 96(%rdi),%rdi ++.byte 102,15,56,220,208 ++.byte 102,15,56,220,216 ++.byte 102,15,56,220,224 ++.byte 102,15,56,220,232 ++.byte 102,15,56,220,240 ++.byte 102,15,56,220,248 ++.byte 102,68,15,56,220,192 ++.byte 102,68,15,56,220,200 ++ movups 192-128(%rcx),%xmm0 ++ je .Lctr32_enc_done + +- xorps %xmm2,%xmm8 +- pshufd $192,%xmm12,%xmm2 +- xorps %xmm3,%xmm9 +- pshufd $128,%xmm12,%xmm3 +- movups %xmm8,(%rsi) +- xorps %xmm4,%xmm10 +- pshufd $64,%xmm12,%xmm4 +- movups %xmm9,16(%rsi) +- xorps %xmm5,%xmm11 +- movups %xmm10,32(%rsi) +- xorps %xmm6,%xmm1 +- movups %xmm11,48(%rsi) +- xorps %xmm7,%xmm0 +- movups %xmm1,64(%rsi) +- movups %xmm0,80(%rsi) +- leaq 96(%rsi),%rsi +- movl %r10d,%eax +- subq $6,%rdx +- jnc .Lctr32_loop6 ++.byte 102,15,56,220,209 ++.byte 102,15,56,220,217 ++.byte 102,15,56,220,225 ++.byte 102,15,56,220,233 ++.byte 102,15,56,220,241 ++.byte 102,15,56,220,249 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 208-128(%rcx),%xmm1 ++ ++.byte 102,15,56,220,208 ++.byte 102,15,56,220,216 ++.byte 102,15,56,220,224 ++.byte 102,15,56,220,232 ++.byte 102,15,56,220,240 ++.byte 102,15,56,220,248 ++.byte 102,68,15,56,220,192 ++.byte 102,68,15,56,220,200 ++ movups 224-128(%rcx),%xmm0 ++ ++.Lctr32_enc_done: ++ movdqu 16(%rdi),%xmm11 ++ pxor %xmm0,%xmm10 ++ movdqu 32(%rdi),%xmm12 ++ pxor %xmm0,%xmm11 ++ movdqu 48(%rdi),%xmm13 ++ pxor %xmm0,%xmm12 ++ movdqu 64(%rdi),%xmm14 ++ pxor %xmm0,%xmm13 ++ movdqu 80(%rdi),%xmm15 ++ pxor %xmm0,%xmm14 ++.byte 102,15,56,220,209 ++ pxor %xmm0,%xmm15 ++.byte 102,15,56,220,217 ++.byte 102,15,56,220,225 ++.byte 102,15,56,220,233 ++.byte 102,15,56,220,241 ++.byte 102,15,56,220,249 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movdqu 96(%rdi),%xmm1 ++ 
++.byte 102,65,15,56,221,210 ++ pxor %xmm0,%xmm1 ++ movdqu 112(%rdi),%xmm10 ++ leaq 128(%rdi),%rdi ++.byte 102,65,15,56,221,219 ++ pxor %xmm0,%xmm10 ++ movdqa 0(%rsp),%xmm11 ++.byte 102,65,15,56,221,228 ++ movdqa 16(%rsp),%xmm12 ++.byte 102,65,15,56,221,237 ++ movdqa 32(%rsp),%xmm13 ++.byte 102,65,15,56,221,246 ++ movdqa 48(%rsp),%xmm14 ++.byte 102,65,15,56,221,255 ++ movdqa 64(%rsp),%xmm15 ++.byte 102,68,15,56,221,193 ++ movdqa 80(%rsp),%xmm0 ++.byte 102,69,15,56,221,202 ++ movups 16-128(%rcx),%xmm1 ++ ++ movups %xmm2,(%rsi) ++ movdqa %xmm11,%xmm2 ++ movups %xmm3,16(%rsi) ++ movdqa %xmm12,%xmm3 ++ movups %xmm4,32(%rsi) ++ movdqa %xmm13,%xmm4 ++ movups %xmm5,48(%rsi) ++ movdqa %xmm14,%xmm5 ++ movups %xmm6,64(%rsi) ++ movdqa %xmm15,%xmm6 ++ movups %xmm7,80(%rsi) ++ movdqa %xmm0,%xmm7 ++ movups %xmm8,96(%rsi) ++ movups %xmm9,112(%rsi) ++ leaq 128(%rsi),%rsi ++ ++ subq $8,%rdx ++ jnc .Lctr32_loop8 + +- addq $6,%rdx ++ addq $8,%rdx + jz .Lctr32_done +- movq %r11,%rcx +- leal 1(%rax,%rax,1),%eax ++ leaq -128(%rcx),%rcx + + .Lctr32_tail: +- por %xmm14,%xmm2 +- movups (%rdi),%xmm8 +- cmpq $2,%rdx +- jb .Lctr32_one ++ leaq 16(%rcx),%rcx ++ cmpq $4,%rdx ++ jb .Lctr32_loop3 ++ je .Lctr32_loop4 + +- por %xmm14,%xmm3 +- movups 16(%rdi),%xmm9 +- je .Lctr32_two ++ movdqa 96(%rsp),%xmm8 ++ pxor %xmm9,%xmm9 + +- pshufd $192,%xmm13,%xmm5 +- por %xmm14,%xmm4 +- movups 32(%rdi),%xmm10 +- cmpq $4,%rdx +- jb .Lctr32_three ++ movups 16(%rcx),%xmm0 ++.byte 102,15,56,220,209 ++ leaq 16(%rcx),%rcx ++.byte 102,15,56,220,217 ++ shrl $1,%eax ++.byte 102,15,56,220,225 ++ decl %eax ++.byte 102,15,56,220,233 ++ movups (%rdi),%xmm10 ++.byte 102,15,56,220,241 ++ movups 16(%rdi),%xmm11 ++.byte 102,15,56,220,249 ++ movups 32(%rdi),%xmm12 ++.byte 102,68,15,56,220,193 ++ movups 16(%rcx),%xmm1 + +- pshufd $128,%xmm13,%xmm6 +- por %xmm14,%xmm5 +- movups 48(%rdi),%xmm11 +- je .Lctr32_four ++ call .Lenc_loop8_enter + +- por %xmm14,%xmm6 +- xorps %xmm7,%xmm7 ++ movdqu 48(%rdi),%xmm13 ++ pxor %xmm10,%xmm2 ++ movdqu 64(%rdi),%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm4,32(%rsi) ++ pxor %xmm10,%xmm6 ++ movdqu %xmm5,48(%rsi) ++ movdqu %xmm6,64(%rsi) ++ cmpq $6,%rdx ++ jb .Lctr32_done + +- call _aesni_encrypt6 ++ movups 80(%rdi),%xmm11 ++ xorps %xmm11,%xmm7 ++ movups %xmm7,80(%rsi) ++ je .Lctr32_done + +- movups 64(%rdi),%xmm1 +- xorps %xmm2,%xmm8 +- xorps %xmm3,%xmm9 +- movups %xmm8,(%rsi) +- xorps %xmm4,%xmm10 +- movups %xmm9,16(%rsi) +- xorps %xmm5,%xmm11 +- movups %xmm10,32(%rsi) +- xorps %xmm6,%xmm1 +- movups %xmm11,48(%rsi) +- movups %xmm1,64(%rsi) ++ movups 96(%rdi),%xmm12 ++ xorps %xmm12,%xmm8 ++ movups %xmm8,96(%rsi) ++ jmp .Lctr32_done ++ ++.align 32 ++.Lctr32_loop4: ++.byte 102,15,56,220,209 ++ leaq 16(%rcx),%rcx ++.byte 102,15,56,220,217 ++.byte 102,15,56,220,225 ++.byte 102,15,56,220,233 ++ movups (%rcx),%xmm1 ++ decl %eax ++ jnz .Lctr32_loop4 ++.byte 102,15,56,221,209 ++ movups (%rdi),%xmm10 ++.byte 102,15,56,221,217 ++ movups 16(%rdi),%xmm11 ++.byte 102,15,56,221,225 ++ movups 32(%rdi),%xmm12 ++.byte 102,15,56,221,233 ++ movups 48(%rdi),%xmm13 ++ ++ xorps %xmm10,%xmm2 ++ movups %xmm2,(%rsi) ++ xorps %xmm11,%xmm3 ++ movups %xmm3,16(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm4,32(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm5,48(%rsi) ++ jmp .Lctr32_done ++ ++.align 32 ++.Lctr32_loop3: ++.byte 102,15,56,220,209 ++ leaq 16(%rcx),%rcx ++.byte 102,15,56,220,217 ++.byte 102,15,56,220,225 ++ movups (%rcx),%xmm1 ++ decl %eax ++ jnz .Lctr32_loop3 
++.byte 102,15,56,221,209 ++.byte 102,15,56,221,217 ++.byte 102,15,56,221,225 ++ ++ movups (%rdi),%xmm10 ++ xorps %xmm10,%xmm2 ++ movups %xmm2,(%rsi) ++ cmpq $2,%rdx ++ jb .Lctr32_done ++ ++ movups 16(%rdi),%xmm11 ++ xorps %xmm11,%xmm3 ++ movups %xmm3,16(%rsi) ++ je .Lctr32_done ++ ++ movups 32(%rdi),%xmm12 ++ xorps %xmm12,%xmm4 ++ movups %xmm4,32(%rsi) + jmp .Lctr32_done + + .align 16 + .Lctr32_one_shortcut: + movups (%r8),%xmm2 +- movups (%rdi),%xmm8 ++ movups (%rdi),%xmm10 + movl 240(%rcx),%eax +-.Lctr32_one: + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx +@@ -1129,51 +1342,26 @@ aesni_ctr32_encrypt_blocks: + leaq 16(%rcx),%rcx + jnz .Loop_enc1_7 + .byte 102,15,56,221,209 +- xorps %xmm2,%xmm8 +- movups %xmm8,(%rsi) +- jmp .Lctr32_done +- +-.align 16 +-.Lctr32_two: +- xorps %xmm4,%xmm4 +- call _aesni_encrypt3 +- xorps %xmm2,%xmm8 +- xorps %xmm3,%xmm9 +- movups %xmm8,(%rsi) +- movups %xmm9,16(%rsi) +- jmp .Lctr32_done +- +-.align 16 +-.Lctr32_three: +- call _aesni_encrypt3 +- xorps %xmm2,%xmm8 +- xorps %xmm3,%xmm9 +- movups %xmm8,(%rsi) +- xorps %xmm4,%xmm10 +- movups %xmm9,16(%rsi) +- movups %xmm10,32(%rsi) ++ xorps %xmm10,%xmm2 ++ movups %xmm2,(%rsi) + jmp .Lctr32_done + + .align 16 +-.Lctr32_four: +- call _aesni_encrypt4 +- xorps %xmm2,%xmm8 +- xorps %xmm3,%xmm9 +- movups %xmm8,(%rsi) +- xorps %xmm4,%xmm10 +- movups %xmm9,16(%rsi) +- xorps %xmm5,%xmm11 +- movups %xmm10,32(%rsi) +- movups %xmm11,48(%rsi) +- + .Lctr32_done: ++ leaq (%rbp),%rsp ++ popq %rbp ++.Lctr32_epilogue: + .byte 0xf3,0xc3 + .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks + .globl aesni_xts_encrypt + .type aesni_xts_encrypt,@function + .align 16 + aesni_xts_encrypt: +- leaq -104(%rsp),%rsp ++ leaq (%rsp),%rax ++ pushq %rbp ++ subq $112,%rsp ++ andq $-16,%rsp ++ leaq -8(%rax),%rbp + movups (%r9),%xmm15 + movl 240(%r8),%eax + movl 240(%rcx),%r10d +@@ -1188,228 +1376,266 @@ aesni_xts_encrypt: + leaq 16(%r8),%r8 + jnz .Loop_enc1_8 + .byte 102,68,15,56,221,249 ++ movups (%rcx),%xmm0 + movq %rcx,%r11 + movl %r10d,%eax ++ shll $4,%r10d + movq %rdx,%r9 + andq $-16,%rdx + ++ movups 16(%rcx,%r10,1),%xmm1 ++ movl %eax,%r10d ++ + movdqa .Lxts_magic(%rip),%xmm8 +- pxor %xmm14,%xmm14 +- pcmpgtd %xmm15,%xmm14 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pshufd $95,%xmm15,%xmm9 ++ pxor %xmm0,%xmm1 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm10 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm10 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm11 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm11 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm12 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm12 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm13 ++ psrad $31,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm13 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm15,%xmm14 ++ psrad $31,%xmm9 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 ++ pxor %xmm0,%xmm14 + pxor %xmm9,%xmm15 ++ movaps %xmm1,96(%rsp) ++ + 
subq $96,%rdx + jc .Lxts_enc_short + + shrl $1,%eax +- subl $1,%eax ++ subl $3,%eax ++ movups 16(%r11),%xmm1 + movl %eax,%r10d ++ leaq .Lxts_magic(%rip),%r8 + jmp .Lxts_enc_grandloop + +-.align 16 ++.align 32 + .Lxts_enc_grandloop: +- pshufd $19,%xmm14,%xmm9 +- movdqa %xmm15,%xmm14 +- paddq %xmm15,%xmm15 + movdqu 0(%rdi),%xmm2 +- pand %xmm8,%xmm9 ++ movdqa %xmm0,%xmm8 + movdqu 16(%rdi),%xmm3 +- pxor %xmm9,%xmm15 +- +- movdqu 32(%rdi),%xmm4 + pxor %xmm10,%xmm2 +- movdqu 48(%rdi),%xmm5 ++ movdqu 32(%rdi),%xmm4 + pxor %xmm11,%xmm3 +- movdqu 64(%rdi),%xmm6 ++.byte 102,15,56,220,209 ++ movdqu 48(%rdi),%xmm5 + pxor %xmm12,%xmm4 +- movdqu 80(%rdi),%xmm7 +- leaq 96(%rdi),%rdi ++.byte 102,15,56,220,217 ++ movdqu 64(%rdi),%xmm6 + pxor %xmm13,%xmm5 +- movups (%r11),%xmm0 ++.byte 102,15,56,220,225 ++ movdqu 80(%rdi),%xmm7 ++ pxor %xmm15,%xmm8 ++ movdqa 96(%rsp),%xmm9 + pxor %xmm14,%xmm6 +- pxor %xmm15,%xmm7 +- +- ++.byte 102,15,56,220,233 ++ movups 32(%r11),%xmm0 ++ leaq 96(%rdi),%rdi ++ pxor %xmm8,%xmm7 + +- movups 16(%r11),%xmm1 +- pxor %xmm0,%xmm2 +- pxor %xmm0,%xmm3 ++ pxor %xmm9,%xmm10 ++.byte 102,15,56,220,241 ++ pxor %xmm9,%xmm11 + movdqa %xmm10,0(%rsp) +-.byte 102,15,56,220,209 +- leaq 32(%r11),%rcx +- pxor %xmm0,%xmm4 ++.byte 102,15,56,220,249 ++ movups 48(%r11),%xmm1 ++ ++.byte 102,15,56,220,208 ++ pxor %xmm9,%xmm12 + movdqa %xmm11,16(%rsp) +-.byte 102,15,56,220,217 +- pxor %xmm0,%xmm5 ++.byte 102,15,56,220,216 ++ pxor %xmm9,%xmm13 + movdqa %xmm12,32(%rsp) +-.byte 102,15,56,220,225 +- pxor %xmm0,%xmm6 +- movdqa %xmm13,48(%rsp) +-.byte 102,15,56,220,233 +- pxor %xmm0,%xmm7 +- movups (%rcx),%xmm0 +- decl %eax ++.byte 102,15,56,220,224 ++ pxor %xmm9,%xmm14 ++.byte 102,15,56,220,232 ++ pxor %xmm9,%xmm8 + movdqa %xmm14,64(%rsp) +-.byte 102,15,56,220,241 +- movdqa %xmm15,80(%rsp) +-.byte 102,15,56,220,249 +- pxor %xmm14,%xmm14 +- pcmpgtd %xmm15,%xmm14 +- jmp .Lxts_enc_loop6_enter +- +-.align 16 ++.byte 102,15,56,220,240 ++ movdqa %xmm8,80(%rsp) ++.byte 102,15,56,220,248 ++ movups 64(%r11),%xmm0 ++ leaq 64(%r11),%rcx ++ pshufd $95,%xmm15,%xmm9 ++ jmp .Lxts_enc_loop6 ++.align 32 + .Lxts_enc_loop6: + .byte 102,15,56,220,209 + .byte 102,15,56,220,217 +- decl %eax + .byte 102,15,56,220,225 + .byte 102,15,56,220,233 + .byte 102,15,56,220,241 + .byte 102,15,56,220,249 +-.Lxts_enc_loop6_enter: + movups 16(%rcx),%xmm1 ++ leaq 32(%rcx),%rcx ++ + .byte 102,15,56,220,208 + .byte 102,15,56,220,216 +- leaq 32(%rcx),%rcx + .byte 102,15,56,220,224 + .byte 102,15,56,220,232 + .byte 102,15,56,220,240 + .byte 102,15,56,220,248 + movups (%rcx),%xmm0 ++ decl %eax + jnz .Lxts_enc_loop6 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- paddq %xmm15,%xmm15 ++ movdqa (%r8),%xmm8 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,220,209 +- pand %xmm8,%xmm9 ++ paddq %xmm15,%xmm15 ++ psrad $31,%xmm14 + .byte 102,15,56,220,217 +- pcmpgtd %xmm15,%xmm14 ++ pand %xmm8,%xmm14 ++ movups (%r11),%xmm10 + .byte 102,15,56,220,225 +- pxor %xmm9,%xmm15 + .byte 102,15,56,220,233 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,220,241 ++ movaps %xmm10,%xmm11 + .byte 102,15,56,220,249 + movups 16(%rcx),%xmm1 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm10 +- paddq %xmm15,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,220,208 +- pand %xmm8,%xmm9 ++ pxor %xmm15,%xmm10 ++ psrad $31,%xmm14 + .byte 102,15,56,220,216 +- pcmpgtd %xmm15,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 + .byte 102,15,56,220,224 +- pxor %xmm9,%xmm15 + .byte 102,15,56,220,232 ++ pxor %xmm14,%xmm15 + .byte 
102,15,56,220,240 ++ movaps %xmm11,%xmm12 + .byte 102,15,56,220,248 + movups 32(%rcx),%xmm0 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm11 +- paddq %xmm15,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,220,209 +- pand %xmm8,%xmm9 ++ pxor %xmm15,%xmm11 ++ psrad $31,%xmm14 + .byte 102,15,56,220,217 +- pcmpgtd %xmm15,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 + .byte 102,15,56,220,225 +- pxor %xmm9,%xmm15 ++ movdqa %xmm13,48(%rsp) + .byte 102,15,56,220,233 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,220,241 ++ movaps %xmm12,%xmm13 + .byte 102,15,56,220,249 ++ movups 48(%rcx),%xmm1 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm12 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 ++.byte 102,15,56,220,208 ++ pxor %xmm15,%xmm12 ++ psrad $31,%xmm14 ++.byte 102,15,56,220,216 + paddq %xmm15,%xmm15 +-.byte 102,15,56,221,208 +- pand %xmm8,%xmm9 +-.byte 102,15,56,221,216 +- pcmpgtd %xmm15,%xmm14 +-.byte 102,15,56,221,224 +- pxor %xmm9,%xmm15 +-.byte 102,15,56,221,232 +-.byte 102,15,56,221,240 +-.byte 102,15,56,221,248 ++ pand %xmm8,%xmm14 ++.byte 102,15,56,220,224 ++.byte 102,15,56,220,232 ++ pxor %xmm14,%xmm15 ++.byte 102,15,56,220,240 ++ movaps %xmm13,%xmm14 ++.byte 102,15,56,220,248 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm13 ++ movdqa %xmm9,%xmm0 ++ paddd %xmm9,%xmm9 ++.byte 102,15,56,220,209 ++ pxor %xmm15,%xmm13 ++ psrad $31,%xmm0 ++.byte 102,15,56,220,217 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm0 ++.byte 102,15,56,220,225 ++.byte 102,15,56,220,233 ++ pxor %xmm0,%xmm15 ++ movups (%r11),%xmm0 ++.byte 102,15,56,220,241 ++.byte 102,15,56,220,249 ++ movups 16(%r11),%xmm1 ++ ++ pxor %xmm15,%xmm14 ++ psrad $31,%xmm9 ++.byte 102,15,56,221,84,36,0 + paddq %xmm15,%xmm15 +- xorps 0(%rsp),%xmm2 + pand %xmm8,%xmm9 +- xorps 16(%rsp),%xmm3 +- pcmpgtd %xmm15,%xmm14 ++.byte 102,15,56,221,92,36,16 ++.byte 102,15,56,221,100,36,32 + pxor %xmm9,%xmm15 +- +- xorps 32(%rsp),%xmm4 +- movups %xmm2,0(%rsi) +- xorps 48(%rsp),%xmm5 +- movups %xmm3,16(%rsi) +- xorps 64(%rsp),%xmm6 +- movups %xmm4,32(%rsi) +- xorps 80(%rsp),%xmm7 +- movups %xmm5,48(%rsi) ++.byte 102,15,56,221,108,36,48 ++.byte 102,15,56,221,116,36,64 ++.byte 102,15,56,221,124,36,80 + movl %r10d,%eax +- movups %xmm6,64(%rsi) +- movups %xmm7,80(%rsi) ++ + leaq 96(%rsi),%rsi ++ movups %xmm2,-96(%rsi) ++ movups %xmm3,-80(%rsi) ++ movups %xmm4,-64(%rsi) ++ movups %xmm5,-48(%rsi) ++ movups %xmm6,-32(%rsi) ++ movups %xmm7,-16(%rsi) + subq $96,%rdx + jnc .Lxts_enc_grandloop + +- leal 3(%rax,%rax,1),%eax ++ leal 7(%rax,%rax,1),%eax + movq %r11,%rcx + movl %eax,%r10d + + .Lxts_enc_short: ++ pxor %xmm0,%xmm10 + addq $96,%rdx + jz .Lxts_enc_done + ++ pxor %xmm0,%xmm11 + cmpq $32,%rdx + jb .Lxts_enc_one ++ pxor %xmm0,%xmm12 + je .Lxts_enc_two + ++ pxor %xmm0,%xmm13 + cmpq $64,%rdx + jb .Lxts_enc_three ++ pxor %xmm0,%xmm14 + je .Lxts_enc_four + +- pshufd $19,%xmm14,%xmm9 +- movdqa %xmm15,%xmm14 +- paddq %xmm15,%xmm15 + movdqu (%rdi),%xmm2 +- pand %xmm8,%xmm9 + movdqu 16(%rdi),%xmm3 +- pxor %xmm9,%xmm15 +- + movdqu 32(%rdi),%xmm4 + pxor %xmm10,%xmm2 + movdqu 48(%rdi),%xmm5 +@@ -1512,15 +1738,15 @@ aesni_xts_encrypt: + + call _aesni_encrypt4 + +- xorps %xmm10,%xmm2 +- movdqa %xmm15,%xmm10 +- xorps %xmm11,%xmm3 +- xorps %xmm12,%xmm4 +- movups %xmm2,(%rsi) +- xorps %xmm13,%xmm5 +- movups %xmm3,16(%rsi) +- movups %xmm4,32(%rsi) +- movups %xmm5,48(%rsi) ++ pxor %xmm10,%xmm2 ++ movdqa %xmm14,%xmm10 ++ pxor %xmm11,%xmm3 ++ pxor %xmm12,%xmm4 ++ movdqu %xmm2,(%rsi) ++ pxor 
%xmm13,%xmm5 ++ movdqu %xmm3,16(%rsi) ++ movdqu %xmm4,32(%rsi) ++ movdqu %xmm5,48(%rsi) + leaq 64(%rsi),%rsi + jmp .Lxts_enc_done + +@@ -1561,7 +1787,8 @@ aesni_xts_encrypt: + movups %xmm2,-16(%rsi) + + .Lxts_enc_ret: +- leaq 104(%rsp),%rsp ++ leaq (%rbp),%rsp ++ popq %rbp + .Lxts_enc_epilogue: + .byte 0xf3,0xc3 + .size aesni_xts_encrypt,.-aesni_xts_encrypt +@@ -1569,7 +1796,11 @@ aesni_xts_encrypt: + .type aesni_xts_decrypt,@function + .align 16 + aesni_xts_decrypt: +- leaq -104(%rsp),%rsp ++ leaq (%rsp),%rax ++ pushq %rbp ++ subq $112,%rsp ++ andq $-16,%rsp ++ leaq -8(%rax),%rbp + movups (%r9),%xmm15 + movl 240(%r8),%eax + movl 240(%rcx),%r10d +@@ -1590,228 +1821,266 @@ aesni_xts_decrypt: + shlq $4,%rax + subq %rax,%rdx + ++ movups (%rcx),%xmm0 + movq %rcx,%r11 + movl %r10d,%eax ++ shll $4,%r10d + movq %rdx,%r9 + andq $-16,%rdx + ++ movups 16(%rcx,%r10,1),%xmm1 ++ movl %eax,%r10d ++ + movdqa .Lxts_magic(%rip),%xmm8 +- pxor %xmm14,%xmm14 +- pcmpgtd %xmm15,%xmm14 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pshufd $95,%xmm15,%xmm9 ++ pxor %xmm0,%xmm1 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm10 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm10 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm11 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm11 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm12 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm12 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm13 ++ psrad $31,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm13 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm15,%xmm14 ++ psrad $31,%xmm9 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 ++ pxor %xmm0,%xmm14 + pxor %xmm9,%xmm15 ++ movaps %xmm1,96(%rsp) ++ + subq $96,%rdx + jc .Lxts_dec_short + + shrl $1,%eax +- subl $1,%eax ++ subl $3,%eax ++ movups 16(%r11),%xmm1 + movl %eax,%r10d ++ leaq .Lxts_magic(%rip),%r8 + jmp .Lxts_dec_grandloop + +-.align 16 ++.align 32 + .Lxts_dec_grandloop: +- pshufd $19,%xmm14,%xmm9 +- movdqa %xmm15,%xmm14 +- paddq %xmm15,%xmm15 + movdqu 0(%rdi),%xmm2 +- pand %xmm8,%xmm9 ++ movdqa %xmm0,%xmm8 + movdqu 16(%rdi),%xmm3 +- pxor %xmm9,%xmm15 +- +- movdqu 32(%rdi),%xmm4 + pxor %xmm10,%xmm2 +- movdqu 48(%rdi),%xmm5 ++ movdqu 32(%rdi),%xmm4 + pxor %xmm11,%xmm3 +- movdqu 64(%rdi),%xmm6 ++.byte 102,15,56,222,209 ++ movdqu 48(%rdi),%xmm5 + pxor %xmm12,%xmm4 +- movdqu 80(%rdi),%xmm7 +- leaq 96(%rdi),%rdi ++.byte 102,15,56,222,217 ++ movdqu 64(%rdi),%xmm6 + pxor %xmm13,%xmm5 +- movups (%r11),%xmm0 ++.byte 102,15,56,222,225 ++ movdqu 80(%rdi),%xmm7 ++ pxor %xmm15,%xmm8 ++ movdqa 96(%rsp),%xmm9 + pxor %xmm14,%xmm6 +- pxor %xmm15,%xmm7 +- +- ++.byte 102,15,56,222,233 ++ movups 32(%r11),%xmm0 ++ leaq 96(%rdi),%rdi ++ pxor %xmm8,%xmm7 + +- movups 16(%r11),%xmm1 +- pxor %xmm0,%xmm2 +- pxor %xmm0,%xmm3 ++ pxor %xmm9,%xmm10 ++.byte 102,15,56,222,241 ++ pxor %xmm9,%xmm11 + movdqa %xmm10,0(%rsp) +-.byte 102,15,56,222,209 +- leaq 32(%r11),%rcx +- pxor %xmm0,%xmm4 ++.byte 102,15,56,222,249 
++ movups 48(%r11),%xmm1 ++ ++.byte 102,15,56,222,208 ++ pxor %xmm9,%xmm12 + movdqa %xmm11,16(%rsp) +-.byte 102,15,56,222,217 +- pxor %xmm0,%xmm5 ++.byte 102,15,56,222,216 ++ pxor %xmm9,%xmm13 + movdqa %xmm12,32(%rsp) +-.byte 102,15,56,222,225 +- pxor %xmm0,%xmm6 +- movdqa %xmm13,48(%rsp) +-.byte 102,15,56,222,233 +- pxor %xmm0,%xmm7 +- movups (%rcx),%xmm0 +- decl %eax ++.byte 102,15,56,222,224 ++ pxor %xmm9,%xmm14 ++.byte 102,15,56,222,232 ++ pxor %xmm9,%xmm8 + movdqa %xmm14,64(%rsp) +-.byte 102,15,56,222,241 +- movdqa %xmm15,80(%rsp) +-.byte 102,15,56,222,249 +- pxor %xmm14,%xmm14 +- pcmpgtd %xmm15,%xmm14 +- jmp .Lxts_dec_loop6_enter +- +-.align 16 ++.byte 102,15,56,222,240 ++ movdqa %xmm8,80(%rsp) ++.byte 102,15,56,222,248 ++ movups 64(%r11),%xmm0 ++ leaq 64(%r11),%rcx ++ pshufd $95,%xmm15,%xmm9 ++ jmp .Lxts_dec_loop6 ++.align 32 + .Lxts_dec_loop6: + .byte 102,15,56,222,209 + .byte 102,15,56,222,217 +- decl %eax + .byte 102,15,56,222,225 + .byte 102,15,56,222,233 + .byte 102,15,56,222,241 + .byte 102,15,56,222,249 +-.Lxts_dec_loop6_enter: + movups 16(%rcx),%xmm1 ++ leaq 32(%rcx),%rcx ++ + .byte 102,15,56,222,208 + .byte 102,15,56,222,216 +- leaq 32(%rcx),%rcx + .byte 102,15,56,222,224 + .byte 102,15,56,222,232 + .byte 102,15,56,222,240 + .byte 102,15,56,222,248 + movups (%rcx),%xmm0 ++ decl %eax + jnz .Lxts_dec_loop6 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- paddq %xmm15,%xmm15 ++ movdqa (%r8),%xmm8 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,222,209 +- pand %xmm8,%xmm9 ++ paddq %xmm15,%xmm15 ++ psrad $31,%xmm14 + .byte 102,15,56,222,217 +- pcmpgtd %xmm15,%xmm14 ++ pand %xmm8,%xmm14 ++ movups (%r11),%xmm10 + .byte 102,15,56,222,225 +- pxor %xmm9,%xmm15 + .byte 102,15,56,222,233 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,222,241 ++ movaps %xmm10,%xmm11 + .byte 102,15,56,222,249 + movups 16(%rcx),%xmm1 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm10 +- paddq %xmm15,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,222,208 +- pand %xmm8,%xmm9 ++ pxor %xmm15,%xmm10 ++ psrad $31,%xmm14 + .byte 102,15,56,222,216 +- pcmpgtd %xmm15,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 + .byte 102,15,56,222,224 +- pxor %xmm9,%xmm15 + .byte 102,15,56,222,232 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,222,240 ++ movaps %xmm11,%xmm12 + .byte 102,15,56,222,248 + movups 32(%rcx),%xmm0 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm11 +- paddq %xmm15,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,222,209 +- pand %xmm8,%xmm9 ++ pxor %xmm15,%xmm11 ++ psrad $31,%xmm14 + .byte 102,15,56,222,217 +- pcmpgtd %xmm15,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 + .byte 102,15,56,222,225 +- pxor %xmm9,%xmm15 ++ movdqa %xmm13,48(%rsp) + .byte 102,15,56,222,233 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,222,241 ++ movaps %xmm12,%xmm13 + .byte 102,15,56,222,249 ++ movups 48(%rcx),%xmm1 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm12 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 ++.byte 102,15,56,222,208 ++ pxor %xmm15,%xmm12 ++ psrad $31,%xmm14 ++.byte 102,15,56,222,216 + paddq %xmm15,%xmm15 +-.byte 102,15,56,223,208 +- pand %xmm8,%xmm9 +-.byte 102,15,56,223,216 +- pcmpgtd %xmm15,%xmm14 +-.byte 102,15,56,223,224 +- pxor %xmm9,%xmm15 +-.byte 102,15,56,223,232 +-.byte 102,15,56,223,240 +-.byte 102,15,56,223,248 ++ pand %xmm8,%xmm14 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++ pxor %xmm14,%xmm15 ++.byte 102,15,56,222,240 ++ movaps %xmm13,%xmm14 ++.byte 
102,15,56,222,248 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm13 ++ movdqa %xmm9,%xmm0 ++ paddd %xmm9,%xmm9 ++.byte 102,15,56,222,209 ++ pxor %xmm15,%xmm13 ++ psrad $31,%xmm0 ++.byte 102,15,56,222,217 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm0 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++ pxor %xmm0,%xmm15 ++ movups (%r11),%xmm0 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++ movups 16(%r11),%xmm1 ++ ++ pxor %xmm15,%xmm14 ++ psrad $31,%xmm9 ++.byte 102,15,56,223,84,36,0 + paddq %xmm15,%xmm15 +- xorps 0(%rsp),%xmm2 + pand %xmm8,%xmm9 +- xorps 16(%rsp),%xmm3 +- pcmpgtd %xmm15,%xmm14 ++.byte 102,15,56,223,92,36,16 ++.byte 102,15,56,223,100,36,32 + pxor %xmm9,%xmm15 +- +- xorps 32(%rsp),%xmm4 +- movups %xmm2,0(%rsi) +- xorps 48(%rsp),%xmm5 +- movups %xmm3,16(%rsi) +- xorps 64(%rsp),%xmm6 +- movups %xmm4,32(%rsi) +- xorps 80(%rsp),%xmm7 +- movups %xmm5,48(%rsi) ++.byte 102,15,56,223,108,36,48 ++.byte 102,15,56,223,116,36,64 ++.byte 102,15,56,223,124,36,80 + movl %r10d,%eax +- movups %xmm6,64(%rsi) +- movups %xmm7,80(%rsi) ++ + leaq 96(%rsi),%rsi ++ movups %xmm2,-96(%rsi) ++ movups %xmm3,-80(%rsi) ++ movups %xmm4,-64(%rsi) ++ movups %xmm5,-48(%rsi) ++ movups %xmm6,-32(%rsi) ++ movups %xmm7,-16(%rsi) + subq $96,%rdx + jnc .Lxts_dec_grandloop + +- leal 3(%rax,%rax,1),%eax ++ leal 7(%rax,%rax,1),%eax + movq %r11,%rcx + movl %eax,%r10d + + .Lxts_dec_short: ++ pxor %xmm0,%xmm10 ++ pxor %xmm0,%xmm11 + addq $96,%rdx + jz .Lxts_dec_done + ++ pxor %xmm0,%xmm12 + cmpq $32,%rdx + jb .Lxts_dec_one ++ pxor %xmm0,%xmm13 + je .Lxts_dec_two + ++ pxor %xmm0,%xmm14 + cmpq $64,%rdx + jb .Lxts_dec_three + je .Lxts_dec_four + +- pshufd $19,%xmm14,%xmm9 +- movdqa %xmm15,%xmm14 +- paddq %xmm15,%xmm15 + movdqu (%rdi),%xmm2 +- pand %xmm8,%xmm9 + movdqu 16(%rdi),%xmm3 +- pxor %xmm9,%xmm15 +- + movdqu 32(%rdi),%xmm4 + pxor %xmm10,%xmm2 + movdqu 48(%rdi),%xmm5 +@@ -1904,7 +2173,7 @@ aesni_xts_decrypt: + xorps %xmm10,%xmm2 + movdqa %xmm13,%xmm10 + xorps %xmm11,%xmm3 +- movdqa %xmm15,%xmm11 ++ movdqa %xmm14,%xmm11 + xorps %xmm12,%xmm4 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) +@@ -1914,14 +2183,8 @@ aesni_xts_decrypt: + + .align 16 + .Lxts_dec_four: +- pshufd $19,%xmm14,%xmm9 +- movdqa %xmm15,%xmm14 +- paddq %xmm15,%xmm15 + movups (%rdi),%xmm2 +- pand %xmm8,%xmm9 + movups 16(%rdi),%xmm3 +- pxor %xmm9,%xmm15 +- + movups 32(%rdi),%xmm4 + xorps %xmm10,%xmm2 + movups 48(%rdi),%xmm5 +@@ -1932,16 +2195,16 @@ aesni_xts_decrypt: + + call _aesni_decrypt4 + +- xorps %xmm10,%xmm2 ++ pxor %xmm10,%xmm2 + movdqa %xmm14,%xmm10 +- xorps %xmm11,%xmm3 ++ pxor %xmm11,%xmm3 + movdqa %xmm15,%xmm11 +- xorps %xmm12,%xmm4 +- movups %xmm2,(%rsi) +- xorps %xmm13,%xmm5 +- movups %xmm3,16(%rsi) +- movups %xmm4,32(%rsi) +- movups %xmm5,48(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm3,16(%rsi) ++ movdqu %xmm4,32(%rsi) ++ movdqu %xmm5,48(%rsi) + leaq 64(%rsi),%rsi + jmp .Lxts_dec_done + +@@ -2001,7 +2264,8 @@ aesni_xts_decrypt: + movups %xmm2,(%rsi) + + .Lxts_dec_ret: +- leaq 104(%rsp),%rsp ++ leaq (%rbp),%rsp ++ popq %rbp + .Lxts_dec_epilogue: + .byte 0xf3,0xc3 + .size aesni_xts_decrypt,.-aesni_xts_decrypt +@@ -2068,149 +2332,324 @@ aesni_cbc_encrypt: + + .align 16 + .Lcbc_decrypt: +- movups (%r8),%xmm9 ++ leaq (%rsp),%rax ++ pushq %rbp ++ subq $16,%rsp ++ andq $-16,%rsp ++ leaq -8(%rax),%rbp ++ movups (%r8),%xmm10 + movl %r10d,%eax +- cmpq $112,%rdx ++ cmpq $80,%rdx + jbe .Lcbc_dec_tail +- shrl $1,%r10d ++ ++ movups (%rcx),%xmm0 ++ movdqu 0(%rdi),%xmm2 ++ movdqu 
16(%rdi),%xmm3 ++ movdqa %xmm2,%xmm11 ++ movdqu 32(%rdi),%xmm4 ++ movdqa %xmm3,%xmm12 ++ movdqu 48(%rdi),%xmm5 ++ movdqa %xmm4,%xmm13 ++ movdqu 64(%rdi),%xmm6 ++ movdqa %xmm5,%xmm14 ++ movdqu 80(%rdi),%xmm7 ++ movdqa %xmm6,%xmm15 ++ cmpq $112,%rdx ++ jbe .Lcbc_dec_six_or_seven ++ + subq $112,%rdx +- movl %r10d,%eax +- movaps %xmm9,-24(%rsp) ++ leaq 112(%rcx),%rcx + jmp .Lcbc_dec_loop8_enter + .align 16 + .Lcbc_dec_loop8: +- movaps %xmm0,-24(%rsp) + movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi + .Lcbc_dec_loop8_enter: +- movups (%rcx),%xmm0 +- movups (%rdi),%xmm2 +- movups 16(%rdi),%xmm3 +- movups 16(%rcx),%xmm1 ++ movdqu 96(%rdi),%xmm8 ++ pxor %xmm0,%xmm2 ++ movdqu 112(%rdi),%xmm9 ++ pxor %xmm0,%xmm3 ++ movups 16-112(%rcx),%xmm1 ++ pxor %xmm0,%xmm4 ++ xorq %r11,%r11 ++ cmpq $112,%rdx ++ pxor %xmm0,%xmm5 ++ pxor %xmm0,%xmm6 ++ pxor %xmm0,%xmm7 ++ pxor %xmm0,%xmm8 + +- leaq 32(%rcx),%rcx +- movdqu 32(%rdi),%xmm4 +- xorps %xmm0,%xmm2 +- movdqu 48(%rdi),%xmm5 +- xorps %xmm0,%xmm3 +- movdqu 64(%rdi),%xmm6 + .byte 102,15,56,222,209 +- pxor %xmm0,%xmm4 +- movdqu 80(%rdi),%xmm7 ++ pxor %xmm0,%xmm9 ++ movups 32-112(%rcx),%xmm0 + .byte 102,15,56,222,217 +- pxor %xmm0,%xmm5 +- movdqu 96(%rdi),%xmm8 + .byte 102,15,56,222,225 +- pxor %xmm0,%xmm6 +- movdqu 112(%rdi),%xmm9 + .byte 102,15,56,222,233 +- pxor %xmm0,%xmm7 +- decl %eax + .byte 102,15,56,222,241 +- pxor %xmm0,%xmm8 + .byte 102,15,56,222,249 +- pxor %xmm0,%xmm9 +- movups (%rcx),%xmm0 ++ setnc %r11b + .byte 102,68,15,56,222,193 ++ shlq $7,%r11 + .byte 102,68,15,56,222,201 +- movups 16(%rcx),%xmm1 +- +- call .Ldec_loop8_enter ++ addq %rdi,%r11 ++ movups 48-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 64-112(%rcx),%xmm0 ++.byte 102,15,56,222,209 ++.byte 102,15,56,222,217 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movups 80-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 96-112(%rcx),%xmm0 ++.byte 102,15,56,222,209 ++.byte 102,15,56,222,217 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movups 112-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 128-112(%rcx),%xmm0 ++.byte 102,15,56,222,209 ++.byte 102,15,56,222,217 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movups 144-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 160-112(%rcx),%xmm0 ++ cmpl $11,%eax ++ jb .Lcbc_dec_done ++.byte 102,15,56,222,209 ++.byte 102,15,56,222,217 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++.byte 
102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movups 176-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 192-112(%rcx),%xmm0 ++ je .Lcbc_dec_done ++.byte 102,15,56,222,209 ++.byte 102,15,56,222,217 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movups 208-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 224-112(%rcx),%xmm0 ++.Lcbc_dec_done: ++.byte 102,15,56,222,209 ++ pxor %xmm0,%xmm10 ++.byte 102,15,56,222,217 ++ pxor %xmm0,%xmm11 ++.byte 102,15,56,222,225 ++ pxor %xmm0,%xmm12 ++.byte 102,15,56,222,233 ++ pxor %xmm0,%xmm13 ++.byte 102,15,56,222,241 ++ pxor %xmm0,%xmm14 ++.byte 102,15,56,222,249 ++ pxor %xmm0,%xmm15 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movdqu 80(%rdi),%xmm1 ++ ++.byte 102,65,15,56,223,210 ++ movdqu 96(%rdi),%xmm10 ++ pxor %xmm0,%xmm1 ++.byte 102,65,15,56,223,219 ++ pxor %xmm0,%xmm10 ++ movdqu 112(%rdi),%xmm0 ++ leaq 128(%rdi),%rdi ++.byte 102,65,15,56,223,228 ++ movdqu 0(%r11),%xmm11 ++.byte 102,65,15,56,223,237 ++ movdqu 16(%r11),%xmm12 ++.byte 102,65,15,56,223,246 ++ movdqu 32(%r11),%xmm13 ++.byte 102,65,15,56,223,255 ++ movdqu 48(%r11),%xmm14 ++.byte 102,68,15,56,223,193 ++ movdqu 64(%r11),%xmm15 ++.byte 102,69,15,56,223,202 ++ movdqa %xmm0,%xmm10 ++ movdqu 80(%r11),%xmm1 ++ movups -112(%rcx),%xmm0 + +- movups (%rdi),%xmm1 +- movups 16(%rdi),%xmm0 +- xorps -24(%rsp),%xmm2 +- xorps %xmm1,%xmm3 +- movups 32(%rdi),%xmm1 +- xorps %xmm0,%xmm4 +- movups 48(%rdi),%xmm0 +- xorps %xmm1,%xmm5 +- movups 64(%rdi),%xmm1 +- xorps %xmm0,%xmm6 +- movups 80(%rdi),%xmm0 +- xorps %xmm1,%xmm7 +- movups 96(%rdi),%xmm1 +- xorps %xmm0,%xmm8 +- movups 112(%rdi),%xmm0 +- xorps %xmm1,%xmm9 + movups %xmm2,(%rsi) ++ movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) ++ movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) ++ movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) +- movl %r10d,%eax ++ movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) +- movq %r11,%rcx ++ movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) +- leaq 128(%rdi),%rdi ++ movdqa %xmm1,%xmm7 + movups %xmm8,96(%rsi) + leaq 112(%rsi),%rsi ++ + subq $128,%rdx + ja .Lcbc_dec_loop8 + + movaps %xmm9,%xmm2 +- movaps %xmm0,%xmm9 ++ leaq -112(%rcx),%rcx + addq $112,%rdx + jle .Lcbc_dec_tail_collected +- movups %xmm2,(%rsi) +- leal 1(%r10,%r10,1),%eax ++ movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi ++ cmpq $80,%rdx ++ jbe .Lcbc_dec_tail ++ ++ movaps %xmm11,%xmm2 ++.Lcbc_dec_six_or_seven: ++ cmpq $96,%rdx ++ ja .Lcbc_dec_seven ++ ++ movaps %xmm7,%xmm8 ++ call _aesni_decrypt6 ++ pxor %xmm10,%xmm2 ++ movaps %xmm8,%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm4,32(%rsi) ++ pxor %xmm14,%xmm6 ++ movdqu %xmm5,48(%rsi) ++ pxor %xmm15,%xmm7 ++ movdqu %xmm6,64(%rsi) ++ leaq 80(%rsi),%rsi ++ movdqa %xmm7,%xmm2 ++ jmp .Lcbc_dec_tail_collected ++ ++.align 16 ++.Lcbc_dec_seven: ++ movups 96(%rdi),%xmm8 ++ xorps %xmm9,%xmm9 ++ call _aesni_decrypt8 ++ movups 80(%rdi),%xmm9 ++ pxor %xmm10,%xmm2 ++ movups 96(%rdi),%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ 
movdqu %xmm3,16(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm4,32(%rsi) ++ pxor %xmm14,%xmm6 ++ movdqu %xmm5,48(%rsi) ++ pxor %xmm15,%xmm7 ++ movdqu %xmm6,64(%rsi) ++ pxor %xmm9,%xmm8 ++ movdqu %xmm7,80(%rsi) ++ leaq 96(%rsi),%rsi ++ movdqa %xmm8,%xmm2 ++ jmp .Lcbc_dec_tail_collected ++ + .Lcbc_dec_tail: + movups (%rdi),%xmm2 +- movaps %xmm2,%xmm8 +- cmpq $16,%rdx ++ subq $16,%rdx + jbe .Lcbc_dec_one + + movups 16(%rdi),%xmm3 +- movaps %xmm3,%xmm7 +- cmpq $32,%rdx ++ movaps %xmm2,%xmm11 ++ subq $16,%rdx + jbe .Lcbc_dec_two + + movups 32(%rdi),%xmm4 +- movaps %xmm4,%xmm6 +- cmpq $48,%rdx ++ movaps %xmm3,%xmm12 ++ subq $16,%rdx + jbe .Lcbc_dec_three + + movups 48(%rdi),%xmm5 +- cmpq $64,%rdx ++ movaps %xmm4,%xmm13 ++ subq $16,%rdx + jbe .Lcbc_dec_four + + movups 64(%rdi),%xmm6 +- cmpq $80,%rdx +- jbe .Lcbc_dec_five +- +- movups 80(%rdi),%xmm7 +- cmpq $96,%rdx +- jbe .Lcbc_dec_six +- +- movups 96(%rdi),%xmm8 +- movaps %xmm9,-24(%rsp) +- call _aesni_decrypt8 +- movups (%rdi),%xmm1 +- movups 16(%rdi),%xmm0 +- xorps -24(%rsp),%xmm2 +- xorps %xmm1,%xmm3 +- movups 32(%rdi),%xmm1 +- xorps %xmm0,%xmm4 +- movups 48(%rdi),%xmm0 +- xorps %xmm1,%xmm5 +- movups 64(%rdi),%xmm1 +- xorps %xmm0,%xmm6 +- movups 80(%rdi),%xmm0 +- xorps %xmm1,%xmm7 +- movups 96(%rdi),%xmm9 +- xorps %xmm0,%xmm8 +- movups %xmm2,(%rsi) +- movups %xmm3,16(%rsi) +- movups %xmm4,32(%rsi) +- movups %xmm5,48(%rsi) +- movups %xmm6,64(%rsi) +- movups %xmm7,80(%rsi) +- leaq 96(%rsi),%rsi +- movaps %xmm8,%xmm2 +- subq $112,%rdx ++ movaps %xmm5,%xmm14 ++ movaps %xmm6,%xmm15 ++ xorps %xmm7,%xmm7 ++ call _aesni_decrypt6 ++ pxor %xmm10,%xmm2 ++ movaps %xmm15,%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm4,32(%rsi) ++ pxor %xmm14,%xmm6 ++ movdqu %xmm5,48(%rsi) ++ leaq 64(%rsi),%rsi ++ movdqa %xmm6,%xmm2 ++ subq $16,%rdx + jmp .Lcbc_dec_tail_collected ++ + .align 16 + .Lcbc_dec_one: ++ movaps %xmm2,%xmm11 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx +@@ -2222,111 +2661,69 @@ aesni_cbc_encrypt: + leaq 16(%rcx),%rcx + jnz .Loop_dec1_16 + .byte 102,15,56,223,209 +- xorps %xmm9,%xmm2 +- movaps %xmm8,%xmm9 +- subq $16,%rdx ++ xorps %xmm10,%xmm2 ++ movaps %xmm11,%xmm10 + jmp .Lcbc_dec_tail_collected + .align 16 + .Lcbc_dec_two: ++ movaps %xmm3,%xmm12 + xorps %xmm4,%xmm4 + call _aesni_decrypt3 +- xorps %xmm9,%xmm2 +- xorps %xmm8,%xmm3 +- movups %xmm2,(%rsi) +- movaps %xmm7,%xmm9 +- movaps %xmm3,%xmm2 ++ pxor %xmm10,%xmm2 ++ movaps %xmm12,%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ movdqa %xmm3,%xmm2 + leaq 16(%rsi),%rsi +- subq $32,%rdx + jmp .Lcbc_dec_tail_collected + .align 16 + .Lcbc_dec_three: ++ movaps %xmm4,%xmm13 + call _aesni_decrypt3 +- xorps %xmm9,%xmm2 +- xorps %xmm8,%xmm3 +- movups %xmm2,(%rsi) +- xorps %xmm7,%xmm4 +- movups %xmm3,16(%rsi) +- movaps %xmm6,%xmm9 +- movaps %xmm4,%xmm2 ++ pxor %xmm10,%xmm2 ++ movaps %xmm13,%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ movdqa %xmm4,%xmm2 + leaq 32(%rsi),%rsi +- subq $48,%rdx + jmp .Lcbc_dec_tail_collected + .align 16 + .Lcbc_dec_four: ++ movaps %xmm5,%xmm14 + call _aesni_decrypt4 +- xorps %xmm9,%xmm2 +- movups 48(%rdi),%xmm9 +- xorps %xmm8,%xmm3 +- movups %xmm2,(%rsi) +- xorps %xmm7,%xmm4 +- movups %xmm3,16(%rsi) +- xorps %xmm6,%xmm5 +- movups %xmm4,32(%rsi) +- movaps %xmm5,%xmm2 ++ pxor %xmm10,%xmm2 ++ movaps %xmm14,%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ pxor 
%xmm13,%xmm5 ++ movdqu %xmm4,32(%rsi) ++ movdqa %xmm5,%xmm2 + leaq 48(%rsi),%rsi +- subq $64,%rdx +- jmp .Lcbc_dec_tail_collected +-.align 16 +-.Lcbc_dec_five: +- xorps %xmm7,%xmm7 +- call _aesni_decrypt6 +- movups 16(%rdi),%xmm1 +- movups 32(%rdi),%xmm0 +- xorps %xmm9,%xmm2 +- xorps %xmm8,%xmm3 +- xorps %xmm1,%xmm4 +- movups 48(%rdi),%xmm1 +- xorps %xmm0,%xmm5 +- movups 64(%rdi),%xmm9 +- xorps %xmm1,%xmm6 +- movups %xmm2,(%rsi) +- movups %xmm3,16(%rsi) +- movups %xmm4,32(%rsi) +- movups %xmm5,48(%rsi) +- leaq 64(%rsi),%rsi +- movaps %xmm6,%xmm2 +- subq $80,%rdx +- jmp .Lcbc_dec_tail_collected +-.align 16 +-.Lcbc_dec_six: +- call _aesni_decrypt6 +- movups 16(%rdi),%xmm1 +- movups 32(%rdi),%xmm0 +- xorps %xmm9,%xmm2 +- xorps %xmm8,%xmm3 +- xorps %xmm1,%xmm4 +- movups 48(%rdi),%xmm1 +- xorps %xmm0,%xmm5 +- movups 64(%rdi),%xmm0 +- xorps %xmm1,%xmm6 +- movups 80(%rdi),%xmm9 +- xorps %xmm0,%xmm7 +- movups %xmm2,(%rsi) +- movups %xmm3,16(%rsi) +- movups %xmm4,32(%rsi) +- movups %xmm5,48(%rsi) +- movups %xmm6,64(%rsi) +- leaq 80(%rsi),%rsi +- movaps %xmm7,%xmm2 +- subq $96,%rdx + jmp .Lcbc_dec_tail_collected ++ + .align 16 + .Lcbc_dec_tail_collected: ++ movups %xmm10,(%r8) + andq $15,%rdx +- movups %xmm9,(%r8) + jnz .Lcbc_dec_tail_partial + movups %xmm2,(%rsi) + jmp .Lcbc_dec_ret + .align 16 + .Lcbc_dec_tail_partial: +- movaps %xmm2,-24(%rsp) ++ movaps %xmm2,(%rsp) + movq $16,%rcx + movq %rsi,%rdi + subq %rdx,%rcx +- leaq -24(%rsp),%rsi ++ leaq (%rsp),%rsi + .long 0x9066A4F3 + + .Lcbc_dec_ret: ++ leaq (%rbp),%rsp ++ popq %rbp + .Lcbc_ret: + .byte 0xf3,0xc3 + .size aesni_cbc_encrypt,.-aesni_cbc_encrypt +@@ -2569,6 +2966,8 @@ __aesni_set_encrypt_key: + .long 1,0,0,0 + .Lxts_magic: + .long 0x87,0,1,0 ++.Lincrement1: ++.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 + + .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + .align 64 +diff --git a/lib/accelerated/x86/elf/padlock-x86-64.s b/lib/accelerated/x86/elf/padlock-x86-64.s +index 4709ac2..2ac113d 100644 +--- a/lib/accelerated/x86/elf/padlock-x86-64.s ++++ b/lib/accelerated/x86/elf/padlock-x86-64.s +@@ -595,6 +595,468 @@ padlock_cbc_encrypt: + popq %rbp + .byte 0xf3,0xc3 + .size padlock_cbc_encrypt,.-padlock_cbc_encrypt ++.globl padlock_cfb_encrypt ++.type padlock_cfb_encrypt,@function ++.align 16 ++padlock_cfb_encrypt: ++ pushq %rbp ++ pushq %rbx ++ ++ xorl %eax,%eax ++ testq $15,%rdx ++ jnz .Lcfb_abort ++ testq $15,%rcx ++ jnz .Lcfb_abort ++ leaq .Lpadlock_saved_context(%rip),%rax ++ pushf ++ cld ++ call _padlock_verify_ctx ++ leaq 16(%rdx),%rdx ++ xorl %eax,%eax ++ xorl %ebx,%ebx ++ testl $32,(%rdx) ++ jnz .Lcfb_aligned ++ testq $15,%rdi ++ setz %al ++ testq $15,%rsi ++ setz %bl ++ testl %ebx,%eax ++ jnz .Lcfb_aligned ++ negq %rax ++ movq $512,%rbx ++ notq %rax ++ leaq (%rsp),%rbp ++ cmpq %rbx,%rcx ++ cmovcq %rcx,%rbx ++ andq %rbx,%rax ++ movq %rcx,%rbx ++ negq %rax ++ andq $512-1,%rbx ++ leaq (%rax,%rbp,1),%rsp ++ movq $512,%rax ++ cmovzq %rax,%rbx ++ jmp .Lcfb_loop ++.align 16 ++.Lcfb_loop: ++ cmpq %rcx,%rbx ++ cmovaq %rcx,%rbx ++ movq %rdi,%r8 ++ movq %rsi,%r9 ++ movq %rcx,%r10 ++ movq %rbx,%rcx ++ movq %rbx,%r11 ++ testq $15,%rdi ++ cmovnzq %rsp,%rdi ++ testq $15,%rsi ++ jz .Lcfb_inp_aligned ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++ movq %rbx,%rcx ++ movq %rdi,%rsi ++.Lcfb_inp_aligned: ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,224 ++ movdqa 
(%rax),%xmm0 ++ movdqa %xmm0,-16(%rdx) ++ movq %r8,%rdi ++ movq %r11,%rbx ++ testq $15,%rdi ++ jz .Lcfb_out_aligned ++ movq %rbx,%rcx ++ leaq (%rsp),%rsi ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++.Lcfb_out_aligned: ++ movq %r9,%rsi ++ movq %r10,%rcx ++ addq %rbx,%rdi ++ addq %rbx,%rsi ++ subq %rbx,%rcx ++ movq $512,%rbx ++ jnz .Lcfb_loop ++ cmpq %rbp,%rsp ++ je .Lcfb_done ++ ++ pxor %xmm0,%xmm0 ++ leaq (%rsp),%rax ++.Lcfb_bzero: ++ movaps %xmm0,(%rax) ++ leaq 16(%rax),%rax ++ cmpq %rax,%rbp ++ ja .Lcfb_bzero ++ ++.Lcfb_done: ++ leaq (%rbp),%rsp ++ jmp .Lcfb_exit ++ ++.align 16 ++.Lcfb_aligned: ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,224 ++ movdqa (%rax),%xmm0 ++ movdqa %xmm0,-16(%rdx) ++.Lcfb_exit: ++ movl $1,%eax ++ leaq 8(%rsp),%rsp ++.Lcfb_abort: ++ popq %rbx ++ popq %rbp ++ .byte 0xf3,0xc3 ++.size padlock_cfb_encrypt,.-padlock_cfb_encrypt ++.globl padlock_ofb_encrypt ++.type padlock_ofb_encrypt,@function ++.align 16 ++padlock_ofb_encrypt: ++ pushq %rbp ++ pushq %rbx ++ ++ xorl %eax,%eax ++ testq $15,%rdx ++ jnz .Lofb_abort ++ testq $15,%rcx ++ jnz .Lofb_abort ++ leaq .Lpadlock_saved_context(%rip),%rax ++ pushf ++ cld ++ call _padlock_verify_ctx ++ leaq 16(%rdx),%rdx ++ xorl %eax,%eax ++ xorl %ebx,%ebx ++ testl $32,(%rdx) ++ jnz .Lofb_aligned ++ testq $15,%rdi ++ setz %al ++ testq $15,%rsi ++ setz %bl ++ testl %ebx,%eax ++ jnz .Lofb_aligned ++ negq %rax ++ movq $512,%rbx ++ notq %rax ++ leaq (%rsp),%rbp ++ cmpq %rbx,%rcx ++ cmovcq %rcx,%rbx ++ andq %rbx,%rax ++ movq %rcx,%rbx ++ negq %rax ++ andq $512-1,%rbx ++ leaq (%rax,%rbp,1),%rsp ++ movq $512,%rax ++ cmovzq %rax,%rbx ++ jmp .Lofb_loop ++.align 16 ++.Lofb_loop: ++ cmpq %rcx,%rbx ++ cmovaq %rcx,%rbx ++ movq %rdi,%r8 ++ movq %rsi,%r9 ++ movq %rcx,%r10 ++ movq %rbx,%rcx ++ movq %rbx,%r11 ++ testq $15,%rdi ++ cmovnzq %rsp,%rdi ++ testq $15,%rsi ++ jz .Lofb_inp_aligned ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++ movq %rbx,%rcx ++ movq %rdi,%rsi ++.Lofb_inp_aligned: ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,232 ++ movdqa (%rax),%xmm0 ++ movdqa %xmm0,-16(%rdx) ++ movq %r8,%rdi ++ movq %r11,%rbx ++ testq $15,%rdi ++ jz .Lofb_out_aligned ++ movq %rbx,%rcx ++ leaq (%rsp),%rsi ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++.Lofb_out_aligned: ++ movq %r9,%rsi ++ movq %r10,%rcx ++ addq %rbx,%rdi ++ addq %rbx,%rsi ++ subq %rbx,%rcx ++ movq $512,%rbx ++ jnz .Lofb_loop ++ cmpq %rbp,%rsp ++ je .Lofb_done ++ ++ pxor %xmm0,%xmm0 ++ leaq (%rsp),%rax ++.Lofb_bzero: ++ movaps %xmm0,(%rax) ++ leaq 16(%rax),%rax ++ cmpq %rax,%rbp ++ ja .Lofb_bzero ++ ++.Lofb_done: ++ leaq (%rbp),%rsp ++ jmp .Lofb_exit ++ ++.align 16 ++.Lofb_aligned: ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,232 ++ movdqa (%rax),%xmm0 ++ movdqa %xmm0,-16(%rdx) ++.Lofb_exit: ++ movl $1,%eax ++ leaq 8(%rsp),%rsp ++.Lofb_abort: ++ popq %rbx ++ popq %rbp ++ .byte 0xf3,0xc3 ++.size padlock_ofb_encrypt,.-padlock_ofb_encrypt ++.globl padlock_ctr32_encrypt ++.type padlock_ctr32_encrypt,@function ++.align 16 ++padlock_ctr32_encrypt: ++ pushq %rbp ++ pushq %rbx ++ ++ xorl %eax,%eax ++ testq $15,%rdx ++ jnz .Lctr32_abort ++ testq $15,%rcx ++ jnz .Lctr32_abort ++ leaq .Lpadlock_saved_context(%rip),%rax ++ pushf ++ cld ++ call _padlock_verify_ctx ++ leaq 16(%rdx),%rdx ++ xorl %eax,%eax ++ xorl %ebx,%ebx ++ testl $32,(%rdx) ++ jnz .Lctr32_aligned ++ testq $15,%rdi ++ setz %al ++ testq $15,%rsi ++ setz %bl ++ testl %ebx,%eax ++ jnz 
.Lctr32_aligned ++ negq %rax ++ movq $512,%rbx ++ notq %rax ++ leaq (%rsp),%rbp ++ cmpq %rbx,%rcx ++ cmovcq %rcx,%rbx ++ andq %rbx,%rax ++ movq %rcx,%rbx ++ negq %rax ++ andq $512-1,%rbx ++ leaq (%rax,%rbp,1),%rsp ++ movq $512,%rax ++ cmovzq %rax,%rbx ++.Lctr32_reenter: ++ movl -4(%rdx),%eax ++ bswapl %eax ++ negl %eax ++ andl $31,%eax ++ movq $512,%rbx ++ shll $4,%eax ++ cmovzq %rbx,%rax ++ cmpq %rax,%rcx ++ cmovaq %rax,%rbx ++ cmovbeq %rcx,%rbx ++ cmpq %rbx,%rcx ++ ja .Lctr32_loop ++ movq %rsi,%rax ++ cmpq %rsp,%rbp ++ cmoveq %rdi,%rax ++ addq %rcx,%rax ++ negq %rax ++ andq $4095,%rax ++ cmpq $32,%rax ++ movq $-32,%rax ++ cmovaeq %rbx,%rax ++ andq %rax,%rbx ++ jz .Lctr32_unaligned_tail ++ jmp .Lctr32_loop ++.align 16 ++.Lctr32_loop: ++ cmpq %rcx,%rbx ++ cmovaq %rcx,%rbx ++ movq %rdi,%r8 ++ movq %rsi,%r9 ++ movq %rcx,%r10 ++ movq %rbx,%rcx ++ movq %rbx,%r11 ++ testq $15,%rdi ++ cmovnzq %rsp,%rdi ++ testq $15,%rsi ++ jz .Lctr32_inp_aligned ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++ movq %rbx,%rcx ++ movq %rdi,%rsi ++.Lctr32_inp_aligned: ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,216 ++ movl -4(%rdx),%eax ++ testl $4294901760,%eax ++ jnz .Lctr32_no_carry ++ bswapl %eax ++ addl $65536,%eax ++ bswapl %eax ++ movl %eax,-4(%rdx) ++.Lctr32_no_carry: ++ movq %r8,%rdi ++ movq %r11,%rbx ++ testq $15,%rdi ++ jz .Lctr32_out_aligned ++ movq %rbx,%rcx ++ leaq (%rsp),%rsi ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++.Lctr32_out_aligned: ++ movq %r9,%rsi ++ movq %r10,%rcx ++ addq %rbx,%rdi ++ addq %rbx,%rsi ++ subq %rbx,%rcx ++ movq $512,%rbx ++ jz .Lctr32_break ++ cmpq %rbx,%rcx ++ jae .Lctr32_loop ++ movq %rcx,%rbx ++ movq %rsi,%rax ++ cmpq %rsp,%rbp ++ cmoveq %rdi,%rax ++ addq %rcx,%rax ++ negq %rax ++ andq $4095,%rax ++ cmpq $32,%rax ++ movq $-32,%rax ++ cmovaeq %rbx,%rax ++ andq %rax,%rbx ++ jnz .Lctr32_loop ++.Lctr32_unaligned_tail: ++ xorl %eax,%eax ++ cmpq %rsp,%rbp ++ cmoveq %rcx,%rax ++ movq %rdi,%r8 ++ movq %rcx,%rbx ++ subq %rax,%rsp ++ shrq $3,%rcx ++ leaq (%rsp),%rdi ++.byte 0xf3,0x48,0xa5 ++ movq %rsp,%rsi ++ movq %r8,%rdi ++ movq %rbx,%rcx ++ jmp .Lctr32_loop ++.align 16 ++.Lctr32_break: ++ cmpq %rbp,%rsp ++ je .Lctr32_done ++ ++ pxor %xmm0,%xmm0 ++ leaq (%rsp),%rax ++.Lctr32_bzero: ++ movaps %xmm0,(%rax) ++ leaq 16(%rax),%rax ++ cmpq %rax,%rbp ++ ja .Lctr32_bzero ++ ++.Lctr32_done: ++ leaq (%rbp),%rsp ++ jmp .Lctr32_exit ++ ++.align 16 ++.Lctr32_aligned: ++ movl -4(%rdx),%eax ++ bswapl %eax ++ negl %eax ++ andl $65535,%eax ++ movq $1048576,%rbx ++ shll $4,%eax ++ cmovzq %rbx,%rax ++ cmpq %rax,%rcx ++ cmovaq %rax,%rbx ++ cmovbeq %rcx,%rbx ++ jbe .Lctr32_aligned_skip ++ ++.Lctr32_aligned_loop: ++ movq %rcx,%r10 ++ movq %rbx,%rcx ++ movq %rbx,%r11 ++ ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,216 ++ ++ movl -4(%rdx),%eax ++ bswapl %eax ++ addl $65536,%eax ++ bswapl %eax ++ movl %eax,-4(%rdx) ++ ++ movq %r10,%rcx ++ subq %r11,%rcx ++ movq $1048576,%rbx ++ jz .Lctr32_exit ++ cmpq %rbx,%rcx ++ jae .Lctr32_aligned_loop ++ ++.Lctr32_aligned_skip: ++ leaq (%rsi,%rcx,1),%rbp ++ negq %rbp ++ andq $4095,%rbp ++ xorl %eax,%eax ++ cmpq $32,%rbp ++ movq $32-1,%rbp ++ cmovaeq %rax,%rbp ++ andq %rcx,%rbp ++ subq %rbp,%rcx ++ jz .Lctr32_aligned_tail ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,216 ++ testq %rbp,%rbp ++ jz .Lctr32_exit ++ ++.Lctr32_aligned_tail: ++ movq %rdi,%r8 ++ movq %rbp,%rbx ++ movq %rbp,%rcx ++ leaq (%rsp),%rbp ++ subq %rcx,%rsp ++ 
shrq $3,%rcx ++ leaq (%rsp),%rdi ++.byte 0xf3,0x48,0xa5 ++ leaq (%r8),%rdi ++ leaq (%rsp),%rsi ++ movq %rbx,%rcx ++ jmp .Lctr32_loop ++.Lctr32_exit: ++ movl $1,%eax ++ leaq 8(%rsp),%rsp ++.Lctr32_abort: ++ popq %rbx ++ popq %rbp ++ .byte 0xf3,0xc3 ++.size padlock_ctr32_encrypt,.-padlock_ctr32_encrypt + .byte 86,73,65,32,80,97,100,108,111,99,107,32,120,56,54,95,54,52,32,109,111,100,117,108,101,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + .align 16 + .data +diff --git a/lib/accelerated/x86/elf/padlock-x86.s b/lib/accelerated/x86/elf/padlock-x86.s +index ea982ec..2199255 100644 +--- a/lib/accelerated/x86/elf/padlock-x86.s ++++ b/lib/accelerated/x86/elf/padlock-x86.s +@@ -187,16 +187,14 @@ padlock_ecb_encrypt: + leal 16(%edx),%edx + xorl %eax,%eax + xorl %ebx,%ebx +- cmpl $128,%ecx +- jbe .L006ecb_short + testl $32,(%edx) +- jnz .L007ecb_aligned ++ jnz .L006ecb_aligned + testl $15,%edi + setz %al + testl $15,%esi + setz %bl + testl %ebx,%eax +- jnz .L007ecb_aligned ++ jnz .L006ecb_aligned + negl %eax + movl $512,%ebx + notl %eax +@@ -208,10 +206,28 @@ padlock_ecb_encrypt: + negl %eax + andl $511,%ebx + leal (%eax,%ebp,1),%esp ++ movl $512,%eax ++ cmovzl %eax,%ebx ++ movl %ebp,%eax ++ andl $-16,%ebp + andl $-16,%esp +- jmp .L008ecb_loop ++ movl %eax,16(%ebp) ++ cmpl %ebx,%ecx ++ ja .L007ecb_loop ++ movl %esi,%eax ++ cmpl %esp,%ebp ++ cmovel %edi,%eax ++ addl %ecx,%eax ++ negl %eax ++ andl $4095,%eax ++ cmpl $128,%eax ++ movl $-128,%eax ++ cmovael %ebx,%eax ++ andl %eax,%ebx ++ jz .L008ecb_unaligned_tail ++ jmp .L007ecb_loop + .align 16 +-.L008ecb_loop: ++.L007ecb_loop: + movl %edi,(%ebp) + movl %esi,4(%ebp) + movl %ecx,8(%ebp) +@@ -236,8 +252,8 @@ padlock_ecb_encrypt: + testl $15,%edi + jz .L010ecb_out_aligned + movl %ebx,%ecx +- shrl $2,%ecx + leal (%esp),%esi ++ shrl $2,%ecx + .byte 243,165 + subl %ebx,%edi + .L010ecb_out_aligned: +@@ -247,43 +263,75 @@ padlock_ecb_encrypt: + addl %ebx,%esi + subl %ebx,%ecx + movl $512,%ebx +- jnz .L008ecb_loop ++ jz .L011ecb_break ++ cmpl %ebx,%ecx ++ jae .L007ecb_loop ++.L008ecb_unaligned_tail: ++ xorl %eax,%eax + cmpl %ebp,%esp +- je .L011ecb_done ++ cmovel %ecx,%eax ++ subl %eax,%esp ++ movl %edi,%eax ++ movl %ecx,%ebx ++ shrl $2,%ecx ++ leal (%esp),%edi ++.byte 243,165 ++ movl %esp,%esi ++ movl %eax,%edi ++ movl %ebx,%ecx ++ jmp .L007ecb_loop ++.align 16 ++.L011ecb_break: ++ cmpl %ebp,%esp ++ je .L012ecb_done + pxor %xmm0,%xmm0 + leal (%esp),%eax +-.L012ecb_bzero: ++.L013ecb_bzero: + movaps %xmm0,(%eax) + leal 16(%eax),%eax + cmpl %eax,%ebp +- ja .L012ecb_bzero +-.L011ecb_done: ++ ja .L013ecb_bzero ++.L012ecb_done: ++ movl 16(%ebp),%ebp + leal 24(%ebp),%esp +- jmp .L013ecb_exit ++ jmp .L014ecb_exit + .align 16 +-.L006ecb_short: ++.L006ecb_aligned: ++ leal (%esi,%ecx,1),%ebp ++ negl %ebp ++ andl $4095,%ebp + xorl %eax,%eax +- leal -24(%esp),%ebp +- subl %ecx,%eax +- leal (%eax,%ebp,1),%esp +- andl $-16,%esp +- xorl %ebx,%ebx +-.L014ecb_short_copy: +- movups (%esi,%ebx,1),%xmm0 +- leal 16(%ebx),%ebx +- cmpl %ebx,%ecx +- movaps %xmm0,-16(%esp,%ebx,1) +- ja .L014ecb_short_copy +- movl %esp,%esi +- movl %ecx,%ebx +- jmp .L008ecb_loop +-.align 16 +-.L007ecb_aligned: ++ cmpl $128,%ebp ++ movl $127,%ebp ++ cmovael %eax,%ebp ++ andl %ecx,%ebp ++ subl %ebp,%ecx ++ jz .L015ecb_aligned_tail + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx + .byte 243,15,167,200 +-.L013ecb_exit: ++ testl %ebp,%ebp ++ jz .L014ecb_exit ++.L015ecb_aligned_tail: ++ movl %ebp,%ecx ++ leal 
-24(%esp),%ebp ++ movl %ebp,%esp ++ movl %ebp,%eax ++ subl %ecx,%esp ++ andl $-16,%ebp ++ andl $-16,%esp ++ movl %eax,16(%ebp) ++ movl %edi,%eax ++ movl %ecx,%ebx ++ shrl $2,%ecx ++ leal (%esp),%edi ++.byte 243,165 ++ movl %esp,%esi ++ movl %eax,%edi ++ movl %ebx,%ecx ++ jmp .L007ecb_loop ++.L014ecb_exit: + movl $1,%eax + leal 4(%esp),%esp + .L004ecb_abort: +@@ -307,19 +355,17 @@ padlock_cbc_encrypt: + movl 28(%esp),%edx + movl 32(%esp),%ecx + testl $15,%edx +- jnz .L015cbc_abort ++ jnz .L016cbc_abort + testl $15,%ecx +- jnz .L015cbc_abort +- leal .Lpadlock_saved_context-.L016cbc_pic_point,%eax ++ jnz .L016cbc_abort ++ leal .Lpadlock_saved_context-.L017cbc_pic_point,%eax + pushfl + cld + call _padlock_verify_ctx +-.L016cbc_pic_point: ++.L017cbc_pic_point: + leal 16(%edx),%edx + xorl %eax,%eax + xorl %ebx,%ebx +- cmpl $64,%ecx +- jbe .L017cbc_short + testl $32,(%edx) + jnz .L018cbc_aligned + testl $15,%edi +@@ -339,7 +385,25 @@ padlock_cbc_encrypt: + negl %eax + andl $511,%ebx + leal (%eax,%ebp,1),%esp ++ movl $512,%eax ++ cmovzl %eax,%ebx ++ movl %ebp,%eax ++ andl $-16,%ebp + andl $-16,%esp ++ movl %eax,16(%ebp) ++ cmpl %ebx,%ecx ++ ja .L019cbc_loop ++ movl %esi,%eax ++ cmpl %esp,%ebp ++ cmovel %edi,%eax ++ addl %ecx,%eax ++ negl %eax ++ andl $4095,%eax ++ cmpl $64,%eax ++ movl $-64,%eax ++ cmovael %ebx,%eax ++ andl %eax,%ebx ++ jz .L020cbc_unaligned_tail + jmp .L019cbc_loop + .align 16 + .L019cbc_loop: +@@ -351,13 +415,13 @@ padlock_cbc_encrypt: + testl $15,%edi + cmovnzl %esp,%edi + testl $15,%esi +- jz .L020cbc_inp_aligned ++ jz .L021cbc_inp_aligned + shrl $2,%ecx + .byte 243,165 + subl %ebx,%edi + movl %ebx,%ecx + movl %edi,%esi +-.L020cbc_inp_aligned: ++.L021cbc_inp_aligned: + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +@@ -367,67 +431,450 @@ padlock_cbc_encrypt: + movl (%ebp),%edi + movl 12(%ebp),%ebx + testl $15,%edi +- jz .L021cbc_out_aligned ++ jz .L022cbc_out_aligned + movl %ebx,%ecx +- shrl $2,%ecx + leal (%esp),%esi ++ shrl $2,%ecx + .byte 243,165 + subl %ebx,%edi +-.L021cbc_out_aligned: ++.L022cbc_out_aligned: + movl 4(%ebp),%esi + movl 8(%ebp),%ecx + addl %ebx,%edi + addl %ebx,%esi + subl %ebx,%ecx + movl $512,%ebx +- jnz .L019cbc_loop ++ jz .L023cbc_break ++ cmpl %ebx,%ecx ++ jae .L019cbc_loop ++.L020cbc_unaligned_tail: ++ xorl %eax,%eax + cmpl %ebp,%esp +- je .L022cbc_done ++ cmovel %ecx,%eax ++ subl %eax,%esp ++ movl %edi,%eax ++ movl %ecx,%ebx ++ shrl $2,%ecx ++ leal (%esp),%edi ++.byte 243,165 ++ movl %esp,%esi ++ movl %eax,%edi ++ movl %ebx,%ecx ++ jmp .L019cbc_loop ++.align 16 ++.L023cbc_break: ++ cmpl %ebp,%esp ++ je .L024cbc_done + pxor %xmm0,%xmm0 + leal (%esp),%eax +-.L023cbc_bzero: ++.L025cbc_bzero: + movaps %xmm0,(%eax) + leal 16(%eax),%eax + cmpl %eax,%ebp +- ja .L023cbc_bzero +-.L022cbc_done: ++ ja .L025cbc_bzero ++.L024cbc_done: ++ movl 16(%ebp),%ebp + leal 24(%ebp),%esp +- jmp .L024cbc_exit ++ jmp .L026cbc_exit + .align 16 +-.L017cbc_short: ++.L018cbc_aligned: ++ leal (%esi,%ecx,1),%ebp ++ negl %ebp ++ andl $4095,%ebp + xorl %eax,%eax ++ cmpl $64,%ebp ++ movl $63,%ebp ++ cmovael %eax,%ebp ++ andl %ecx,%ebp ++ subl %ebp,%ecx ++ jz .L027cbc_aligned_tail ++ leal -16(%edx),%eax ++ leal 16(%edx),%ebx ++ shrl $4,%ecx ++.byte 243,15,167,208 ++ movaps (%eax),%xmm0 ++ movaps %xmm0,-16(%edx) ++ testl %ebp,%ebp ++ jz .L026cbc_exit ++.L027cbc_aligned_tail: ++ movl %ebp,%ecx + leal -24(%esp),%ebp +- subl %ecx,%eax ++ movl %ebp,%esp ++ movl %ebp,%eax ++ subl %ecx,%esp ++ andl $-16,%ebp ++ andl $-16,%esp ++ movl %eax,16(%ebp) ++ movl %edi,%eax ++ movl %ecx,%ebx 
++ shrl $2,%ecx ++ leal (%esp),%edi ++.byte 243,165 ++ movl %esp,%esi ++ movl %eax,%edi ++ movl %ebx,%ecx ++ jmp .L019cbc_loop ++.L026cbc_exit: ++ movl $1,%eax ++ leal 4(%esp),%esp ++.L016cbc_abort: ++ popl %edi ++ popl %esi ++ popl %ebx ++ popl %ebp ++ ret ++.size padlock_cbc_encrypt,.-.L_padlock_cbc_encrypt_begin ++.globl padlock_cfb_encrypt ++.type padlock_cfb_encrypt,@function ++.align 16 ++padlock_cfb_encrypt: ++.L_padlock_cfb_encrypt_begin: ++ pushl %ebp ++ pushl %ebx ++ pushl %esi ++ pushl %edi ++ movl 20(%esp),%edi ++ movl 24(%esp),%esi ++ movl 28(%esp),%edx ++ movl 32(%esp),%ecx ++ testl $15,%edx ++ jnz .L028cfb_abort ++ testl $15,%ecx ++ jnz .L028cfb_abort ++ leal .Lpadlock_saved_context-.L029cfb_pic_point,%eax ++ pushfl ++ cld ++ call _padlock_verify_ctx ++.L029cfb_pic_point: ++ leal 16(%edx),%edx ++ xorl %eax,%eax ++ xorl %ebx,%ebx ++ testl $32,(%edx) ++ jnz .L030cfb_aligned ++ testl $15,%edi ++ setz %al ++ testl $15,%esi ++ setz %bl ++ testl %ebx,%eax ++ jnz .L030cfb_aligned ++ negl %eax ++ movl $512,%ebx ++ notl %eax ++ leal -24(%esp),%ebp ++ cmpl %ebx,%ecx ++ cmovcl %ecx,%ebx ++ andl %ebx,%eax ++ movl %ecx,%ebx ++ negl %eax ++ andl $511,%ebx + leal (%eax,%ebp,1),%esp ++ movl $512,%eax ++ cmovzl %eax,%ebx ++ movl %ebp,%eax ++ andl $-16,%ebp + andl $-16,%esp ++ movl %eax,16(%ebp) ++ jmp .L031cfb_loop ++.align 16 ++.L031cfb_loop: ++ movl %edi,(%ebp) ++ movl %esi,4(%ebp) ++ movl %ecx,8(%ebp) ++ movl %ebx,%ecx ++ movl %ebx,12(%ebp) ++ testl $15,%edi ++ cmovnzl %esp,%edi ++ testl $15,%esi ++ jz .L032cfb_inp_aligned ++ shrl $2,%ecx ++.byte 243,165 ++ subl %ebx,%edi ++ movl %ebx,%ecx ++ movl %edi,%esi ++.L032cfb_inp_aligned: ++ leal -16(%edx),%eax ++ leal 16(%edx),%ebx ++ shrl $4,%ecx ++.byte 243,15,167,224 ++ movaps (%eax),%xmm0 ++ movaps %xmm0,-16(%edx) ++ movl (%ebp),%edi ++ movl 12(%ebp),%ebx ++ testl $15,%edi ++ jz .L033cfb_out_aligned ++ movl %ebx,%ecx ++ leal (%esp),%esi ++ shrl $2,%ecx ++.byte 243,165 ++ subl %ebx,%edi ++.L033cfb_out_aligned: ++ movl 4(%ebp),%esi ++ movl 8(%ebp),%ecx ++ addl %ebx,%edi ++ addl %ebx,%esi ++ subl %ebx,%ecx ++ movl $512,%ebx ++ jnz .L031cfb_loop ++ cmpl %ebp,%esp ++ je .L034cfb_done ++ pxor %xmm0,%xmm0 ++ leal (%esp),%eax ++.L035cfb_bzero: ++ movaps %xmm0,(%eax) ++ leal 16(%eax),%eax ++ cmpl %eax,%ebp ++ ja .L035cfb_bzero ++.L034cfb_done: ++ movl 16(%ebp),%ebp ++ leal 24(%ebp),%esp ++ jmp .L036cfb_exit ++.align 16 ++.L030cfb_aligned: ++ leal -16(%edx),%eax ++ leal 16(%edx),%ebx ++ shrl $4,%ecx ++.byte 243,15,167,224 ++ movaps (%eax),%xmm0 ++ movaps %xmm0,-16(%edx) ++.L036cfb_exit: ++ movl $1,%eax ++ leal 4(%esp),%esp ++.L028cfb_abort: ++ popl %edi ++ popl %esi ++ popl %ebx ++ popl %ebp ++ ret ++.size padlock_cfb_encrypt,.-.L_padlock_cfb_encrypt_begin ++.globl padlock_ofb_encrypt ++.type padlock_ofb_encrypt,@function ++.align 16 ++padlock_ofb_encrypt: ++.L_padlock_ofb_encrypt_begin: ++ pushl %ebp ++ pushl %ebx ++ pushl %esi ++ pushl %edi ++ movl 20(%esp),%edi ++ movl 24(%esp),%esi ++ movl 28(%esp),%edx ++ movl 32(%esp),%ecx ++ testl $15,%edx ++ jnz .L037ofb_abort ++ testl $15,%ecx ++ jnz .L037ofb_abort ++ leal .Lpadlock_saved_context-.L038ofb_pic_point,%eax ++ pushfl ++ cld ++ call _padlock_verify_ctx ++.L038ofb_pic_point: ++ leal 16(%edx),%edx ++ xorl %eax,%eax + xorl %ebx,%ebx +-.L025cbc_short_copy: +- movups (%esi,%ebx,1),%xmm0 +- leal 16(%ebx),%ebx ++ testl $32,(%edx) ++ jnz .L039ofb_aligned ++ testl $15,%edi ++ setz %al ++ testl $15,%esi ++ setz %bl ++ testl %ebx,%eax ++ jnz .L039ofb_aligned ++ negl %eax ++ movl $512,%ebx ++ notl %eax ++ 
leal -24(%esp),%ebp + cmpl %ebx,%ecx +- movaps %xmm0,-16(%esp,%ebx,1) +- ja .L025cbc_short_copy +- movl %esp,%esi ++ cmovcl %ecx,%ebx ++ andl %ebx,%eax + movl %ecx,%ebx +- jmp .L019cbc_loop ++ negl %eax ++ andl $511,%ebx ++ leal (%eax,%ebp,1),%esp ++ movl $512,%eax ++ cmovzl %eax,%ebx ++ movl %ebp,%eax ++ andl $-16,%ebp ++ andl $-16,%esp ++ movl %eax,16(%ebp) ++ jmp .L040ofb_loop + .align 16 +-.L018cbc_aligned: ++.L040ofb_loop: ++ movl %edi,(%ebp) ++ movl %esi,4(%ebp) ++ movl %ecx,8(%ebp) ++ movl %ebx,%ecx ++ movl %ebx,12(%ebp) ++ testl $15,%edi ++ cmovnzl %esp,%edi ++ testl $15,%esi ++ jz .L041ofb_inp_aligned ++ shrl $2,%ecx ++.byte 243,165 ++ subl %ebx,%edi ++ movl %ebx,%ecx ++ movl %edi,%esi ++.L041ofb_inp_aligned: + leal -16(%edx),%eax + leal 16(%edx),%ebx + shrl $4,%ecx +-.byte 243,15,167,208 ++.byte 243,15,167,232 + movaps (%eax),%xmm0 + movaps %xmm0,-16(%edx) +-.L024cbc_exit: ++ movl (%ebp),%edi ++ movl 12(%ebp),%ebx ++ testl $15,%edi ++ jz .L042ofb_out_aligned ++ movl %ebx,%ecx ++ leal (%esp),%esi ++ shrl $2,%ecx ++.byte 243,165 ++ subl %ebx,%edi ++.L042ofb_out_aligned: ++ movl 4(%ebp),%esi ++ movl 8(%ebp),%ecx ++ addl %ebx,%edi ++ addl %ebx,%esi ++ subl %ebx,%ecx ++ movl $512,%ebx ++ jnz .L040ofb_loop ++ cmpl %ebp,%esp ++ je .L043ofb_done ++ pxor %xmm0,%xmm0 ++ leal (%esp),%eax ++.L044ofb_bzero: ++ movaps %xmm0,(%eax) ++ leal 16(%eax),%eax ++ cmpl %eax,%ebp ++ ja .L044ofb_bzero ++.L043ofb_done: ++ movl 16(%ebp),%ebp ++ leal 24(%ebp),%esp ++ jmp .L045ofb_exit ++.align 16 ++.L039ofb_aligned: ++ leal -16(%edx),%eax ++ leal 16(%edx),%ebx ++ shrl $4,%ecx ++.byte 243,15,167,232 ++ movaps (%eax),%xmm0 ++ movaps %xmm0,-16(%edx) ++.L045ofb_exit: + movl $1,%eax + leal 4(%esp),%esp +-.L015cbc_abort: ++.L037ofb_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +-.size padlock_cbc_encrypt,.-.L_padlock_cbc_encrypt_begin ++.size padlock_ofb_encrypt,.-.L_padlock_ofb_encrypt_begin ++.globl padlock_ctr32_encrypt ++.type padlock_ctr32_encrypt,@function ++.align 16 ++padlock_ctr32_encrypt: ++.L_padlock_ctr32_encrypt_begin: ++ pushl %ebp ++ pushl %ebx ++ pushl %esi ++ pushl %edi ++ movl 20(%esp),%edi ++ movl 24(%esp),%esi ++ movl 28(%esp),%edx ++ movl 32(%esp),%ecx ++ testl $15,%edx ++ jnz .L046ctr32_abort ++ testl $15,%ecx ++ jnz .L046ctr32_abort ++ leal .Lpadlock_saved_context-.L047ctr32_pic_point,%eax ++ pushfl ++ cld ++ call _padlock_verify_ctx ++.L047ctr32_pic_point: ++ leal 16(%edx),%edx ++ xorl %eax,%eax ++ movq -16(%edx),%mm0 ++ movl $512,%ebx ++ notl %eax ++ leal -24(%esp),%ebp ++ cmpl %ebx,%ecx ++ cmovcl %ecx,%ebx ++ andl %ebx,%eax ++ movl %ecx,%ebx ++ negl %eax ++ andl $511,%ebx ++ leal (%eax,%ebp,1),%esp ++ movl $512,%eax ++ cmovzl %eax,%ebx ++ movl %ebp,%eax ++ andl $-16,%ebp ++ andl $-16,%esp ++ movl %eax,16(%ebp) ++ jmp .L048ctr32_loop ++.align 16 ++.L048ctr32_loop: ++ movl %edi,(%ebp) ++ movl %esi,4(%ebp) ++ movl %ecx,8(%ebp) ++ movl %ebx,%ecx ++ movl %ebx,12(%ebp) ++ movl -4(%edx),%ecx ++ xorl %edi,%edi ++ movl -8(%edx),%eax ++.L049ctr32_prepare: ++ movl %ecx,12(%esp,%edi,1) ++ bswap %ecx ++ movq %mm0,(%esp,%edi,1) ++ incl %ecx ++ movl %eax,8(%esp,%edi,1) ++ bswap %ecx ++ leal 16(%edi),%edi ++ cmpl %ebx,%edi ++ jb .L049ctr32_prepare ++ movl %ecx,-4(%edx) ++ leal (%esp),%esi ++ leal (%esp),%edi ++ movl %ebx,%ecx ++ leal -16(%edx),%eax ++ leal 16(%edx),%ebx ++ shrl $4,%ecx ++.byte 243,15,167,200 ++ movl (%ebp),%edi ++ movl 12(%ebp),%ebx ++ movl 4(%ebp),%esi ++ xorl %ecx,%ecx ++.L050ctr32_xor: ++ movups (%esi,%ecx,1),%xmm1 ++ leal 16(%ecx),%ecx ++ pxor -16(%esp,%ecx,1),%xmm1 
++ movups %xmm1,-16(%edi,%ecx,1) ++ cmpl %ebx,%ecx ++ jb .L050ctr32_xor ++ movl 8(%ebp),%ecx ++ addl %ebx,%edi ++ addl %ebx,%esi ++ subl %ebx,%ecx ++ movl $512,%ebx ++ jnz .L048ctr32_loop ++ pxor %xmm0,%xmm0 ++ leal (%esp),%eax ++.L051ctr32_bzero: ++ movaps %xmm0,(%eax) ++ leal 16(%eax),%eax ++ cmpl %eax,%ebp ++ ja .L051ctr32_bzero ++.L052ctr32_done: ++ movl 16(%ebp),%ebp ++ leal 24(%ebp),%esp ++ movl $1,%eax ++ leal 4(%esp),%esp ++ emms ++.L046ctr32_abort: ++ popl %edi ++ popl %esi ++ popl %ebx ++ popl %ebp ++ ret ++.size padlock_ctr32_encrypt,.-.L_padlock_ctr32_encrypt_begin + .globl padlock_xstore + .type padlock_xstore,@function + .align 16 +@@ -447,10 +894,10 @@ _win32_segv_handler: + movl 4(%esp),%edx + movl 12(%esp),%ecx + cmpl $3221225477,(%edx) +- jne .L026ret ++ jne .L053ret + addl $4,184(%ecx) + movl $0,%eax +-.L026ret: ++.L053ret: + ret + .size _win32_segv_handler,.-_win32_segv_handler + .globl padlock_sha1_oneshot +diff --git a/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s b/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s +index cfac705..eac88ae 100644 +--- a/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s ++++ b/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s +@@ -699,6 +699,7 @@ L$ghash_epilogue: + + .p2align 4 + _gcm_init_clmul: ++L$_init_clmul: + movdqu (%rsi),%xmm2 + pshufd $78,%xmm2,%xmm2 + +@@ -717,15 +718,15 @@ _gcm_init_clmul: + pxor %xmm5,%xmm2 + + ++ pshufd $78,%xmm2,%xmm6 + movdqa %xmm2,%xmm0 ++ pxor %xmm2,%xmm6 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 +- pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 +- pxor %xmm2,%xmm4 + .byte 102,15,58,68,194,0 + .byte 102,15,58,68,202,17 +-.byte 102,15,58,68,220,0 ++.byte 102,15,58,68,222,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + +@@ -735,44 +736,134 @@ _gcm_init_clmul: + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + ++ movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 ++ psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 ++ psllq $57,%xmm0 ++ movdqa %xmm0,%xmm3 ++ pslldq $8,%xmm0 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ ++ ++ movdqa %xmm0,%xmm4 ++ psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 ++ pxor %xmm4,%xmm0 ++ psrlq $1,%xmm0 ++ pxor %xmm1,%xmm0 ++ pshufd $78,%xmm2,%xmm3 ++ pshufd $78,%xmm0,%xmm4 ++ pxor %xmm2,%xmm3 ++ movdqu %xmm2,0(%rdi) ++ pxor %xmm0,%xmm4 ++ movdqu %xmm0,16(%rdi) ++.byte 102,15,58,15,227,8 ++ movdqu %xmm4,32(%rdi) ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm3 ++ pxor %xmm0,%xmm3 ++.byte 102,15,58,68,194,0 ++.byte 102,15,58,68,202,17 ++.byte 102,15,58,68,222,0 ++ pxor %xmm0,%xmm3 ++ pxor %xmm1,%xmm3 ++ ++ movdqa %xmm3,%xmm4 ++ psrldq $8,%xmm3 ++ pslldq $8,%xmm4 ++ pxor %xmm3,%xmm1 ++ pxor %xmm4,%xmm0 ++ ++ movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 +- movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 +- psrldq $8,%xmm4 +- pxor %xmm3,%xmm0 +- pxor %xmm4,%xmm1 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 ++ psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 ++ pxor %xmm1,%xmm0 ++ movdqa %xmm0,%xmm5 ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm3 ++ pxor %xmm0,%xmm3 ++.byte 102,15,58,68,194,0 ++.byte 102,15,58,68,202,17 ++.byte 102,15,58,68,222,0 ++ pxor %xmm0,%xmm3 ++ pxor %xmm1,%xmm3 ++ ++ movdqa %xmm3,%xmm4 ++ psrldq $8,%xmm3 ++ pslldq $8,%xmm4 ++ pxor %xmm3,%xmm1 ++ pxor %xmm4,%xmm0 ++ ++ movdqa %xmm0,%xmm4 ++ movdqa 
%xmm0,%xmm3 ++ psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 ++ pxor %xmm3,%xmm0 ++ psllq $57,%xmm0 ++ movdqa %xmm0,%xmm3 ++ pslldq $8,%xmm0 ++ psrldq $8,%xmm3 + pxor %xmm4,%xmm0 +- pxor %xmm1,%xmm4 ++ pxor %xmm3,%xmm1 ++ ++ ++ movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 + pxor %xmm4,%xmm0 +- movdqu %xmm2,(%rdi) +- movdqu %xmm0,16(%rdi) ++ psrlq $1,%xmm0 ++ pxor %xmm1,%xmm0 ++ pshufd $78,%xmm5,%xmm3 ++ pshufd $78,%xmm0,%xmm4 ++ pxor %xmm5,%xmm3 ++ movdqu %xmm5,48(%rdi) ++ pxor %xmm0,%xmm4 ++ movdqu %xmm0,64(%rdi) ++.byte 102,15,58,15,227,8 ++ movdqu %xmm4,80(%rdi) + .byte 0xf3,0xc3 + + .globl _gcm_gmult_clmul + + .p2align 4 + _gcm_gmult_clmul: ++L$_gmult_clmul: + movdqu (%rdi),%xmm0 + movdqa L$bswap_mask(%rip),%xmm5 + movdqu (%rsi),%xmm2 ++ movdqu 32(%rsi),%xmm4 + .byte 102,15,56,0,197 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 +- pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 +- pxor %xmm2,%xmm4 + .byte 102,15,58,68,194,0 + .byte 102,15,58,68,202,17 + .byte 102,15,58,68,220,0 +@@ -785,186 +876,358 @@ _gcm_gmult_clmul: + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + ++ movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 +- psllq $1,%xmm0 +- pxor %xmm3,%xmm0 + psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 +- movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 +- psrldq $8,%xmm4 +- pxor %xmm3,%xmm0 +- pxor %xmm4,%xmm1 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 +- psrlq $5,%xmm0 +- pxor %xmm4,%xmm0 + psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 + pxor %xmm4,%xmm0 +- pxor %xmm1,%xmm4 + psrlq $1,%xmm0 +- pxor %xmm4,%xmm0 ++ pxor %xmm1,%xmm0 + .byte 102,15,56,0,197 + movdqu %xmm0,(%rdi) + .byte 0xf3,0xc3 + + .globl _gcm_ghash_clmul + +-.p2align 4 ++.p2align 5 + _gcm_ghash_clmul: ++L$_ghash_clmul: + movdqa L$bswap_mask(%rip),%xmm5 ++ movq $11547335547999543296,%rax + + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm2 ++ movdqu 32(%rsi),%xmm10 + .byte 102,15,56,0,197 + + subq $16,%rcx + jz L$odd_tail + +- movdqu 16(%rsi),%xmm8 ++ movdqu 16(%rsi),%xmm9 ++ cmpq $48,%rcx ++ jb L$skip4x + ++ subq $48,%rcx ++ movdqu 48(%rsi),%xmm14 ++ movdqu 64(%rsi),%xmm15 + + + + +- movdqu (%rdx),%xmm3 +- movdqu 16(%rdx),%xmm6 +-.byte 102,15,56,0,221 ++ movdqu 48(%rdx),%xmm6 ++ movdqu 32(%rdx),%xmm11 + .byte 102,15,56,0,245 +- pxor %xmm3,%xmm0 +- movdqa %xmm6,%xmm7 +- pshufd $78,%xmm6,%xmm3 +- pshufd $78,%xmm2,%xmm4 +- pxor %xmm6,%xmm3 +- pxor %xmm2,%xmm4 ++.byte 102,68,15,56,0,221 ++ movdqa %xmm6,%xmm8 ++ pshufd $78,%xmm6,%xmm7 ++ pxor %xmm6,%xmm7 + .byte 102,15,58,68,242,0 +-.byte 102,15,58,68,250,17 +-.byte 102,15,58,68,220,0 +- pxor %xmm6,%xmm3 +- pxor %xmm7,%xmm3 ++.byte 102,68,15,58,68,194,17 ++.byte 102,65,15,58,68,250,0 ++ ++ movdqa %xmm11,%xmm13 ++ pshufd $78,%xmm11,%xmm12 ++ pxor %xmm11,%xmm12 ++.byte 102,69,15,58,68,217,0 ++.byte 102,69,15,58,68,233,17 ++ xorps %xmm11,%xmm6 ++.byte 102,69,15,58,68,226,16 ++ xorps %xmm13,%xmm8 ++ movups 80(%rsi),%xmm10 ++ xorps %xmm12,%xmm7 ++ ++ movdqu 16(%rdx),%xmm11 ++ movdqu 0(%rdx),%xmm3 ++.byte 102,68,15,56,0,221 ++.byte 102,15,56,0,221 ++ movdqa %xmm11,%xmm13 ++ pshufd $78,%xmm11,%xmm12 ++ pxor %xmm3,%xmm0 ++ pxor %xmm11,%xmm12 ++.byte 102,69,15,58,68,222,0 ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm3 ++ pxor %xmm0,%xmm3 ++.byte 102,69,15,58,68,238,17 ++ xorps %xmm11,%xmm6 ++.byte 102,69,15,58,68,226,0 ++ xorps %xmm13,%xmm8 ++ ++ leaq 64(%rdx),%rdx ++ subq $64,%rcx ++ jc L$tail4x ++ ++ jmp L$mod4_loop ++.p2align 5 
++L$mod4_loop: ++.byte 102,65,15,58,68,199,0 ++ xorps %xmm12,%xmm7 ++ movdqu 48(%rdx),%xmm11 ++.byte 102,68,15,56,0,221 ++.byte 102,65,15,58,68,207,17 ++ xorps %xmm6,%xmm0 ++ movdqu 32(%rdx),%xmm6 ++ movdqa %xmm11,%xmm13 ++ pshufd $78,%xmm11,%xmm12 ++.byte 102,65,15,58,68,218,16 ++ xorps %xmm8,%xmm1 ++ pxor %xmm11,%xmm12 ++.byte 102,15,56,0,245 ++ movups 32(%rsi),%xmm10 ++.byte 102,68,15,58,68,218,0 ++ xorps %xmm7,%xmm3 ++ movdqa %xmm6,%xmm8 ++ pshufd $78,%xmm6,%xmm7 + ++ pxor %xmm0,%xmm3 ++ pxor %xmm6,%xmm7 ++ pxor %xmm1,%xmm3 + movdqa %xmm3,%xmm4 +- psrldq $8,%xmm3 ++ pslldq $8,%xmm3 ++.byte 102,68,15,58,68,234,17 ++ psrldq $8,%xmm4 ++ pxor %xmm3,%xmm0 ++ movdqa L$7_mask(%rip),%xmm3 ++ pxor %xmm4,%xmm1 ++.byte 102,72,15,110,224 ++ ++ pand %xmm0,%xmm3 ++.byte 102,15,56,0,227 ++.byte 102,69,15,58,68,226,0 ++ pxor %xmm0,%xmm4 ++ psllq $57,%xmm4 ++ movdqa %xmm4,%xmm3 + pslldq $8,%xmm4 +- pxor %xmm3,%xmm7 +- pxor %xmm4,%xmm6 ++.byte 102,65,15,58,68,241,0 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ movdqu 0(%rdx),%xmm3 ++ ++ movdqa %xmm0,%xmm4 ++ psrlq $1,%xmm0 ++.byte 102,69,15,58,68,193,17 ++ xorps %xmm11,%xmm6 ++ movdqu 16(%rdx),%xmm11 ++.byte 102,68,15,56,0,221 ++.byte 102,65,15,58,68,250,16 ++ xorps %xmm13,%xmm8 ++ movups 80(%rsi),%xmm10 ++.byte 102,15,56,0,221 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 ++ ++ movdqa %xmm11,%xmm13 ++ pxor %xmm12,%xmm7 ++ pshufd $78,%xmm11,%xmm12 ++ pxor %xmm11,%xmm12 ++.byte 102,69,15,58,68,222,0 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ psrlq $1,%xmm0 ++.byte 102,69,15,58,68,238,17 ++ xorps %xmm11,%xmm6 ++ pxor %xmm1,%xmm0 ++ ++.byte 102,69,15,58,68,226,0 ++ xorps %xmm13,%xmm8 ++ + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 +- pshufd $78,%xmm8,%xmm4 + pxor %xmm0,%xmm3 +- pxor %xmm8,%xmm4 + +- leaq 32(%rdx),%rdx +- subq $32,%rcx +- jbe L$even_tail ++ leaq 64(%rdx),%rdx ++ subq $64,%rcx ++ jnc L$mod4_loop ++ ++L$tail4x: ++.byte 102,65,15,58,68,199,0 ++ xorps %xmm12,%xmm7 ++.byte 102,65,15,58,68,207,17 ++ xorps %xmm6,%xmm0 ++.byte 102,65,15,58,68,218,16 ++ xorps %xmm8,%xmm1 ++ pxor %xmm0,%xmm1 ++ pxor %xmm7,%xmm3 + +-L$mod_loop: +-.byte 102,65,15,58,68,192,0 +-.byte 102,65,15,58,68,200,17 +-.byte 102,15,58,68,220,0 +- pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 ++ pxor %xmm0,%xmm1 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 +- movdqu (%rdx),%xmm3 +- pxor %xmm6,%xmm0 +- pxor %xmm7,%xmm1 + ++ movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 ++ psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 ++ pxor %xmm3,%xmm0 ++ psllq $57,%xmm0 ++ movdqa %xmm0,%xmm3 ++ pslldq $8,%xmm0 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ ++ ++ movdqa %xmm0,%xmm4 ++ psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 ++ pxor %xmm4,%xmm0 ++ psrlq $1,%xmm0 ++ pxor %xmm1,%xmm0 ++ addq $64,%rcx ++ jz L$done ++ movdqu 32(%rsi),%xmm10 ++ subq $16,%rcx ++ jz L$odd_tail ++L$skip4x: ++ ++ ++ ++ ++ ++ movdqu (%rdx),%xmm3 + movdqu 16(%rdx),%xmm6 + .byte 102,15,56,0,221 + .byte 102,15,56,0,245 ++ pxor %xmm3,%xmm0 ++ ++ movdqa %xmm6,%xmm8 ++ pshufd $78,%xmm6,%xmm3 ++ pxor %xmm6,%xmm3 ++.byte 102,15,58,68,242,0 ++.byte 102,68,15,58,68,194,17 ++.byte 102,65,15,58,68,218,0 ++ ++ leaq 32(%rdx),%rdx ++ subq $32,%rcx ++ jbe L$even_tail ++ jmp L$mod_loop ++ ++.p2align 5 ++L$mod_loop: ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm4 ++ pxor %xmm0,%xmm4 ++ ++.byte 102,65,15,58,68,193,0 ++.byte 102,65,15,58,68,201,17 ++.byte 102,65,15,58,68,226,16 ++ ++ pxor %xmm6,%xmm0 ++ pxor %xmm8,%xmm1 ++ 
movdqu (%rdx),%xmm8 ++.byte 102,68,15,56,0,197 ++ movdqu 16(%rdx),%xmm6 + +- movdqa %xmm6,%xmm7 +- pshufd $78,%xmm6,%xmm9 +- pshufd $78,%xmm2,%xmm10 +- pxor %xmm6,%xmm9 +- pxor %xmm2,%xmm10 ++ pxor %xmm0,%xmm3 ++ pxor %xmm1,%xmm3 ++ pxor %xmm8,%xmm1 ++ pxor %xmm3,%xmm4 ++.byte 102,15,56,0,245 ++ movdqa %xmm4,%xmm3 ++ psrldq $8,%xmm3 ++ pslldq $8,%xmm4 + pxor %xmm3,%xmm1 ++ pxor %xmm4,%xmm0 + ++ movdqa %xmm6,%xmm8 ++ ++ movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 +- psllq $1,%xmm0 +- pxor %xmm3,%xmm0 + psllq $5,%xmm0 +- pxor %xmm3,%xmm0 + .byte 102,15,58,68,242,0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 ++ pxor %xmm3,%xmm0 + psllq $57,%xmm0 +- movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 +- psrldq $8,%xmm4 +- pxor %xmm3,%xmm0 +- pxor %xmm4,%xmm1 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ pshufd $78,%xmm8,%xmm3 ++ pxor %xmm8,%xmm3 + +-.byte 102,15,58,68,250,17 ++.byte 102,68,15,58,68,194,17 + movdqa %xmm0,%xmm4 +- psrlq $5,%xmm0 +- pxor %xmm4,%xmm0 + psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 + pxor %xmm4,%xmm0 +- pxor %xmm1,%xmm4 + psrlq $1,%xmm0 +- pxor %xmm4,%xmm0 +- +-.byte 102,69,15,58,68,202,0 +- movdqa %xmm0,%xmm1 +- pshufd $78,%xmm0,%xmm3 +- pshufd $78,%xmm8,%xmm4 +- pxor %xmm0,%xmm3 +- pxor %xmm8,%xmm4 +- +- pxor %xmm6,%xmm9 +- pxor %xmm7,%xmm9 +- movdqa %xmm9,%xmm10 +- psrldq $8,%xmm9 +- pslldq $8,%xmm10 +- pxor %xmm9,%xmm7 +- pxor %xmm10,%xmm6 ++.byte 102,65,15,58,68,218,0 ++ pxor %xmm1,%xmm0 + + leaq 32(%rdx),%rdx + subq $32,%rcx + ja L$mod_loop + + L$even_tail: +-.byte 102,65,15,58,68,192,0 +-.byte 102,65,15,58,68,200,17 +-.byte 102,15,58,68,220,0 ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm4 ++ pxor %xmm0,%xmm4 ++ ++.byte 102,65,15,58,68,193,0 ++.byte 102,65,15,58,68,201,17 ++.byte 102,65,15,58,68,226,16 ++ ++ pxor %xmm6,%xmm0 ++ pxor %xmm8,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 +- +- movdqa %xmm3,%xmm4 ++ pxor %xmm3,%xmm4 ++ movdqa %xmm4,%xmm3 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 +- pxor %xmm6,%xmm0 +- pxor %xmm7,%xmm1 + ++ movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 +- psllq $1,%xmm0 +- pxor %xmm3,%xmm0 + psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 +- movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 +- psrldq $8,%xmm4 +- pxor %xmm3,%xmm0 +- pxor %xmm4,%xmm1 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 +- psrlq $5,%xmm0 +- pxor %xmm4,%xmm0 + psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 + pxor %xmm4,%xmm0 +- pxor %xmm1,%xmm4 + psrlq $1,%xmm0 +- pxor %xmm4,%xmm0 ++ pxor %xmm1,%xmm0 + testq %rcx,%rcx + jnz L$done + +@@ -974,12 +1237,10 @@ L$odd_tail: + pxor %xmm3,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 +- pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 +- pxor %xmm2,%xmm4 + .byte 102,15,58,68,194,0 + .byte 102,15,58,68,202,17 +-.byte 102,15,58,68,220,0 ++.byte 102,65,15,58,68,218,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + +@@ -989,38 +1250,60 @@ L$odd_tail: + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + ++ movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 +- psllq $1,%xmm0 +- pxor %xmm3,%xmm0 + psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 +- movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 +- psrldq $8,%xmm4 +- pxor %xmm3,%xmm0 +- pxor %xmm4,%xmm1 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 +- psrlq $5,%xmm0 +- pxor %xmm4,%xmm0 + psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 
++ psrlq $5,%xmm0 + pxor %xmm4,%xmm0 +- pxor %xmm1,%xmm4 + psrlq $1,%xmm0 +- pxor %xmm4,%xmm0 ++ pxor %xmm1,%xmm0 + L$done: + .byte 102,15,56,0,197 + movdqu %xmm0,(%rdi) + .byte 0xf3,0xc3 +-L$SEH_end_gcm_ghash_clmul: ++ ++.globl _gcm_init_avx ++ ++.p2align 5 ++_gcm_init_avx: ++ jmp L$_init_clmul ++ ++.globl _gcm_gmult_avx ++ ++.p2align 5 ++_gcm_gmult_avx: ++ jmp L$_gmult_clmul ++ ++.globl _gcm_ghash_avx ++ ++.p2align 5 ++_gcm_ghash_avx: ++ jmp L$_ghash_clmul + + .p2align 6 + L$bswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 + L$0x1c2_polynomial: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 ++L$7_mask: ++.long 7,0,7,0 ++L$7_mask_poly: ++.long 7,0,450,0 + .p2align 6 + + L$rem_4bit: +diff --git a/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s b/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s +index a82f0a5..e2cfa17 100644 +--- a/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s ++++ b/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s +@@ -927,199 +927,412 @@ L$oop_enc1_6: + + .p2align 4 + _aesni_ctr32_encrypt_blocks: ++ leaq (%rsp),%rax ++ pushq %rbp ++ subq $128,%rsp ++ andq $-16,%rsp ++ leaq -8(%rax),%rbp ++ + cmpq $1,%rdx + je L$ctr32_one_shortcut + +- movdqu (%r8),%xmm14 +- movdqa L$bswap_mask(%rip),%xmm15 +- xorl %eax,%eax +-.byte 102,69,15,58,22,242,3 +-.byte 102,68,15,58,34,240,3 ++ movdqu (%r8),%xmm2 ++ movdqu (%rcx),%xmm0 ++ movl 12(%r8),%r8d ++ pxor %xmm0,%xmm2 ++ movl 12(%rcx),%r11d ++ movdqa %xmm2,0(%rsp) ++ bswapl %r8d ++ movdqa %xmm2,%xmm3 ++ movdqa %xmm2,%xmm4 ++ movdqa %xmm2,%xmm5 ++ movdqa %xmm2,64(%rsp) ++ movdqa %xmm2,80(%rsp) ++ movdqa %xmm2,96(%rsp) ++ movdqa %xmm2,112(%rsp) + + movl 240(%rcx),%eax ++ ++ leaq 1(%r8),%r9 ++ leaq 2(%r8),%r10 ++ bswapl %r9d + bswapl %r10d +- pxor %xmm12,%xmm12 +- pxor %xmm13,%xmm13 +-.byte 102,69,15,58,34,226,0 +- leaq 3(%r10),%r11 +-.byte 102,69,15,58,34,235,0 +- incl %r10d +-.byte 102,69,15,58,34,226,1 +- incq %r11 +-.byte 102,69,15,58,34,235,1 +- incl %r10d +-.byte 102,69,15,58,34,226,2 +- incq %r11 +-.byte 102,69,15,58,34,235,2 +- movdqa %xmm12,-40(%rsp) +-.byte 102,69,15,56,0,231 +- movdqa %xmm13,-24(%rsp) +-.byte 102,69,15,56,0,239 +- +- pshufd $192,%xmm12,%xmm2 +- pshufd $128,%xmm12,%xmm3 +- pshufd $64,%xmm12,%xmm4 +- cmpq $6,%rdx +- jb L$ctr32_tail +- shrl $1,%eax +- movq %rcx,%r11 +- movl %eax,%r10d +- subq $6,%rdx +- jmp L$ctr32_loop6 ++ xorl %r11d,%r9d ++ xorl %r11d,%r10d ++.byte 102,65,15,58,34,217,3 ++ leaq 3(%r8),%r9 ++ movdqa %xmm3,16(%rsp) ++.byte 102,65,15,58,34,226,3 ++ bswapl %r9d ++ leaq 4(%r8),%r10 ++ movdqa %xmm4,32(%rsp) ++ xorl %r11d,%r9d ++ bswapl %r10d ++.byte 102,65,15,58,34,233,3 ++ xorl %r11d,%r10d ++ movdqa %xmm5,48(%rsp) ++ leaq 5(%r8),%r9 ++ movl %r10d,64+12(%rsp) ++ bswapl %r9d ++ leaq 6(%r8),%r10 ++ xorl %r11d,%r9d ++ bswapl %r10d ++ movl %r9d,80+12(%rsp) ++ xorl %r11d,%r10d ++ leaq 7(%r8),%r9 ++ movl %r10d,96+12(%rsp) ++ bswapl %r9d ++ xorl %r11d,%r9d ++ movl %r9d,112+12(%rsp) + +-.p2align 4 +-L$ctr32_loop6: +- pshufd $192,%xmm13,%xmm5 +- por %xmm14,%xmm2 +- movups (%r11),%xmm0 +- pshufd $128,%xmm13,%xmm6 +- por %xmm14,%xmm3 +- movups 16(%r11),%xmm1 +- pshufd $64,%xmm13,%xmm7 +- por %xmm14,%xmm4 +- por %xmm14,%xmm5 +- xorps %xmm0,%xmm2 +- por %xmm14,%xmm6 +- por %xmm14,%xmm7 ++ movups 16(%rcx),%xmm1 + ++ movdqa 64(%rsp),%xmm6 ++ movdqa 80(%rsp),%xmm7 + ++ cmpq $8,%rdx ++ jb L$ctr32_tail + ++ leaq 128(%rcx),%rcx ++ subq $8,%rdx ++ jmp L$ctr32_loop8 + +- pxor %xmm0,%xmm3 ++.p2align 5 ++L$ctr32_loop8: ++ addl $8,%r8d ++ movdqa 96(%rsp),%xmm8 + .byte 102,15,56,220,209 +- leaq 
32(%r11),%rcx +- pxor %xmm0,%xmm4 ++ movl %r8d,%r9d ++ movdqa 112(%rsp),%xmm9 + .byte 102,15,56,220,217 +- movdqa L$increment32(%rip),%xmm13 +- pxor %xmm0,%xmm5 ++ bswapl %r9d ++ movups 32-128(%rcx),%xmm0 + .byte 102,15,56,220,225 +- movdqa -40(%rsp),%xmm12 +- pxor %xmm0,%xmm6 ++ xorl %r11d,%r9d + .byte 102,15,56,220,233 +- pxor %xmm0,%xmm7 +- movups (%rcx),%xmm0 +- decl %eax ++ movl %r9d,0+12(%rsp) ++ leaq 1(%r8),%r9 + .byte 102,15,56,220,241 + .byte 102,15,56,220,249 +- jmp L$ctr32_enc_loop6_enter +-.p2align 4 +-L$ctr32_enc_loop6: ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 48-128(%rcx),%xmm1 ++.byte 102,15,56,220,208 ++.byte 102,15,56,220,216 ++ bswapl %r9d ++.byte 102,15,56,220,224 ++ xorl %r11d,%r9d ++.byte 102,15,56,220,232 ++ movl %r9d,16+12(%rsp) ++ leaq 2(%r8),%r9 ++.byte 102,15,56,220,240 ++.byte 102,15,56,220,248 ++.byte 102,68,15,56,220,192 ++.byte 102,68,15,56,220,200 ++ movups 64-128(%rcx),%xmm0 + .byte 102,15,56,220,209 + .byte 102,15,56,220,217 +- decl %eax ++ bswapl %r9d + .byte 102,15,56,220,225 ++ xorl %r11d,%r9d + .byte 102,15,56,220,233 ++ movl %r9d,32+12(%rsp) ++ leaq 3(%r8),%r9 + .byte 102,15,56,220,241 + .byte 102,15,56,220,249 +-L$ctr32_enc_loop6_enter: +- movups 16(%rcx),%xmm1 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 80-128(%rcx),%xmm1 + .byte 102,15,56,220,208 + .byte 102,15,56,220,216 +- leaq 32(%rcx),%rcx ++ bswapl %r9d + .byte 102,15,56,220,224 ++ xorl %r11d,%r9d + .byte 102,15,56,220,232 ++ movl %r9d,48+12(%rsp) ++ leaq 4(%r8),%r9 + .byte 102,15,56,220,240 + .byte 102,15,56,220,248 +- movups (%rcx),%xmm0 +- jnz L$ctr32_enc_loop6 ++.byte 102,68,15,56,220,192 ++.byte 102,68,15,56,220,200 ++ movups 96-128(%rcx),%xmm0 ++.byte 102,15,56,220,209 ++.byte 102,15,56,220,217 ++ bswapl %r9d ++.byte 102,15,56,220,225 ++ xorl %r11d,%r9d ++.byte 102,15,56,220,233 ++ movl %r9d,64+12(%rsp) ++ leaq 5(%r8),%r9 ++.byte 102,15,56,220,241 ++.byte 102,15,56,220,249 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 112-128(%rcx),%xmm1 ++.byte 102,15,56,220,208 ++.byte 102,15,56,220,216 ++ bswapl %r9d ++.byte 102,15,56,220,224 ++ xorl %r11d,%r9d ++.byte 102,15,56,220,232 ++ movl %r9d,80+12(%rsp) ++ leaq 6(%r8),%r9 ++.byte 102,15,56,220,240 ++.byte 102,15,56,220,248 ++.byte 102,68,15,56,220,192 ++.byte 102,68,15,56,220,200 ++ movups 128-128(%rcx),%xmm0 ++.byte 102,15,56,220,209 ++.byte 102,15,56,220,217 ++ bswapl %r9d ++.byte 102,15,56,220,225 ++ xorl %r11d,%r9d ++.byte 102,15,56,220,233 ++ movl %r9d,96+12(%rsp) ++ leaq 7(%r8),%r9 ++.byte 102,15,56,220,241 ++.byte 102,15,56,220,249 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 144-128(%rcx),%xmm1 ++.byte 102,15,56,220,208 ++.byte 102,15,56,220,216 ++ bswapl %r9d ++.byte 102,15,56,220,224 ++ xorl %r11d,%r9d ++.byte 102,15,56,220,232 ++ movl %r9d,112+12(%rsp) ++.byte 102,15,56,220,240 ++.byte 102,15,56,220,248 ++.byte 102,68,15,56,220,192 ++ movdqu 0(%rdi),%xmm10 ++.byte 102,68,15,56,220,200 ++ movups 160-128(%rcx),%xmm0 ++ ++ cmpl $11,%eax ++ jb L$ctr32_enc_done + + .byte 102,15,56,220,209 +- paddd %xmm13,%xmm12 + .byte 102,15,56,220,217 +- paddd -24(%rsp),%xmm13 + .byte 102,15,56,220,225 +- movdqa %xmm12,-40(%rsp) + .byte 102,15,56,220,233 +- movdqa %xmm13,-24(%rsp) + .byte 102,15,56,220,241 +-.byte 102,69,15,56,0,231 + .byte 102,15,56,220,249 +-.byte 102,69,15,56,0,239 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 176-128(%rcx),%xmm1 + +-.byte 102,15,56,221,208 +- movups (%rdi),%xmm8 +-.byte 102,15,56,221,216 +- movups 
16(%rdi),%xmm9 +-.byte 102,15,56,221,224 +- movups 32(%rdi),%xmm10 +-.byte 102,15,56,221,232 +- movups 48(%rdi),%xmm11 +-.byte 102,15,56,221,240 +- movups 64(%rdi),%xmm1 +-.byte 102,15,56,221,248 +- movups 80(%rdi),%xmm0 +- leaq 96(%rdi),%rdi ++.byte 102,15,56,220,208 ++.byte 102,15,56,220,216 ++.byte 102,15,56,220,224 ++.byte 102,15,56,220,232 ++.byte 102,15,56,220,240 ++.byte 102,15,56,220,248 ++.byte 102,68,15,56,220,192 ++.byte 102,68,15,56,220,200 ++ movups 192-128(%rcx),%xmm0 ++ je L$ctr32_enc_done + +- xorps %xmm2,%xmm8 +- pshufd $192,%xmm12,%xmm2 +- xorps %xmm3,%xmm9 +- pshufd $128,%xmm12,%xmm3 +- movups %xmm8,(%rsi) +- xorps %xmm4,%xmm10 +- pshufd $64,%xmm12,%xmm4 +- movups %xmm9,16(%rsi) +- xorps %xmm5,%xmm11 +- movups %xmm10,32(%rsi) +- xorps %xmm6,%xmm1 +- movups %xmm11,48(%rsi) +- xorps %xmm7,%xmm0 +- movups %xmm1,64(%rsi) +- movups %xmm0,80(%rsi) +- leaq 96(%rsi),%rsi +- movl %r10d,%eax +- subq $6,%rdx +- jnc L$ctr32_loop6 ++.byte 102,15,56,220,209 ++.byte 102,15,56,220,217 ++.byte 102,15,56,220,225 ++.byte 102,15,56,220,233 ++.byte 102,15,56,220,241 ++.byte 102,15,56,220,249 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movups 208-128(%rcx),%xmm1 ++ ++.byte 102,15,56,220,208 ++.byte 102,15,56,220,216 ++.byte 102,15,56,220,224 ++.byte 102,15,56,220,232 ++.byte 102,15,56,220,240 ++.byte 102,15,56,220,248 ++.byte 102,68,15,56,220,192 ++.byte 102,68,15,56,220,200 ++ movups 224-128(%rcx),%xmm0 ++ ++L$ctr32_enc_done: ++ movdqu 16(%rdi),%xmm11 ++ pxor %xmm0,%xmm10 ++ movdqu 32(%rdi),%xmm12 ++ pxor %xmm0,%xmm11 ++ movdqu 48(%rdi),%xmm13 ++ pxor %xmm0,%xmm12 ++ movdqu 64(%rdi),%xmm14 ++ pxor %xmm0,%xmm13 ++ movdqu 80(%rdi),%xmm15 ++ pxor %xmm0,%xmm14 ++.byte 102,15,56,220,209 ++ pxor %xmm0,%xmm15 ++.byte 102,15,56,220,217 ++.byte 102,15,56,220,225 ++.byte 102,15,56,220,233 ++.byte 102,15,56,220,241 ++.byte 102,15,56,220,249 ++.byte 102,68,15,56,220,193 ++.byte 102,68,15,56,220,201 ++ movdqu 96(%rdi),%xmm1 ++ ++.byte 102,65,15,56,221,210 ++ pxor %xmm0,%xmm1 ++ movdqu 112(%rdi),%xmm10 ++ leaq 128(%rdi),%rdi ++.byte 102,65,15,56,221,219 ++ pxor %xmm0,%xmm10 ++ movdqa 0(%rsp),%xmm11 ++.byte 102,65,15,56,221,228 ++ movdqa 16(%rsp),%xmm12 ++.byte 102,65,15,56,221,237 ++ movdqa 32(%rsp),%xmm13 ++.byte 102,65,15,56,221,246 ++ movdqa 48(%rsp),%xmm14 ++.byte 102,65,15,56,221,255 ++ movdqa 64(%rsp),%xmm15 ++.byte 102,68,15,56,221,193 ++ movdqa 80(%rsp),%xmm0 ++.byte 102,69,15,56,221,202 ++ movups 16-128(%rcx),%xmm1 ++ ++ movups %xmm2,(%rsi) ++ movdqa %xmm11,%xmm2 ++ movups %xmm3,16(%rsi) ++ movdqa %xmm12,%xmm3 ++ movups %xmm4,32(%rsi) ++ movdqa %xmm13,%xmm4 ++ movups %xmm5,48(%rsi) ++ movdqa %xmm14,%xmm5 ++ movups %xmm6,64(%rsi) ++ movdqa %xmm15,%xmm6 ++ movups %xmm7,80(%rsi) ++ movdqa %xmm0,%xmm7 ++ movups %xmm8,96(%rsi) ++ movups %xmm9,112(%rsi) ++ leaq 128(%rsi),%rsi ++ ++ subq $8,%rdx ++ jnc L$ctr32_loop8 + +- addq $6,%rdx ++ addq $8,%rdx + jz L$ctr32_done +- movq %r11,%rcx +- leal 1(%rax,%rax,1),%eax ++ leaq -128(%rcx),%rcx + + L$ctr32_tail: +- por %xmm14,%xmm2 +- movups (%rdi),%xmm8 +- cmpq $2,%rdx +- jb L$ctr32_one ++ leaq 16(%rcx),%rcx ++ cmpq $4,%rdx ++ jb L$ctr32_loop3 ++ je L$ctr32_loop4 + +- por %xmm14,%xmm3 +- movups 16(%rdi),%xmm9 +- je L$ctr32_two ++ movdqa 96(%rsp),%xmm8 ++ pxor %xmm9,%xmm9 + +- pshufd $192,%xmm13,%xmm5 +- por %xmm14,%xmm4 +- movups 32(%rdi),%xmm10 +- cmpq $4,%rdx +- jb L$ctr32_three ++ movups 16(%rcx),%xmm0 ++.byte 102,15,56,220,209 ++ leaq 16(%rcx),%rcx ++.byte 102,15,56,220,217 ++ shrl $1,%eax ++.byte 102,15,56,220,225 ++ decl %eax ++.byte 
102,15,56,220,233 ++ movups (%rdi),%xmm10 ++.byte 102,15,56,220,241 ++ movups 16(%rdi),%xmm11 ++.byte 102,15,56,220,249 ++ movups 32(%rdi),%xmm12 ++.byte 102,68,15,56,220,193 ++ movups 16(%rcx),%xmm1 + +- pshufd $128,%xmm13,%xmm6 +- por %xmm14,%xmm5 +- movups 48(%rdi),%xmm11 +- je L$ctr32_four ++ call L$enc_loop8_enter + +- por %xmm14,%xmm6 +- xorps %xmm7,%xmm7 ++ movdqu 48(%rdi),%xmm13 ++ pxor %xmm10,%xmm2 ++ movdqu 64(%rdi),%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm4,32(%rsi) ++ pxor %xmm10,%xmm6 ++ movdqu %xmm5,48(%rsi) ++ movdqu %xmm6,64(%rsi) ++ cmpq $6,%rdx ++ jb L$ctr32_done + +- call _aesni_encrypt6 ++ movups 80(%rdi),%xmm11 ++ xorps %xmm11,%xmm7 ++ movups %xmm7,80(%rsi) ++ je L$ctr32_done + +- movups 64(%rdi),%xmm1 +- xorps %xmm2,%xmm8 +- xorps %xmm3,%xmm9 +- movups %xmm8,(%rsi) +- xorps %xmm4,%xmm10 +- movups %xmm9,16(%rsi) +- xorps %xmm5,%xmm11 +- movups %xmm10,32(%rsi) +- xorps %xmm6,%xmm1 +- movups %xmm11,48(%rsi) +- movups %xmm1,64(%rsi) ++ movups 96(%rdi),%xmm12 ++ xorps %xmm12,%xmm8 ++ movups %xmm8,96(%rsi) ++ jmp L$ctr32_done ++ ++.p2align 5 ++L$ctr32_loop4: ++.byte 102,15,56,220,209 ++ leaq 16(%rcx),%rcx ++.byte 102,15,56,220,217 ++.byte 102,15,56,220,225 ++.byte 102,15,56,220,233 ++ movups (%rcx),%xmm1 ++ decl %eax ++ jnz L$ctr32_loop4 ++.byte 102,15,56,221,209 ++ movups (%rdi),%xmm10 ++.byte 102,15,56,221,217 ++ movups 16(%rdi),%xmm11 ++.byte 102,15,56,221,225 ++ movups 32(%rdi),%xmm12 ++.byte 102,15,56,221,233 ++ movups 48(%rdi),%xmm13 ++ ++ xorps %xmm10,%xmm2 ++ movups %xmm2,(%rsi) ++ xorps %xmm11,%xmm3 ++ movups %xmm3,16(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm4,32(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm5,48(%rsi) ++ jmp L$ctr32_done ++ ++.p2align 5 ++L$ctr32_loop3: ++.byte 102,15,56,220,209 ++ leaq 16(%rcx),%rcx ++.byte 102,15,56,220,217 ++.byte 102,15,56,220,225 ++ movups (%rcx),%xmm1 ++ decl %eax ++ jnz L$ctr32_loop3 ++.byte 102,15,56,221,209 ++.byte 102,15,56,221,217 ++.byte 102,15,56,221,225 ++ ++ movups (%rdi),%xmm10 ++ xorps %xmm10,%xmm2 ++ movups %xmm2,(%rsi) ++ cmpq $2,%rdx ++ jb L$ctr32_done ++ ++ movups 16(%rdi),%xmm11 ++ xorps %xmm11,%xmm3 ++ movups %xmm3,16(%rsi) ++ je L$ctr32_done ++ ++ movups 32(%rdi),%xmm12 ++ xorps %xmm12,%xmm4 ++ movups %xmm4,32(%rsi) + jmp L$ctr32_done + + .p2align 4 + L$ctr32_one_shortcut: + movups (%r8),%xmm2 +- movups (%rdi),%xmm8 ++ movups (%rdi),%xmm10 + movl 240(%rcx),%eax +-L$ctr32_one: + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx +@@ -1131,51 +1344,26 @@ L$oop_enc1_7: + leaq 16(%rcx),%rcx + jnz L$oop_enc1_7 + .byte 102,15,56,221,209 +- xorps %xmm2,%xmm8 +- movups %xmm8,(%rsi) +- jmp L$ctr32_done +- +-.p2align 4 +-L$ctr32_two: +- xorps %xmm4,%xmm4 +- call _aesni_encrypt3 +- xorps %xmm2,%xmm8 +- xorps %xmm3,%xmm9 +- movups %xmm8,(%rsi) +- movups %xmm9,16(%rsi) +- jmp L$ctr32_done +- +-.p2align 4 +-L$ctr32_three: +- call _aesni_encrypt3 +- xorps %xmm2,%xmm8 +- xorps %xmm3,%xmm9 +- movups %xmm8,(%rsi) +- xorps %xmm4,%xmm10 +- movups %xmm9,16(%rsi) +- movups %xmm10,32(%rsi) ++ xorps %xmm10,%xmm2 ++ movups %xmm2,(%rsi) + jmp L$ctr32_done + + .p2align 4 +-L$ctr32_four: +- call _aesni_encrypt4 +- xorps %xmm2,%xmm8 +- xorps %xmm3,%xmm9 +- movups %xmm8,(%rsi) +- xorps %xmm4,%xmm10 +- movups %xmm9,16(%rsi) +- xorps %xmm5,%xmm11 +- movups %xmm10,32(%rsi) +- movups %xmm11,48(%rsi) +- + L$ctr32_done: ++ leaq (%rbp),%rsp ++ popq %rbp ++L$ctr32_epilogue: + .byte 0xf3,0xc3 + + .globl _aesni_xts_encrypt + + .p2align 4 + 
_aesni_xts_encrypt: +- leaq -104(%rsp),%rsp ++ leaq (%rsp),%rax ++ pushq %rbp ++ subq $112,%rsp ++ andq $-16,%rsp ++ leaq -8(%rax),%rbp + movups (%r9),%xmm15 + movl 240(%r8),%eax + movl 240(%rcx),%r10d +@@ -1190,228 +1378,266 @@ L$oop_enc1_8: + leaq 16(%r8),%r8 + jnz L$oop_enc1_8 + .byte 102,68,15,56,221,249 ++ movups (%rcx),%xmm0 + movq %rcx,%r11 + movl %r10d,%eax ++ shll $4,%r10d + movq %rdx,%r9 + andq $-16,%rdx + ++ movups 16(%rcx,%r10,1),%xmm1 ++ movl %eax,%r10d ++ + movdqa L$xts_magic(%rip),%xmm8 +- pxor %xmm14,%xmm14 +- pcmpgtd %xmm15,%xmm14 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pshufd $95,%xmm15,%xmm9 ++ pxor %xmm0,%xmm1 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm10 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm10 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm11 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm11 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm12 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm12 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm13 ++ psrad $31,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm13 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm15,%xmm14 ++ psrad $31,%xmm9 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 ++ pxor %xmm0,%xmm14 + pxor %xmm9,%xmm15 ++ movaps %xmm1,96(%rsp) ++ + subq $96,%rdx + jc L$xts_enc_short + + shrl $1,%eax +- subl $1,%eax ++ subl $3,%eax ++ movups 16(%r11),%xmm1 + movl %eax,%r10d ++ leaq L$xts_magic(%rip),%r8 + jmp L$xts_enc_grandloop + +-.p2align 4 ++.p2align 5 + L$xts_enc_grandloop: +- pshufd $19,%xmm14,%xmm9 +- movdqa %xmm15,%xmm14 +- paddq %xmm15,%xmm15 + movdqu 0(%rdi),%xmm2 +- pand %xmm8,%xmm9 ++ movdqa %xmm0,%xmm8 + movdqu 16(%rdi),%xmm3 +- pxor %xmm9,%xmm15 +- +- movdqu 32(%rdi),%xmm4 + pxor %xmm10,%xmm2 +- movdqu 48(%rdi),%xmm5 ++ movdqu 32(%rdi),%xmm4 + pxor %xmm11,%xmm3 +- movdqu 64(%rdi),%xmm6 ++.byte 102,15,56,220,209 ++ movdqu 48(%rdi),%xmm5 + pxor %xmm12,%xmm4 +- movdqu 80(%rdi),%xmm7 +- leaq 96(%rdi),%rdi ++.byte 102,15,56,220,217 ++ movdqu 64(%rdi),%xmm6 + pxor %xmm13,%xmm5 +- movups (%r11),%xmm0 ++.byte 102,15,56,220,225 ++ movdqu 80(%rdi),%xmm7 ++ pxor %xmm15,%xmm8 ++ movdqa 96(%rsp),%xmm9 + pxor %xmm14,%xmm6 +- pxor %xmm15,%xmm7 +- +- ++.byte 102,15,56,220,233 ++ movups 32(%r11),%xmm0 ++ leaq 96(%rdi),%rdi ++ pxor %xmm8,%xmm7 + +- movups 16(%r11),%xmm1 +- pxor %xmm0,%xmm2 +- pxor %xmm0,%xmm3 ++ pxor %xmm9,%xmm10 ++.byte 102,15,56,220,241 ++ pxor %xmm9,%xmm11 + movdqa %xmm10,0(%rsp) +-.byte 102,15,56,220,209 +- leaq 32(%r11),%rcx +- pxor %xmm0,%xmm4 ++.byte 102,15,56,220,249 ++ movups 48(%r11),%xmm1 ++ ++.byte 102,15,56,220,208 ++ pxor %xmm9,%xmm12 + movdqa %xmm11,16(%rsp) +-.byte 102,15,56,220,217 +- pxor %xmm0,%xmm5 ++.byte 102,15,56,220,216 ++ pxor %xmm9,%xmm13 + movdqa %xmm12,32(%rsp) +-.byte 102,15,56,220,225 +- pxor %xmm0,%xmm6 +- movdqa %xmm13,48(%rsp) +-.byte 102,15,56,220,233 +- pxor %xmm0,%xmm7 +- movups (%rcx),%xmm0 +- decl %eax ++.byte 102,15,56,220,224 ++ pxor %xmm9,%xmm14 
++.byte 102,15,56,220,232 ++ pxor %xmm9,%xmm8 + movdqa %xmm14,64(%rsp) +-.byte 102,15,56,220,241 +- movdqa %xmm15,80(%rsp) +-.byte 102,15,56,220,249 +- pxor %xmm14,%xmm14 +- pcmpgtd %xmm15,%xmm14 +- jmp L$xts_enc_loop6_enter +- +-.p2align 4 ++.byte 102,15,56,220,240 ++ movdqa %xmm8,80(%rsp) ++.byte 102,15,56,220,248 ++ movups 64(%r11),%xmm0 ++ leaq 64(%r11),%rcx ++ pshufd $95,%xmm15,%xmm9 ++ jmp L$xts_enc_loop6 ++.p2align 5 + L$xts_enc_loop6: + .byte 102,15,56,220,209 + .byte 102,15,56,220,217 +- decl %eax + .byte 102,15,56,220,225 + .byte 102,15,56,220,233 + .byte 102,15,56,220,241 + .byte 102,15,56,220,249 +-L$xts_enc_loop6_enter: + movups 16(%rcx),%xmm1 ++ leaq 32(%rcx),%rcx ++ + .byte 102,15,56,220,208 + .byte 102,15,56,220,216 +- leaq 32(%rcx),%rcx + .byte 102,15,56,220,224 + .byte 102,15,56,220,232 + .byte 102,15,56,220,240 + .byte 102,15,56,220,248 + movups (%rcx),%xmm0 ++ decl %eax + jnz L$xts_enc_loop6 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- paddq %xmm15,%xmm15 ++ movdqa (%r8),%xmm8 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,220,209 +- pand %xmm8,%xmm9 ++ paddq %xmm15,%xmm15 ++ psrad $31,%xmm14 + .byte 102,15,56,220,217 +- pcmpgtd %xmm15,%xmm14 ++ pand %xmm8,%xmm14 ++ movups (%r11),%xmm10 + .byte 102,15,56,220,225 +- pxor %xmm9,%xmm15 + .byte 102,15,56,220,233 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,220,241 ++ movaps %xmm10,%xmm11 + .byte 102,15,56,220,249 + movups 16(%rcx),%xmm1 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm10 +- paddq %xmm15,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,220,208 +- pand %xmm8,%xmm9 ++ pxor %xmm15,%xmm10 ++ psrad $31,%xmm14 + .byte 102,15,56,220,216 +- pcmpgtd %xmm15,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 + .byte 102,15,56,220,224 +- pxor %xmm9,%xmm15 + .byte 102,15,56,220,232 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,220,240 ++ movaps %xmm11,%xmm12 + .byte 102,15,56,220,248 + movups 32(%rcx),%xmm0 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm11 +- paddq %xmm15,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,220,209 +- pand %xmm8,%xmm9 ++ pxor %xmm15,%xmm11 ++ psrad $31,%xmm14 + .byte 102,15,56,220,217 +- pcmpgtd %xmm15,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 + .byte 102,15,56,220,225 +- pxor %xmm9,%xmm15 ++ movdqa %xmm13,48(%rsp) + .byte 102,15,56,220,233 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,220,241 ++ movaps %xmm12,%xmm13 + .byte 102,15,56,220,249 ++ movups 48(%rcx),%xmm1 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm12 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 ++.byte 102,15,56,220,208 ++ pxor %xmm15,%xmm12 ++ psrad $31,%xmm14 ++.byte 102,15,56,220,216 + paddq %xmm15,%xmm15 +-.byte 102,15,56,221,208 +- pand %xmm8,%xmm9 +-.byte 102,15,56,221,216 +- pcmpgtd %xmm15,%xmm14 +-.byte 102,15,56,221,224 +- pxor %xmm9,%xmm15 +-.byte 102,15,56,221,232 +-.byte 102,15,56,221,240 +-.byte 102,15,56,221,248 ++ pand %xmm8,%xmm14 ++.byte 102,15,56,220,224 ++.byte 102,15,56,220,232 ++ pxor %xmm14,%xmm15 ++.byte 102,15,56,220,240 ++ movaps %xmm13,%xmm14 ++.byte 102,15,56,220,248 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm13 ++ movdqa %xmm9,%xmm0 ++ paddd %xmm9,%xmm9 ++.byte 102,15,56,220,209 ++ pxor %xmm15,%xmm13 ++ psrad $31,%xmm0 ++.byte 102,15,56,220,217 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm0 ++.byte 102,15,56,220,225 ++.byte 102,15,56,220,233 ++ pxor %xmm0,%xmm15 ++ movups (%r11),%xmm0 ++.byte 102,15,56,220,241 ++.byte 102,15,56,220,249 ++ movups 
16(%r11),%xmm1 ++ ++ pxor %xmm15,%xmm14 ++ psrad $31,%xmm9 ++.byte 102,15,56,221,84,36,0 + paddq %xmm15,%xmm15 +- xorps 0(%rsp),%xmm2 + pand %xmm8,%xmm9 +- xorps 16(%rsp),%xmm3 +- pcmpgtd %xmm15,%xmm14 ++.byte 102,15,56,221,92,36,16 ++.byte 102,15,56,221,100,36,32 + pxor %xmm9,%xmm15 +- +- xorps 32(%rsp),%xmm4 +- movups %xmm2,0(%rsi) +- xorps 48(%rsp),%xmm5 +- movups %xmm3,16(%rsi) +- xorps 64(%rsp),%xmm6 +- movups %xmm4,32(%rsi) +- xorps 80(%rsp),%xmm7 +- movups %xmm5,48(%rsi) ++.byte 102,15,56,221,108,36,48 ++.byte 102,15,56,221,116,36,64 ++.byte 102,15,56,221,124,36,80 + movl %r10d,%eax +- movups %xmm6,64(%rsi) +- movups %xmm7,80(%rsi) ++ + leaq 96(%rsi),%rsi ++ movups %xmm2,-96(%rsi) ++ movups %xmm3,-80(%rsi) ++ movups %xmm4,-64(%rsi) ++ movups %xmm5,-48(%rsi) ++ movups %xmm6,-32(%rsi) ++ movups %xmm7,-16(%rsi) + subq $96,%rdx + jnc L$xts_enc_grandloop + +- leal 3(%rax,%rax,1),%eax ++ leal 7(%rax,%rax,1),%eax + movq %r11,%rcx + movl %eax,%r10d + + L$xts_enc_short: ++ pxor %xmm0,%xmm10 + addq $96,%rdx + jz L$xts_enc_done + ++ pxor %xmm0,%xmm11 + cmpq $32,%rdx + jb L$xts_enc_one ++ pxor %xmm0,%xmm12 + je L$xts_enc_two + ++ pxor %xmm0,%xmm13 + cmpq $64,%rdx + jb L$xts_enc_three ++ pxor %xmm0,%xmm14 + je L$xts_enc_four + +- pshufd $19,%xmm14,%xmm9 +- movdqa %xmm15,%xmm14 +- paddq %xmm15,%xmm15 + movdqu (%rdi),%xmm2 +- pand %xmm8,%xmm9 + movdqu 16(%rdi),%xmm3 +- pxor %xmm9,%xmm15 +- + movdqu 32(%rdi),%xmm4 + pxor %xmm10,%xmm2 + movdqu 48(%rdi),%xmm5 +@@ -1514,15 +1740,15 @@ L$xts_enc_four: + + call _aesni_encrypt4 + +- xorps %xmm10,%xmm2 +- movdqa %xmm15,%xmm10 +- xorps %xmm11,%xmm3 +- xorps %xmm12,%xmm4 +- movups %xmm2,(%rsi) +- xorps %xmm13,%xmm5 +- movups %xmm3,16(%rsi) +- movups %xmm4,32(%rsi) +- movups %xmm5,48(%rsi) ++ pxor %xmm10,%xmm2 ++ movdqa %xmm14,%xmm10 ++ pxor %xmm11,%xmm3 ++ pxor %xmm12,%xmm4 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm3,16(%rsi) ++ movdqu %xmm4,32(%rsi) ++ movdqu %xmm5,48(%rsi) + leaq 64(%rsi),%rsi + jmp L$xts_enc_done + +@@ -1563,7 +1789,8 @@ L$oop_enc1_10: + movups %xmm2,-16(%rsi) + + L$xts_enc_ret: +- leaq 104(%rsp),%rsp ++ leaq (%rbp),%rsp ++ popq %rbp + L$xts_enc_epilogue: + .byte 0xf3,0xc3 + +@@ -1571,7 +1798,11 @@ L$xts_enc_epilogue: + + .p2align 4 + _aesni_xts_decrypt: +- leaq -104(%rsp),%rsp ++ leaq (%rsp),%rax ++ pushq %rbp ++ subq $112,%rsp ++ andq $-16,%rsp ++ leaq -8(%rax),%rbp + movups (%r9),%xmm15 + movl 240(%r8),%eax + movl 240(%rcx),%r10d +@@ -1592,228 +1823,266 @@ L$oop_enc1_11: + shlq $4,%rax + subq %rax,%rdx + ++ movups (%rcx),%xmm0 + movq %rcx,%r11 + movl %r10d,%eax ++ shll $4,%r10d + movq %rdx,%r9 + andq $-16,%rdx + ++ movups 16(%rcx,%r10,1),%xmm1 ++ movl %eax,%r10d ++ + movdqa L$xts_magic(%rip),%xmm8 +- pxor %xmm14,%xmm14 +- pcmpgtd %xmm15,%xmm14 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pshufd $95,%xmm15,%xmm9 ++ pxor %xmm0,%xmm1 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm10 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm10 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm11 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm11 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm12 ++ psrad $31,%xmm14 + paddq %xmm15,%xmm15 +- pand 
%xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 +- pxor %xmm9,%xmm15 +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm12 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + movdqa %xmm15,%xmm13 ++ psrad $31,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 ++ pxor %xmm0,%xmm13 ++ pxor %xmm14,%xmm15 ++ movdqa %xmm15,%xmm14 ++ psrad $31,%xmm9 + paddq %xmm15,%xmm15 + pand %xmm8,%xmm9 +- pcmpgtd %xmm15,%xmm14 ++ pxor %xmm0,%xmm14 + pxor %xmm9,%xmm15 ++ movaps %xmm1,96(%rsp) ++ + subq $96,%rdx + jc L$xts_dec_short + + shrl $1,%eax +- subl $1,%eax ++ subl $3,%eax ++ movups 16(%r11),%xmm1 + movl %eax,%r10d ++ leaq L$xts_magic(%rip),%r8 + jmp L$xts_dec_grandloop + +-.p2align 4 ++.p2align 5 + L$xts_dec_grandloop: +- pshufd $19,%xmm14,%xmm9 +- movdqa %xmm15,%xmm14 +- paddq %xmm15,%xmm15 + movdqu 0(%rdi),%xmm2 +- pand %xmm8,%xmm9 ++ movdqa %xmm0,%xmm8 + movdqu 16(%rdi),%xmm3 +- pxor %xmm9,%xmm15 +- +- movdqu 32(%rdi),%xmm4 + pxor %xmm10,%xmm2 +- movdqu 48(%rdi),%xmm5 ++ movdqu 32(%rdi),%xmm4 + pxor %xmm11,%xmm3 +- movdqu 64(%rdi),%xmm6 ++.byte 102,15,56,222,209 ++ movdqu 48(%rdi),%xmm5 + pxor %xmm12,%xmm4 +- movdqu 80(%rdi),%xmm7 +- leaq 96(%rdi),%rdi ++.byte 102,15,56,222,217 ++ movdqu 64(%rdi),%xmm6 + pxor %xmm13,%xmm5 +- movups (%r11),%xmm0 ++.byte 102,15,56,222,225 ++ movdqu 80(%rdi),%xmm7 ++ pxor %xmm15,%xmm8 ++ movdqa 96(%rsp),%xmm9 + pxor %xmm14,%xmm6 +- pxor %xmm15,%xmm7 +- +- ++.byte 102,15,56,222,233 ++ movups 32(%r11),%xmm0 ++ leaq 96(%rdi),%rdi ++ pxor %xmm8,%xmm7 + +- movups 16(%r11),%xmm1 +- pxor %xmm0,%xmm2 +- pxor %xmm0,%xmm3 ++ pxor %xmm9,%xmm10 ++.byte 102,15,56,222,241 ++ pxor %xmm9,%xmm11 + movdqa %xmm10,0(%rsp) +-.byte 102,15,56,222,209 +- leaq 32(%r11),%rcx +- pxor %xmm0,%xmm4 ++.byte 102,15,56,222,249 ++ movups 48(%r11),%xmm1 ++ ++.byte 102,15,56,222,208 ++ pxor %xmm9,%xmm12 + movdqa %xmm11,16(%rsp) +-.byte 102,15,56,222,217 +- pxor %xmm0,%xmm5 ++.byte 102,15,56,222,216 ++ pxor %xmm9,%xmm13 + movdqa %xmm12,32(%rsp) +-.byte 102,15,56,222,225 +- pxor %xmm0,%xmm6 +- movdqa %xmm13,48(%rsp) +-.byte 102,15,56,222,233 +- pxor %xmm0,%xmm7 +- movups (%rcx),%xmm0 +- decl %eax ++.byte 102,15,56,222,224 ++ pxor %xmm9,%xmm14 ++.byte 102,15,56,222,232 ++ pxor %xmm9,%xmm8 + movdqa %xmm14,64(%rsp) +-.byte 102,15,56,222,241 +- movdqa %xmm15,80(%rsp) +-.byte 102,15,56,222,249 +- pxor %xmm14,%xmm14 +- pcmpgtd %xmm15,%xmm14 +- jmp L$xts_dec_loop6_enter +- +-.p2align 4 ++.byte 102,15,56,222,240 ++ movdqa %xmm8,80(%rsp) ++.byte 102,15,56,222,248 ++ movups 64(%r11),%xmm0 ++ leaq 64(%r11),%rcx ++ pshufd $95,%xmm15,%xmm9 ++ jmp L$xts_dec_loop6 ++.p2align 5 + L$xts_dec_loop6: + .byte 102,15,56,222,209 + .byte 102,15,56,222,217 +- decl %eax + .byte 102,15,56,222,225 + .byte 102,15,56,222,233 + .byte 102,15,56,222,241 + .byte 102,15,56,222,249 +-L$xts_dec_loop6_enter: + movups 16(%rcx),%xmm1 ++ leaq 32(%rcx),%rcx ++ + .byte 102,15,56,222,208 + .byte 102,15,56,222,216 +- leaq 32(%rcx),%rcx + .byte 102,15,56,222,224 + .byte 102,15,56,222,232 + .byte 102,15,56,222,240 + .byte 102,15,56,222,248 + movups (%rcx),%xmm0 ++ decl %eax + jnz L$xts_dec_loop6 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- paddq %xmm15,%xmm15 ++ movdqa (%r8),%xmm8 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,222,209 +- pand %xmm8,%xmm9 ++ paddq %xmm15,%xmm15 ++ psrad $31,%xmm14 + .byte 102,15,56,222,217 +- pcmpgtd %xmm15,%xmm14 ++ pand %xmm8,%xmm14 ++ movups (%r11),%xmm10 + .byte 102,15,56,222,225 +- pxor %xmm9,%xmm15 + .byte 102,15,56,222,233 ++ pxor %xmm14,%xmm15 + 
.byte 102,15,56,222,241 ++ movaps %xmm10,%xmm11 + .byte 102,15,56,222,249 + movups 16(%rcx),%xmm1 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm10 +- paddq %xmm15,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,222,208 +- pand %xmm8,%xmm9 ++ pxor %xmm15,%xmm10 ++ psrad $31,%xmm14 + .byte 102,15,56,222,216 +- pcmpgtd %xmm15,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 + .byte 102,15,56,222,224 +- pxor %xmm9,%xmm15 + .byte 102,15,56,222,232 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,222,240 ++ movaps %xmm11,%xmm12 + .byte 102,15,56,222,248 + movups 32(%rcx),%xmm0 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm11 +- paddq %xmm15,%xmm15 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 + .byte 102,15,56,222,209 +- pand %xmm8,%xmm9 ++ pxor %xmm15,%xmm11 ++ psrad $31,%xmm14 + .byte 102,15,56,222,217 +- pcmpgtd %xmm15,%xmm14 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm14 + .byte 102,15,56,222,225 +- pxor %xmm9,%xmm15 ++ movdqa %xmm13,48(%rsp) + .byte 102,15,56,222,233 ++ pxor %xmm14,%xmm15 + .byte 102,15,56,222,241 ++ movaps %xmm12,%xmm13 + .byte 102,15,56,222,249 ++ movups 48(%rcx),%xmm1 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm12 ++ movdqa %xmm9,%xmm14 ++ paddd %xmm9,%xmm9 ++.byte 102,15,56,222,208 ++ pxor %xmm15,%xmm12 ++ psrad $31,%xmm14 ++.byte 102,15,56,222,216 + paddq %xmm15,%xmm15 +-.byte 102,15,56,223,208 +- pand %xmm8,%xmm9 +-.byte 102,15,56,223,216 +- pcmpgtd %xmm15,%xmm14 +-.byte 102,15,56,223,224 +- pxor %xmm9,%xmm15 +-.byte 102,15,56,223,232 +-.byte 102,15,56,223,240 +-.byte 102,15,56,223,248 ++ pand %xmm8,%xmm14 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++ pxor %xmm14,%xmm15 ++.byte 102,15,56,222,240 ++ movaps %xmm13,%xmm14 ++.byte 102,15,56,222,248 + +- pshufd $19,%xmm14,%xmm9 +- pxor %xmm14,%xmm14 +- movdqa %xmm15,%xmm13 ++ movdqa %xmm9,%xmm0 ++ paddd %xmm9,%xmm9 ++.byte 102,15,56,222,209 ++ pxor %xmm15,%xmm13 ++ psrad $31,%xmm0 ++.byte 102,15,56,222,217 ++ paddq %xmm15,%xmm15 ++ pand %xmm8,%xmm0 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++ pxor %xmm0,%xmm15 ++ movups (%r11),%xmm0 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++ movups 16(%r11),%xmm1 ++ ++ pxor %xmm15,%xmm14 ++ psrad $31,%xmm9 ++.byte 102,15,56,223,84,36,0 + paddq %xmm15,%xmm15 +- xorps 0(%rsp),%xmm2 + pand %xmm8,%xmm9 +- xorps 16(%rsp),%xmm3 +- pcmpgtd %xmm15,%xmm14 ++.byte 102,15,56,223,92,36,16 ++.byte 102,15,56,223,100,36,32 + pxor %xmm9,%xmm15 +- +- xorps 32(%rsp),%xmm4 +- movups %xmm2,0(%rsi) +- xorps 48(%rsp),%xmm5 +- movups %xmm3,16(%rsi) +- xorps 64(%rsp),%xmm6 +- movups %xmm4,32(%rsi) +- xorps 80(%rsp),%xmm7 +- movups %xmm5,48(%rsi) ++.byte 102,15,56,223,108,36,48 ++.byte 102,15,56,223,116,36,64 ++.byte 102,15,56,223,124,36,80 + movl %r10d,%eax +- movups %xmm6,64(%rsi) +- movups %xmm7,80(%rsi) ++ + leaq 96(%rsi),%rsi ++ movups %xmm2,-96(%rsi) ++ movups %xmm3,-80(%rsi) ++ movups %xmm4,-64(%rsi) ++ movups %xmm5,-48(%rsi) ++ movups %xmm6,-32(%rsi) ++ movups %xmm7,-16(%rsi) + subq $96,%rdx + jnc L$xts_dec_grandloop + +- leal 3(%rax,%rax,1),%eax ++ leal 7(%rax,%rax,1),%eax + movq %r11,%rcx + movl %eax,%r10d + + L$xts_dec_short: ++ pxor %xmm0,%xmm10 ++ pxor %xmm0,%xmm11 + addq $96,%rdx + jz L$xts_dec_done + ++ pxor %xmm0,%xmm12 + cmpq $32,%rdx + jb L$xts_dec_one ++ pxor %xmm0,%xmm13 + je L$xts_dec_two + ++ pxor %xmm0,%xmm14 + cmpq $64,%rdx + jb L$xts_dec_three + je L$xts_dec_four + +- pshufd $19,%xmm14,%xmm9 +- movdqa %xmm15,%xmm14 +- paddq %xmm15,%xmm15 + movdqu (%rdi),%xmm2 +- pand 
%xmm8,%xmm9 + movdqu 16(%rdi),%xmm3 +- pxor %xmm9,%xmm15 +- + movdqu 32(%rdi),%xmm4 + pxor %xmm10,%xmm2 + movdqu 48(%rdi),%xmm5 +@@ -1906,7 +2175,7 @@ L$xts_dec_three: + xorps %xmm10,%xmm2 + movdqa %xmm13,%xmm10 + xorps %xmm11,%xmm3 +- movdqa %xmm15,%xmm11 ++ movdqa %xmm14,%xmm11 + xorps %xmm12,%xmm4 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) +@@ -1916,14 +2185,8 @@ L$xts_dec_three: + + .p2align 4 + L$xts_dec_four: +- pshufd $19,%xmm14,%xmm9 +- movdqa %xmm15,%xmm14 +- paddq %xmm15,%xmm15 + movups (%rdi),%xmm2 +- pand %xmm8,%xmm9 + movups 16(%rdi),%xmm3 +- pxor %xmm9,%xmm15 +- + movups 32(%rdi),%xmm4 + xorps %xmm10,%xmm2 + movups 48(%rdi),%xmm5 +@@ -1934,16 +2197,16 @@ L$xts_dec_four: + + call _aesni_decrypt4 + +- xorps %xmm10,%xmm2 ++ pxor %xmm10,%xmm2 + movdqa %xmm14,%xmm10 +- xorps %xmm11,%xmm3 ++ pxor %xmm11,%xmm3 + movdqa %xmm15,%xmm11 +- xorps %xmm12,%xmm4 +- movups %xmm2,(%rsi) +- xorps %xmm13,%xmm5 +- movups %xmm3,16(%rsi) +- movups %xmm4,32(%rsi) +- movups %xmm5,48(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm3,16(%rsi) ++ movdqu %xmm4,32(%rsi) ++ movdqu %xmm5,48(%rsi) + leaq 64(%rsi),%rsi + jmp L$xts_dec_done + +@@ -2003,7 +2266,8 @@ L$oop_dec1_14: + movups %xmm2,(%rsi) + + L$xts_dec_ret: +- leaq 104(%rsp),%rsp ++ leaq (%rbp),%rsp ++ popq %rbp + L$xts_dec_epilogue: + .byte 0xf3,0xc3 + +@@ -2070,149 +2334,324 @@ L$cbc_enc_tail: + + .p2align 4 + L$cbc_decrypt: +- movups (%r8),%xmm9 ++ leaq (%rsp),%rax ++ pushq %rbp ++ subq $16,%rsp ++ andq $-16,%rsp ++ leaq -8(%rax),%rbp ++ movups (%r8),%xmm10 + movl %r10d,%eax +- cmpq $112,%rdx ++ cmpq $80,%rdx + jbe L$cbc_dec_tail +- shrl $1,%r10d ++ ++ movups (%rcx),%xmm0 ++ movdqu 0(%rdi),%xmm2 ++ movdqu 16(%rdi),%xmm3 ++ movdqa %xmm2,%xmm11 ++ movdqu 32(%rdi),%xmm4 ++ movdqa %xmm3,%xmm12 ++ movdqu 48(%rdi),%xmm5 ++ movdqa %xmm4,%xmm13 ++ movdqu 64(%rdi),%xmm6 ++ movdqa %xmm5,%xmm14 ++ movdqu 80(%rdi),%xmm7 ++ movdqa %xmm6,%xmm15 ++ cmpq $112,%rdx ++ jbe L$cbc_dec_six_or_seven ++ + subq $112,%rdx +- movl %r10d,%eax +- movaps %xmm9,-24(%rsp) ++ leaq 112(%rcx),%rcx + jmp L$cbc_dec_loop8_enter + .p2align 4 + L$cbc_dec_loop8: +- movaps %xmm0,-24(%rsp) + movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi + L$cbc_dec_loop8_enter: +- movups (%rcx),%xmm0 +- movups (%rdi),%xmm2 +- movups 16(%rdi),%xmm3 +- movups 16(%rcx),%xmm1 ++ movdqu 96(%rdi),%xmm8 ++ pxor %xmm0,%xmm2 ++ movdqu 112(%rdi),%xmm9 ++ pxor %xmm0,%xmm3 ++ movups 16-112(%rcx),%xmm1 ++ pxor %xmm0,%xmm4 ++ xorq %r11,%r11 ++ cmpq $112,%rdx ++ pxor %xmm0,%xmm5 ++ pxor %xmm0,%xmm6 ++ pxor %xmm0,%xmm7 ++ pxor %xmm0,%xmm8 + +- leaq 32(%rcx),%rcx +- movdqu 32(%rdi),%xmm4 +- xorps %xmm0,%xmm2 +- movdqu 48(%rdi),%xmm5 +- xorps %xmm0,%xmm3 +- movdqu 64(%rdi),%xmm6 + .byte 102,15,56,222,209 +- pxor %xmm0,%xmm4 +- movdqu 80(%rdi),%xmm7 ++ pxor %xmm0,%xmm9 ++ movups 32-112(%rcx),%xmm0 + .byte 102,15,56,222,217 +- pxor %xmm0,%xmm5 +- movdqu 96(%rdi),%xmm8 + .byte 102,15,56,222,225 +- pxor %xmm0,%xmm6 +- movdqu 112(%rdi),%xmm9 + .byte 102,15,56,222,233 +- pxor %xmm0,%xmm7 +- decl %eax + .byte 102,15,56,222,241 +- pxor %xmm0,%xmm8 + .byte 102,15,56,222,249 +- pxor %xmm0,%xmm9 +- movups (%rcx),%xmm0 ++ setnc %r11b + .byte 102,68,15,56,222,193 ++ shlq $7,%r11 + .byte 102,68,15,56,222,201 +- movups 16(%rcx),%xmm1 +- +- call L$dec_loop8_enter ++ addq %rdi,%r11 ++ movups 48-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 
102,68,15,56,222,200 ++ movups 64-112(%rcx),%xmm0 ++.byte 102,15,56,222,209 ++.byte 102,15,56,222,217 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movups 80-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 96-112(%rcx),%xmm0 ++.byte 102,15,56,222,209 ++.byte 102,15,56,222,217 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movups 112-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 128-112(%rcx),%xmm0 ++.byte 102,15,56,222,209 ++.byte 102,15,56,222,217 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movups 144-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 160-112(%rcx),%xmm0 ++ cmpl $11,%eax ++ jb L$cbc_dec_done ++.byte 102,15,56,222,209 ++.byte 102,15,56,222,217 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movups 176-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 192-112(%rcx),%xmm0 ++ je L$cbc_dec_done ++.byte 102,15,56,222,209 ++.byte 102,15,56,222,217 ++.byte 102,15,56,222,225 ++.byte 102,15,56,222,233 ++.byte 102,15,56,222,241 ++.byte 102,15,56,222,249 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movups 208-112(%rcx),%xmm1 ++.byte 102,15,56,222,208 ++.byte 102,15,56,222,216 ++.byte 102,15,56,222,224 ++.byte 102,15,56,222,232 ++.byte 102,15,56,222,240 ++.byte 102,15,56,222,248 ++.byte 102,68,15,56,222,192 ++.byte 102,68,15,56,222,200 ++ movups 224-112(%rcx),%xmm0 ++L$cbc_dec_done: ++.byte 102,15,56,222,209 ++ pxor %xmm0,%xmm10 ++.byte 102,15,56,222,217 ++ pxor %xmm0,%xmm11 ++.byte 102,15,56,222,225 ++ pxor %xmm0,%xmm12 ++.byte 102,15,56,222,233 ++ pxor %xmm0,%xmm13 ++.byte 102,15,56,222,241 ++ pxor %xmm0,%xmm14 ++.byte 102,15,56,222,249 ++ pxor %xmm0,%xmm15 ++.byte 102,68,15,56,222,193 ++.byte 102,68,15,56,222,201 ++ movdqu 80(%rdi),%xmm1 ++ ++.byte 102,65,15,56,223,210 ++ movdqu 96(%rdi),%xmm10 ++ pxor %xmm0,%xmm1 ++.byte 102,65,15,56,223,219 ++ pxor %xmm0,%xmm10 ++ movdqu 112(%rdi),%xmm0 ++ leaq 128(%rdi),%rdi ++.byte 102,65,15,56,223,228 ++ movdqu 0(%r11),%xmm11 ++.byte 102,65,15,56,223,237 ++ movdqu 16(%r11),%xmm12 ++.byte 102,65,15,56,223,246 ++ movdqu 32(%r11),%xmm13 ++.byte 102,65,15,56,223,255 ++ movdqu 48(%r11),%xmm14 ++.byte 102,68,15,56,223,193 ++ movdqu 64(%r11),%xmm15 ++.byte 102,69,15,56,223,202 ++ movdqa %xmm0,%xmm10 ++ movdqu 80(%r11),%xmm1 ++ movups -112(%rcx),%xmm0 + +- movups (%rdi),%xmm1 +- movups 16(%rdi),%xmm0 +- xorps 
-24(%rsp),%xmm2 +- xorps %xmm1,%xmm3 +- movups 32(%rdi),%xmm1 +- xorps %xmm0,%xmm4 +- movups 48(%rdi),%xmm0 +- xorps %xmm1,%xmm5 +- movups 64(%rdi),%xmm1 +- xorps %xmm0,%xmm6 +- movups 80(%rdi),%xmm0 +- xorps %xmm1,%xmm7 +- movups 96(%rdi),%xmm1 +- xorps %xmm0,%xmm8 +- movups 112(%rdi),%xmm0 +- xorps %xmm1,%xmm9 + movups %xmm2,(%rsi) ++ movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) ++ movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) ++ movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) +- movl %r10d,%eax ++ movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) +- movq %r11,%rcx ++ movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) +- leaq 128(%rdi),%rdi ++ movdqa %xmm1,%xmm7 + movups %xmm8,96(%rsi) + leaq 112(%rsi),%rsi ++ + subq $128,%rdx + ja L$cbc_dec_loop8 + + movaps %xmm9,%xmm2 +- movaps %xmm0,%xmm9 ++ leaq -112(%rcx),%rcx + addq $112,%rdx + jle L$cbc_dec_tail_collected +- movups %xmm2,(%rsi) +- leal 1(%r10,%r10,1),%eax ++ movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi ++ cmpq $80,%rdx ++ jbe L$cbc_dec_tail ++ ++ movaps %xmm11,%xmm2 ++L$cbc_dec_six_or_seven: ++ cmpq $96,%rdx ++ ja L$cbc_dec_seven ++ ++ movaps %xmm7,%xmm8 ++ call _aesni_decrypt6 ++ pxor %xmm10,%xmm2 ++ movaps %xmm8,%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm4,32(%rsi) ++ pxor %xmm14,%xmm6 ++ movdqu %xmm5,48(%rsi) ++ pxor %xmm15,%xmm7 ++ movdqu %xmm6,64(%rsi) ++ leaq 80(%rsi),%rsi ++ movdqa %xmm7,%xmm2 ++ jmp L$cbc_dec_tail_collected ++ ++.p2align 4 ++L$cbc_dec_seven: ++ movups 96(%rdi),%xmm8 ++ xorps %xmm9,%xmm9 ++ call _aesni_decrypt8 ++ movups 80(%rdi),%xmm9 ++ pxor %xmm10,%xmm2 ++ movups 96(%rdi),%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm4,32(%rsi) ++ pxor %xmm14,%xmm6 ++ movdqu %xmm5,48(%rsi) ++ pxor %xmm15,%xmm7 ++ movdqu %xmm6,64(%rsi) ++ pxor %xmm9,%xmm8 ++ movdqu %xmm7,80(%rsi) ++ leaq 96(%rsi),%rsi ++ movdqa %xmm8,%xmm2 ++ jmp L$cbc_dec_tail_collected ++ + L$cbc_dec_tail: + movups (%rdi),%xmm2 +- movaps %xmm2,%xmm8 +- cmpq $16,%rdx ++ subq $16,%rdx + jbe L$cbc_dec_one + + movups 16(%rdi),%xmm3 +- movaps %xmm3,%xmm7 +- cmpq $32,%rdx ++ movaps %xmm2,%xmm11 ++ subq $16,%rdx + jbe L$cbc_dec_two + + movups 32(%rdi),%xmm4 +- movaps %xmm4,%xmm6 +- cmpq $48,%rdx ++ movaps %xmm3,%xmm12 ++ subq $16,%rdx + jbe L$cbc_dec_three + + movups 48(%rdi),%xmm5 +- cmpq $64,%rdx ++ movaps %xmm4,%xmm13 ++ subq $16,%rdx + jbe L$cbc_dec_four + + movups 64(%rdi),%xmm6 +- cmpq $80,%rdx +- jbe L$cbc_dec_five +- +- movups 80(%rdi),%xmm7 +- cmpq $96,%rdx +- jbe L$cbc_dec_six +- +- movups 96(%rdi),%xmm8 +- movaps %xmm9,-24(%rsp) +- call _aesni_decrypt8 +- movups (%rdi),%xmm1 +- movups 16(%rdi),%xmm0 +- xorps -24(%rsp),%xmm2 +- xorps %xmm1,%xmm3 +- movups 32(%rdi),%xmm1 +- xorps %xmm0,%xmm4 +- movups 48(%rdi),%xmm0 +- xorps %xmm1,%xmm5 +- movups 64(%rdi),%xmm1 +- xorps %xmm0,%xmm6 +- movups 80(%rdi),%xmm0 +- xorps %xmm1,%xmm7 +- movups 96(%rdi),%xmm9 +- xorps %xmm0,%xmm8 +- movups %xmm2,(%rsi) +- movups %xmm3,16(%rsi) +- movups %xmm4,32(%rsi) +- movups %xmm5,48(%rsi) +- movups %xmm6,64(%rsi) +- movups %xmm7,80(%rsi) +- leaq 96(%rsi),%rsi +- movaps %xmm8,%xmm2 +- subq $112,%rdx ++ movaps %xmm5,%xmm14 ++ movaps %xmm6,%xmm15 ++ xorps %xmm7,%xmm7 ++ call _aesni_decrypt6 ++ pxor %xmm10,%xmm2 ++ movaps %xmm15,%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm4,32(%rsi) ++ pxor %xmm14,%xmm6 ++ movdqu %xmm5,48(%rsi) ++ leaq 
64(%rsi),%rsi ++ movdqa %xmm6,%xmm2 ++ subq $16,%rdx + jmp L$cbc_dec_tail_collected ++ + .p2align 4 + L$cbc_dec_one: ++ movaps %xmm2,%xmm11 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx +@@ -2224,111 +2663,69 @@ L$oop_dec1_16: + leaq 16(%rcx),%rcx + jnz L$oop_dec1_16 + .byte 102,15,56,223,209 +- xorps %xmm9,%xmm2 +- movaps %xmm8,%xmm9 +- subq $16,%rdx ++ xorps %xmm10,%xmm2 ++ movaps %xmm11,%xmm10 + jmp L$cbc_dec_tail_collected + .p2align 4 + L$cbc_dec_two: ++ movaps %xmm3,%xmm12 + xorps %xmm4,%xmm4 + call _aesni_decrypt3 +- xorps %xmm9,%xmm2 +- xorps %xmm8,%xmm3 +- movups %xmm2,(%rsi) +- movaps %xmm7,%xmm9 +- movaps %xmm3,%xmm2 ++ pxor %xmm10,%xmm2 ++ movaps %xmm12,%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ movdqa %xmm3,%xmm2 + leaq 16(%rsi),%rsi +- subq $32,%rdx + jmp L$cbc_dec_tail_collected + .p2align 4 + L$cbc_dec_three: ++ movaps %xmm4,%xmm13 + call _aesni_decrypt3 +- xorps %xmm9,%xmm2 +- xorps %xmm8,%xmm3 +- movups %xmm2,(%rsi) +- xorps %xmm7,%xmm4 +- movups %xmm3,16(%rsi) +- movaps %xmm6,%xmm9 +- movaps %xmm4,%xmm2 ++ pxor %xmm10,%xmm2 ++ movaps %xmm13,%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ movdqa %xmm4,%xmm2 + leaq 32(%rsi),%rsi +- subq $48,%rdx + jmp L$cbc_dec_tail_collected + .p2align 4 + L$cbc_dec_four: ++ movaps %xmm5,%xmm14 + call _aesni_decrypt4 +- xorps %xmm9,%xmm2 +- movups 48(%rdi),%xmm9 +- xorps %xmm8,%xmm3 +- movups %xmm2,(%rsi) +- xorps %xmm7,%xmm4 +- movups %xmm3,16(%rsi) +- xorps %xmm6,%xmm5 +- movups %xmm4,32(%rsi) +- movaps %xmm5,%xmm2 ++ pxor %xmm10,%xmm2 ++ movaps %xmm14,%xmm10 ++ pxor %xmm11,%xmm3 ++ movdqu %xmm2,(%rsi) ++ pxor %xmm12,%xmm4 ++ movdqu %xmm3,16(%rsi) ++ pxor %xmm13,%xmm5 ++ movdqu %xmm4,32(%rsi) ++ movdqa %xmm5,%xmm2 + leaq 48(%rsi),%rsi +- subq $64,%rdx +- jmp L$cbc_dec_tail_collected +-.p2align 4 +-L$cbc_dec_five: +- xorps %xmm7,%xmm7 +- call _aesni_decrypt6 +- movups 16(%rdi),%xmm1 +- movups 32(%rdi),%xmm0 +- xorps %xmm9,%xmm2 +- xorps %xmm8,%xmm3 +- xorps %xmm1,%xmm4 +- movups 48(%rdi),%xmm1 +- xorps %xmm0,%xmm5 +- movups 64(%rdi),%xmm9 +- xorps %xmm1,%xmm6 +- movups %xmm2,(%rsi) +- movups %xmm3,16(%rsi) +- movups %xmm4,32(%rsi) +- movups %xmm5,48(%rsi) +- leaq 64(%rsi),%rsi +- movaps %xmm6,%xmm2 +- subq $80,%rdx +- jmp L$cbc_dec_tail_collected +-.p2align 4 +-L$cbc_dec_six: +- call _aesni_decrypt6 +- movups 16(%rdi),%xmm1 +- movups 32(%rdi),%xmm0 +- xorps %xmm9,%xmm2 +- xorps %xmm8,%xmm3 +- xorps %xmm1,%xmm4 +- movups 48(%rdi),%xmm1 +- xorps %xmm0,%xmm5 +- movups 64(%rdi),%xmm0 +- xorps %xmm1,%xmm6 +- movups 80(%rdi),%xmm9 +- xorps %xmm0,%xmm7 +- movups %xmm2,(%rsi) +- movups %xmm3,16(%rsi) +- movups %xmm4,32(%rsi) +- movups %xmm5,48(%rsi) +- movups %xmm6,64(%rsi) +- leaq 80(%rsi),%rsi +- movaps %xmm7,%xmm2 +- subq $96,%rdx + jmp L$cbc_dec_tail_collected ++ + .p2align 4 + L$cbc_dec_tail_collected: ++ movups %xmm10,(%r8) + andq $15,%rdx +- movups %xmm9,(%r8) + jnz L$cbc_dec_tail_partial + movups %xmm2,(%rsi) + jmp L$cbc_dec_ret + .p2align 4 + L$cbc_dec_tail_partial: +- movaps %xmm2,-24(%rsp) ++ movaps %xmm2,(%rsp) + movq $16,%rcx + movq %rsi,%rdi + subq %rdx,%rcx +- leaq -24(%rsp),%rsi ++ leaq (%rsp),%rsi + .long 0x9066A4F3 + + L$cbc_dec_ret: ++ leaq (%rbp),%rsp ++ popq %rbp + L$cbc_ret: + .byte 0xf3,0xc3 + +@@ -2571,6 +2968,8 @@ L$increment64: + .long 1,0,0,0 + L$xts_magic: + .long 0x87,0,1,0 ++L$increment1: ++.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 + + .byte 
65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + .p2align 6 +diff --git a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s +index b9ec30c..1327e82 100644 +--- a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s ++++ b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s +@@ -597,6 +597,468 @@ L$cbc_abort: + popq %rbp + .byte 0xf3,0xc3 + ++.globl _padlock_cfb_encrypt ++ ++.p2align 4 ++_padlock_cfb_encrypt: ++ pushq %rbp ++ pushq %rbx ++ ++ xorl %eax,%eax ++ testq $15,%rdx ++ jnz L$cfb_abort ++ testq $15,%rcx ++ jnz L$cfb_abort ++ leaq L$padlock_saved_context(%rip),%rax ++ pushf ++ cld ++ call _padlock_verify_ctx ++ leaq 16(%rdx),%rdx ++ xorl %eax,%eax ++ xorl %ebx,%ebx ++ testl $32,(%rdx) ++ jnz L$cfb_aligned ++ testq $15,%rdi ++ setz %al ++ testq $15,%rsi ++ setz %bl ++ testl %ebx,%eax ++ jnz L$cfb_aligned ++ negq %rax ++ movq $512,%rbx ++ notq %rax ++ leaq (%rsp),%rbp ++ cmpq %rbx,%rcx ++ cmovcq %rcx,%rbx ++ andq %rbx,%rax ++ movq %rcx,%rbx ++ negq %rax ++ andq $512-1,%rbx ++ leaq (%rax,%rbp,1),%rsp ++ movq $512,%rax ++ cmovzq %rax,%rbx ++ jmp L$cfb_loop ++.p2align 4 ++L$cfb_loop: ++ cmpq %rcx,%rbx ++ cmovaq %rcx,%rbx ++ movq %rdi,%r8 ++ movq %rsi,%r9 ++ movq %rcx,%r10 ++ movq %rbx,%rcx ++ movq %rbx,%r11 ++ testq $15,%rdi ++ cmovnzq %rsp,%rdi ++ testq $15,%rsi ++ jz L$cfb_inp_aligned ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++ movq %rbx,%rcx ++ movq %rdi,%rsi ++L$cfb_inp_aligned: ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,224 ++ movdqa (%rax),%xmm0 ++ movdqa %xmm0,-16(%rdx) ++ movq %r8,%rdi ++ movq %r11,%rbx ++ testq $15,%rdi ++ jz L$cfb_out_aligned ++ movq %rbx,%rcx ++ leaq (%rsp),%rsi ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++L$cfb_out_aligned: ++ movq %r9,%rsi ++ movq %r10,%rcx ++ addq %rbx,%rdi ++ addq %rbx,%rsi ++ subq %rbx,%rcx ++ movq $512,%rbx ++ jnz L$cfb_loop ++ cmpq %rbp,%rsp ++ je L$cfb_done ++ ++ pxor %xmm0,%xmm0 ++ leaq (%rsp),%rax ++L$cfb_bzero: ++ movaps %xmm0,(%rax) ++ leaq 16(%rax),%rax ++ cmpq %rax,%rbp ++ ja L$cfb_bzero ++ ++L$cfb_done: ++ leaq (%rbp),%rsp ++ jmp L$cfb_exit ++ ++.p2align 4 ++L$cfb_aligned: ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,224 ++ movdqa (%rax),%xmm0 ++ movdqa %xmm0,-16(%rdx) ++L$cfb_exit: ++ movl $1,%eax ++ leaq 8(%rsp),%rsp ++L$cfb_abort: ++ popq %rbx ++ popq %rbp ++ .byte 0xf3,0xc3 ++ ++.globl _padlock_ofb_encrypt ++ ++.p2align 4 ++_padlock_ofb_encrypt: ++ pushq %rbp ++ pushq %rbx ++ ++ xorl %eax,%eax ++ testq $15,%rdx ++ jnz L$ofb_abort ++ testq $15,%rcx ++ jnz L$ofb_abort ++ leaq L$padlock_saved_context(%rip),%rax ++ pushf ++ cld ++ call _padlock_verify_ctx ++ leaq 16(%rdx),%rdx ++ xorl %eax,%eax ++ xorl %ebx,%ebx ++ testl $32,(%rdx) ++ jnz L$ofb_aligned ++ testq $15,%rdi ++ setz %al ++ testq $15,%rsi ++ setz %bl ++ testl %ebx,%eax ++ jnz L$ofb_aligned ++ negq %rax ++ movq $512,%rbx ++ notq %rax ++ leaq (%rsp),%rbp ++ cmpq %rbx,%rcx ++ cmovcq %rcx,%rbx ++ andq %rbx,%rax ++ movq %rcx,%rbx ++ negq %rax ++ andq $512-1,%rbx ++ leaq (%rax,%rbp,1),%rsp ++ movq $512,%rax ++ cmovzq %rax,%rbx ++ jmp L$ofb_loop ++.p2align 4 ++L$ofb_loop: ++ cmpq %rcx,%rbx ++ cmovaq %rcx,%rbx ++ movq %rdi,%r8 ++ movq %rsi,%r9 ++ movq %rcx,%r10 ++ movq %rbx,%rcx ++ movq %rbx,%r11 ++ testq $15,%rdi ++ cmovnzq %rsp,%rdi ++ testq $15,%rsi ++ jz L$ofb_inp_aligned ++ 
shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++ movq %rbx,%rcx ++ movq %rdi,%rsi ++L$ofb_inp_aligned: ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,232 ++ movdqa (%rax),%xmm0 ++ movdqa %xmm0,-16(%rdx) ++ movq %r8,%rdi ++ movq %r11,%rbx ++ testq $15,%rdi ++ jz L$ofb_out_aligned ++ movq %rbx,%rcx ++ leaq (%rsp),%rsi ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++L$ofb_out_aligned: ++ movq %r9,%rsi ++ movq %r10,%rcx ++ addq %rbx,%rdi ++ addq %rbx,%rsi ++ subq %rbx,%rcx ++ movq $512,%rbx ++ jnz L$ofb_loop ++ cmpq %rbp,%rsp ++ je L$ofb_done ++ ++ pxor %xmm0,%xmm0 ++ leaq (%rsp),%rax ++L$ofb_bzero: ++ movaps %xmm0,(%rax) ++ leaq 16(%rax),%rax ++ cmpq %rax,%rbp ++ ja L$ofb_bzero ++ ++L$ofb_done: ++ leaq (%rbp),%rsp ++ jmp L$ofb_exit ++ ++.p2align 4 ++L$ofb_aligned: ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,232 ++ movdqa (%rax),%xmm0 ++ movdqa %xmm0,-16(%rdx) ++L$ofb_exit: ++ movl $1,%eax ++ leaq 8(%rsp),%rsp ++L$ofb_abort: ++ popq %rbx ++ popq %rbp ++ .byte 0xf3,0xc3 ++ ++.globl _padlock_ctr32_encrypt ++ ++.p2align 4 ++_padlock_ctr32_encrypt: ++ pushq %rbp ++ pushq %rbx ++ ++ xorl %eax,%eax ++ testq $15,%rdx ++ jnz L$ctr32_abort ++ testq $15,%rcx ++ jnz L$ctr32_abort ++ leaq L$padlock_saved_context(%rip),%rax ++ pushf ++ cld ++ call _padlock_verify_ctx ++ leaq 16(%rdx),%rdx ++ xorl %eax,%eax ++ xorl %ebx,%ebx ++ testl $32,(%rdx) ++ jnz L$ctr32_aligned ++ testq $15,%rdi ++ setz %al ++ testq $15,%rsi ++ setz %bl ++ testl %ebx,%eax ++ jnz L$ctr32_aligned ++ negq %rax ++ movq $512,%rbx ++ notq %rax ++ leaq (%rsp),%rbp ++ cmpq %rbx,%rcx ++ cmovcq %rcx,%rbx ++ andq %rbx,%rax ++ movq %rcx,%rbx ++ negq %rax ++ andq $512-1,%rbx ++ leaq (%rax,%rbp,1),%rsp ++ movq $512,%rax ++ cmovzq %rax,%rbx ++L$ctr32_reenter: ++ movl -4(%rdx),%eax ++ bswapl %eax ++ negl %eax ++ andl $31,%eax ++ movq $512,%rbx ++ shll $4,%eax ++ cmovzq %rbx,%rax ++ cmpq %rax,%rcx ++ cmovaq %rax,%rbx ++ cmovbeq %rcx,%rbx ++ cmpq %rbx,%rcx ++ ja L$ctr32_loop ++ movq %rsi,%rax ++ cmpq %rsp,%rbp ++ cmoveq %rdi,%rax ++ addq %rcx,%rax ++ negq %rax ++ andq $4095,%rax ++ cmpq $32,%rax ++ movq $-32,%rax ++ cmovaeq %rbx,%rax ++ andq %rax,%rbx ++ jz L$ctr32_unaligned_tail ++ jmp L$ctr32_loop ++.p2align 4 ++L$ctr32_loop: ++ cmpq %rcx,%rbx ++ cmovaq %rcx,%rbx ++ movq %rdi,%r8 ++ movq %rsi,%r9 ++ movq %rcx,%r10 ++ movq %rbx,%rcx ++ movq %rbx,%r11 ++ testq $15,%rdi ++ cmovnzq %rsp,%rdi ++ testq $15,%rsi ++ jz L$ctr32_inp_aligned ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++ movq %rbx,%rcx ++ movq %rdi,%rsi ++L$ctr32_inp_aligned: ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,216 ++ movl -4(%rdx),%eax ++ testl $4294901760,%eax ++ jnz L$ctr32_no_carry ++ bswapl %eax ++ addl $65536,%eax ++ bswapl %eax ++ movl %eax,-4(%rdx) ++L$ctr32_no_carry: ++ movq %r8,%rdi ++ movq %r11,%rbx ++ testq $15,%rdi ++ jz L$ctr32_out_aligned ++ movq %rbx,%rcx ++ leaq (%rsp),%rsi ++ shrq $3,%rcx ++.byte 0xf3,0x48,0xa5 ++ subq %rbx,%rdi ++L$ctr32_out_aligned: ++ movq %r9,%rsi ++ movq %r10,%rcx ++ addq %rbx,%rdi ++ addq %rbx,%rsi ++ subq %rbx,%rcx ++ movq $512,%rbx ++ jz L$ctr32_break ++ cmpq %rbx,%rcx ++ jae L$ctr32_loop ++ movq %rcx,%rbx ++ movq %rsi,%rax ++ cmpq %rsp,%rbp ++ cmoveq %rdi,%rax ++ addq %rcx,%rax ++ negq %rax ++ andq $4095,%rax ++ cmpq $32,%rax ++ movq $-32,%rax ++ cmovaeq %rbx,%rax ++ andq %rax,%rbx ++ jnz L$ctr32_loop ++L$ctr32_unaligned_tail: ++ xorl %eax,%eax ++ cmpq %rsp,%rbp ++ cmoveq %rcx,%rax ++ movq 
%rdi,%r8 ++ movq %rcx,%rbx ++ subq %rax,%rsp ++ shrq $3,%rcx ++ leaq (%rsp),%rdi ++.byte 0xf3,0x48,0xa5 ++ movq %rsp,%rsi ++ movq %r8,%rdi ++ movq %rbx,%rcx ++ jmp L$ctr32_loop ++.p2align 4 ++L$ctr32_break: ++ cmpq %rbp,%rsp ++ je L$ctr32_done ++ ++ pxor %xmm0,%xmm0 ++ leaq (%rsp),%rax ++L$ctr32_bzero: ++ movaps %xmm0,(%rax) ++ leaq 16(%rax),%rax ++ cmpq %rax,%rbp ++ ja L$ctr32_bzero ++ ++L$ctr32_done: ++ leaq (%rbp),%rsp ++ jmp L$ctr32_exit ++ ++.p2align 4 ++L$ctr32_aligned: ++ movl -4(%rdx),%eax ++ bswapl %eax ++ negl %eax ++ andl $65535,%eax ++ movq $1048576,%rbx ++ shll $4,%eax ++ cmovzq %rbx,%rax ++ cmpq %rax,%rcx ++ cmovaq %rax,%rbx ++ cmovbeq %rcx,%rbx ++ jbe L$ctr32_aligned_skip ++ ++L$ctr32_aligned_loop: ++ movq %rcx,%r10 ++ movq %rbx,%rcx ++ movq %rbx,%r11 ++ ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,216 ++ ++ movl -4(%rdx),%eax ++ bswapl %eax ++ addl $65536,%eax ++ bswapl %eax ++ movl %eax,-4(%rdx) ++ ++ movq %r10,%rcx ++ subq %r11,%rcx ++ movq $1048576,%rbx ++ jz L$ctr32_exit ++ cmpq %rbx,%rcx ++ jae L$ctr32_aligned_loop ++ ++L$ctr32_aligned_skip: ++ leaq (%rsi,%rcx,1),%rbp ++ negq %rbp ++ andq $4095,%rbp ++ xorl %eax,%eax ++ cmpq $32,%rbp ++ movq $32-1,%rbp ++ cmovaeq %rax,%rbp ++ andq %rcx,%rbp ++ subq %rbp,%rcx ++ jz L$ctr32_aligned_tail ++ leaq -16(%rdx),%rax ++ leaq 16(%rdx),%rbx ++ shrq $4,%rcx ++.byte 0xf3,0x0f,0xa7,216 ++ testq %rbp,%rbp ++ jz L$ctr32_exit ++ ++L$ctr32_aligned_tail: ++ movq %rdi,%r8 ++ movq %rbp,%rbx ++ movq %rbp,%rcx ++ leaq (%rsp),%rbp ++ subq %rcx,%rsp ++ shrq $3,%rcx ++ leaq (%rsp),%rdi ++.byte 0xf3,0x48,0xa5 ++ leaq (%r8),%rdi ++ leaq (%rsp),%rsi ++ movq %rbx,%rcx ++ jmp L$ctr32_loop ++L$ctr32_exit: ++ movl $1,%eax ++ leaq 8(%rsp),%rsp ++L$ctr32_abort: ++ popq %rbx ++ popq %rbp ++ .byte 0xf3,0xc3 ++ + .byte 86,73,65,32,80,97,100,108,111,99,107,32,120,56,54,95,54,52,32,109,111,100,117,108,101,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + .p2align 4 + .data +diff --git a/lib/accelerated/x86/macosx/padlock-x86-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-macosx.s +index 7a38b7c..1a2fa92 100644 +--- a/lib/accelerated/x86/macosx/padlock-x86-macosx.s ++++ b/lib/accelerated/x86/macosx/padlock-x86-macosx.s +@@ -510,6 +510,351 @@ L016cbc_abort: + popl %ebx + popl %ebp + ret ++.globl _padlock_cfb_encrypt ++.align 4 ++_padlock_cfb_encrypt: ++L_padlock_cfb_encrypt_begin: ++ pushl %ebp ++ pushl %ebx ++ pushl %esi ++ pushl %edi ++ movl 20(%esp),%edi ++ movl 24(%esp),%esi ++ movl 28(%esp),%edx ++ movl 32(%esp),%ecx ++ testl $15,%edx ++ jnz L028cfb_abort ++ testl $15,%ecx ++ jnz L028cfb_abort ++ leal Lpadlock_saved_context-L029cfb_pic_point,%eax ++ pushfl ++ cld ++ call __padlock_verify_ctx ++L029cfb_pic_point: ++ leal 16(%edx),%edx ++ xorl %eax,%eax ++ xorl %ebx,%ebx ++ testl $32,(%edx) ++ jnz L030cfb_aligned ++ testl $15,%edi ++ setz %al ++ testl $15,%esi ++ setz %bl ++ testl %ebx,%eax ++ jnz L030cfb_aligned ++ negl %eax ++ movl $512,%ebx ++ notl %eax ++ leal -24(%esp),%ebp ++ cmpl %ebx,%ecx ++ cmovcl %ecx,%ebx ++ andl %ebx,%eax ++ movl %ecx,%ebx ++ negl %eax ++ andl $511,%ebx ++ leal (%eax,%ebp,1),%esp ++ movl $512,%eax ++ cmovzl %eax,%ebx ++ movl %ebp,%eax ++ andl $-16,%ebp ++ andl $-16,%esp ++ movl %eax,16(%ebp) ++ jmp L031cfb_loop ++.align 4,0x90 ++L031cfb_loop: ++ movl %edi,(%ebp) ++ movl %esi,4(%ebp) ++ movl %ecx,8(%ebp) ++ movl %ebx,%ecx ++ movl %ebx,12(%ebp) ++ testl $15,%edi ++ cmovnzl %esp,%edi ++ testl $15,%esi ++ jz 
L032cfb_inp_aligned ++ shrl $2,%ecx ++.byte 243,165 ++ subl %ebx,%edi ++ movl %ebx,%ecx ++ movl %edi,%esi ++L032cfb_inp_aligned: ++ leal -16(%edx),%eax ++ leal 16(%edx),%ebx ++ shrl $4,%ecx ++.byte 243,15,167,224 ++ movaps (%eax),%xmm0 ++ movaps %xmm0,-16(%edx) ++ movl (%ebp),%edi ++ movl 12(%ebp),%ebx ++ testl $15,%edi ++ jz L033cfb_out_aligned ++ movl %ebx,%ecx ++ leal (%esp),%esi ++ shrl $2,%ecx ++.byte 243,165 ++ subl %ebx,%edi ++L033cfb_out_aligned: ++ movl 4(%ebp),%esi ++ movl 8(%ebp),%ecx ++ addl %ebx,%edi ++ addl %ebx,%esi ++ subl %ebx,%ecx ++ movl $512,%ebx ++ jnz L031cfb_loop ++ cmpl %ebp,%esp ++ je L034cfb_done ++ pxor %xmm0,%xmm0 ++ leal (%esp),%eax ++L035cfb_bzero: ++ movaps %xmm0,(%eax) ++ leal 16(%eax),%eax ++ cmpl %eax,%ebp ++ ja L035cfb_bzero ++L034cfb_done: ++ movl 16(%ebp),%ebp ++ leal 24(%ebp),%esp ++ jmp L036cfb_exit ++.align 4,0x90 ++L030cfb_aligned: ++ leal -16(%edx),%eax ++ leal 16(%edx),%ebx ++ shrl $4,%ecx ++.byte 243,15,167,224 ++ movaps (%eax),%xmm0 ++ movaps %xmm0,-16(%edx) ++L036cfb_exit: ++ movl $1,%eax ++ leal 4(%esp),%esp ++L028cfb_abort: ++ popl %edi ++ popl %esi ++ popl %ebx ++ popl %ebp ++ ret ++.globl _padlock_ofb_encrypt ++.align 4 ++_padlock_ofb_encrypt: ++L_padlock_ofb_encrypt_begin: ++ pushl %ebp ++ pushl %ebx ++ pushl %esi ++ pushl %edi ++ movl 20(%esp),%edi ++ movl 24(%esp),%esi ++ movl 28(%esp),%edx ++ movl 32(%esp),%ecx ++ testl $15,%edx ++ jnz L037ofb_abort ++ testl $15,%ecx ++ jnz L037ofb_abort ++ leal Lpadlock_saved_context-L038ofb_pic_point,%eax ++ pushfl ++ cld ++ call __padlock_verify_ctx ++L038ofb_pic_point: ++ leal 16(%edx),%edx ++ xorl %eax,%eax ++ xorl %ebx,%ebx ++ testl $32,(%edx) ++ jnz L039ofb_aligned ++ testl $15,%edi ++ setz %al ++ testl $15,%esi ++ setz %bl ++ testl %ebx,%eax ++ jnz L039ofb_aligned ++ negl %eax ++ movl $512,%ebx ++ notl %eax ++ leal -24(%esp),%ebp ++ cmpl %ebx,%ecx ++ cmovcl %ecx,%ebx ++ andl %ebx,%eax ++ movl %ecx,%ebx ++ negl %eax ++ andl $511,%ebx ++ leal (%eax,%ebp,1),%esp ++ movl $512,%eax ++ cmovzl %eax,%ebx ++ movl %ebp,%eax ++ andl $-16,%ebp ++ andl $-16,%esp ++ movl %eax,16(%ebp) ++ jmp L040ofb_loop ++.align 4,0x90 ++L040ofb_loop: ++ movl %edi,(%ebp) ++ movl %esi,4(%ebp) ++ movl %ecx,8(%ebp) ++ movl %ebx,%ecx ++ movl %ebx,12(%ebp) ++ testl $15,%edi ++ cmovnzl %esp,%edi ++ testl $15,%esi ++ jz L041ofb_inp_aligned ++ shrl $2,%ecx ++.byte 243,165 ++ subl %ebx,%edi ++ movl %ebx,%ecx ++ movl %edi,%esi ++L041ofb_inp_aligned: ++ leal -16(%edx),%eax ++ leal 16(%edx),%ebx ++ shrl $4,%ecx ++.byte 243,15,167,232 ++ movaps (%eax),%xmm0 ++ movaps %xmm0,-16(%edx) ++ movl (%ebp),%edi ++ movl 12(%ebp),%ebx ++ testl $15,%edi ++ jz L042ofb_out_aligned ++ movl %ebx,%ecx ++ leal (%esp),%esi ++ shrl $2,%ecx ++.byte 243,165 ++ subl %ebx,%edi ++L042ofb_out_aligned: ++ movl 4(%ebp),%esi ++ movl 8(%ebp),%ecx ++ addl %ebx,%edi ++ addl %ebx,%esi ++ subl %ebx,%ecx ++ movl $512,%ebx ++ jnz L040ofb_loop ++ cmpl %ebp,%esp ++ je L043ofb_done ++ pxor %xmm0,%xmm0 ++ leal (%esp),%eax ++L044ofb_bzero: ++ movaps %xmm0,(%eax) ++ leal 16(%eax),%eax ++ cmpl %eax,%ebp ++ ja L044ofb_bzero ++L043ofb_done: ++ movl 16(%ebp),%ebp ++ leal 24(%ebp),%esp ++ jmp L045ofb_exit ++.align 4,0x90 ++L039ofb_aligned: ++ leal -16(%edx),%eax ++ leal 16(%edx),%ebx ++ shrl $4,%ecx ++.byte 243,15,167,232 ++ movaps (%eax),%xmm0 ++ movaps %xmm0,-16(%edx) ++L045ofb_exit: ++ movl $1,%eax ++ leal 4(%esp),%esp ++L037ofb_abort: ++ popl %edi ++ popl %esi ++ popl %ebx ++ popl %ebp ++ ret ++.globl _padlock_ctr32_encrypt ++.align 4 ++_padlock_ctr32_encrypt: 
++L_padlock_ctr32_encrypt_begin: ++ pushl %ebp ++ pushl %ebx ++ pushl %esi ++ pushl %edi ++ movl 20(%esp),%edi ++ movl 24(%esp),%esi ++ movl 28(%esp),%edx ++ movl 32(%esp),%ecx ++ testl $15,%edx ++ jnz L046ctr32_abort ++ testl $15,%ecx ++ jnz L046ctr32_abort ++ leal Lpadlock_saved_context-L047ctr32_pic_point,%eax ++ pushfl ++ cld ++ call __padlock_verify_ctx ++L047ctr32_pic_point: ++ leal 16(%edx),%edx ++ xorl %eax,%eax ++ movq -16(%edx),%mm0 ++ movl $512,%ebx ++ notl %eax ++ leal -24(%esp),%ebp ++ cmpl %ebx,%ecx ++ cmovcl %ecx,%ebx ++ andl %ebx,%eax ++ movl %ecx,%ebx ++ negl %eax ++ andl $511,%ebx ++ leal (%eax,%ebp,1),%esp ++ movl $512,%eax ++ cmovzl %eax,%ebx ++ movl %ebp,%eax ++ andl $-16,%ebp ++ andl $-16,%esp ++ movl %eax,16(%ebp) ++ jmp L048ctr32_loop ++.align 4,0x90 ++L048ctr32_loop: ++ movl %edi,(%ebp) ++ movl %esi,4(%ebp) ++ movl %ecx,8(%ebp) ++ movl %ebx,%ecx ++ movl %ebx,12(%ebp) ++ movl -4(%edx),%ecx ++ xorl %edi,%edi ++ movl -8(%edx),%eax ++L049ctr32_prepare: ++ movl %ecx,12(%esp,%edi,1) ++ bswap %ecx ++ movq %mm0,(%esp,%edi,1) ++ incl %ecx ++ movl %eax,8(%esp,%edi,1) ++ bswap %ecx ++ leal 16(%edi),%edi ++ cmpl %ebx,%edi ++ jb L049ctr32_prepare ++ movl %ecx,-4(%edx) ++ leal (%esp),%esi ++ leal (%esp),%edi ++ movl %ebx,%ecx ++ leal -16(%edx),%eax ++ leal 16(%edx),%ebx ++ shrl $4,%ecx ++.byte 243,15,167,200 ++ movl (%ebp),%edi ++ movl 12(%ebp),%ebx ++ movl 4(%ebp),%esi ++ xorl %ecx,%ecx ++L050ctr32_xor: ++ movups (%esi,%ecx,1),%xmm1 ++ leal 16(%ecx),%ecx ++ pxor -16(%esp,%ecx,1),%xmm1 ++ movups %xmm1,-16(%edi,%ecx,1) ++ cmpl %ebx,%ecx ++ jb L050ctr32_xor ++ movl 8(%ebp),%ecx ++ addl %ebx,%edi ++ addl %ebx,%esi ++ subl %ebx,%ecx ++ movl $512,%ebx ++ jnz L048ctr32_loop ++ pxor %xmm0,%xmm0 ++ leal (%esp),%eax ++L051ctr32_bzero: ++ movaps %xmm0,(%eax) ++ leal 16(%eax),%eax ++ cmpl %eax,%ebp ++ ja L051ctr32_bzero ++L052ctr32_done: ++ movl 16(%ebp),%ebp ++ leal 24(%ebp),%esp ++ movl $1,%eax ++ leal 4(%esp),%esp ++ emms ++L046ctr32_abort: ++ popl %edi ++ popl %esi ++ popl %ebx ++ popl %ebp ++ ret + .globl _padlock_xstore + .align 4 + _padlock_xstore: +@@ -526,10 +871,10 @@ __win32_segv_handler: + movl 4(%esp),%edx + movl 12(%esp),%ecx + cmpl $3221225477,(%edx) +- jne L028ret ++ jne L053ret + addl $4,184(%ecx) + movl $0,%eax +-L028ret: ++L053ret: + ret + .globl _padlock_sha1_oneshot + .align 4 +-- +1.8.4.2 + diff --git a/gnutls.spec b/gnutls.spec index 16cf652..cc83bef 100644 --- a/gnutls.spec +++ b/gnutls.spec @@ -27,6 +27,7 @@ Source0: %{name}-%{version}-hobbled.tar.xz Source1: libgnutls-config Source2: hobble-gnutls Patch1: gnutls-3.2.7-rpath.patch +Patch2: gnutls-3.2.7-asm.patch # Use only FIPS approved ciphers in the FIPS mode Patch7: gnutls-2.12.21-fips-algorithms.patch Patch8: gnutls-3.1.11-nosrp.patch @@ -131,6 +132,7 @@ This package contains Guile bindings for the library. %setup -q %patch1 -p1 -b .rpath +%patch2 -p1 -b .asm # This patch is not applicable as we use nettle now but some parts will be # later reused. #%patch7 -p1 -b .fips @@ -266,8 +268,9 @@ fi %endif %changelog -* Tue Nov 27 2013 Nikos Mavrogiannopoulos 3.2.7-2 -- Use the following root key for unbound /var/lib/unbound/root.key (#1012494) +* Wed Dec 4 2013 Nikos Mavrogiannopoulos 3.2.7-2 +- Use the correct root key for unbound /var/lib/unbound/root.key (#1012494) +- Pull asm fixes from upstream (#973210) * Mon Nov 25 2013 Nikos Mavrogiannopoulos 3.2.7-1 - new upstream release
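
For reviewers who want to follow the 32-bit _padlock_ctr32_encrypt path added above (L048ctr32_loop, L049ctr32_prepare, L050ctr32_xor): the code fills an on-stack buffer with 16-byte counter blocks, each made of the 12-byte IV prefix held in %mm0/%eax plus the 32-bit big-endian counter taken from -4(%edx), encrypts the whole batch with the PadLock xcrypt instruction emitted as .byte 243,15,167,200, and then XORs the resulting keystream into the input with movups/pxor before wiping the scratch buffer (L051ctr32_bzero). The C sketch below restates that flow for readability only; it is not part of the patch and not a gnutls API, and the ctr32_encrypt_sketch/ecb_encrypt_fn names and parameters are invented for this illustration.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical callback standing in for the PadLock "rep xcrypt" block
 * operation used by the assembly above: it must encrypt `blocks`
 * consecutive 16-byte blocks in `buf` in place under the key material in
 * `ctx`.  This callback, like every other name here, exists only for the
 * illustration. */
typedef void (*ecb_encrypt_fn)(void *ctx, uint8_t *buf, size_t blocks);

/* Sketch of the bookkeeping done by L048ctr32_loop / L049ctr32_prepare /
 * L050ctr32_xor: each keystream block is the encryption of
 * (12-byte IV prefix || 32-bit big-endian counter), and the output is the
 * input XORed with that keystream. */
static void ctr32_encrypt_sketch(void *ctx, ecb_encrypt_fn ecb,
                                 const uint8_t iv[16],
                                 const uint8_t *in, uint8_t *out,
                                 size_t blocks)
{
    uint8_t ks[16 * 32];       /* bounded scratch, like the on-stack buffer */
    uint32_t ctr;

    memcpy(&ctr, iv + 12, 4);
    ctr = __builtin_bswap32(ctr);           /* counter is stored big-endian */

    while (blocks > 0) {
        size_t n = blocks > 32 ? 32 : blocks;
        size_t i;

        for (i = 0; i < n; i++) {           /* build the counter blocks */
            uint32_t be = __builtin_bswap32(ctr + (uint32_t)i);
            memcpy(ks + 16 * i, iv, 12);
            memcpy(ks + 16 * i + 12, &be, 4);
        }
        ecb(ctx, ks, n);                    /* keystream = E_k(counter blocks) */

        for (i = 0; i < 16 * n; i++)        /* out = in XOR keystream */
            out[i] = in[i] ^ ks[i];

        ctr    += (uint32_t)n;
        in     += 16 * n;
        out    += 16 * n;
        blocks -= n;
    }
}

Batching the counter blocks and issuing one xcrypt call per chunk keeps the expensive PadLock invocation off the per-block path; the subsequent movaps-based clear of the scratch area mirrors that intent by leaving no keystream behind on the stack.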