From 8a7565113ab937cc99f8f4c929bde2ee08fc498c Mon Sep 17 00:00:00 2001
From: Nikos Mavrogiannopoulos <nmav@gnutls.org>
Date: Tue, 26 Nov 2013 23:19:45 +0100
Subject: [PATCH 1/2] updated auto-generated asm files. This fixes a valgrind
 complaint when AES-NI is in use.

---
 .../x86/coff/appro-aes-gcm-x86-64-coff.s | 574 ++++--
 lib/accelerated/x86/coff/appro-aes-x86-64-coff.s | 1826 ++++++++++++--------
 lib/accelerated/x86/coff/padlock-x86-64-coff.s | 495 ++++++
 lib/accelerated/x86/coff/padlock-x86-coff.s | 352 +++-
 lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s | 515 ++++--
 lib/accelerated/x86/elf/appro-aes-x86-64.s | 1609 ++++++++++-------
 lib/accelerated/x86/elf/padlock-x86-64.s | 462 +++++
 lib/accelerated/x86/elf/padlock-x86.s | 575 +++++-
 .../x86/macosx/appro-aes-gcm-x86-64-macosx.s | 515 ++++--
 .../x86/macosx/appro-aes-x86-64-macosx.s | 1609 ++++++++++-------
 lib/accelerated/x86/macosx/padlock-x86-64-macosx.s | 462 +++++
 lib/accelerated/x86/macosx/padlock-x86-macosx.s | 349 +++-
 12 files changed, 6978 insertions(+), 2365 deletions(-)

diff --git a/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s b/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s
index fa449d6..ceb9108 100644
--- a/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s
+++ b/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s
@@ -717,6 +717,11 @@ gcm_ghash_4bit:
|
||
|
.def gcm_init_clmul; .scl 2; .type 32; .endef
|
||
|
.p2align 4
|
||
|
gcm_init_clmul:
|
||
|
+.L_init_clmul:
|
||
|
+.LSEH_begin_gcm_init_clmul:
|
||
|
+
|
||
|
+.byte 0x48,0x83,0xec,0x18
|
||
|
+.byte 0x0f,0x29,0x34,0x24
|
||
|
movdqu (%rdx),%xmm2
|
||
|
pshufd $78,%xmm2,%xmm2
|
||
|
|
||
|
@@ -735,15 +740,15 @@ gcm_init_clmul:
|
||
|
pxor %xmm5,%xmm2
|
||
|
|
||
|
|
||
|
+ pshufd $78,%xmm2,%xmm6
|
||
|
movdqa %xmm2,%xmm0
|
||
|
+ pxor %xmm2,%xmm6
|
||
|
movdqa %xmm0,%xmm1
|
||
|
pshufd $78,%xmm0,%xmm3
|
||
|
- pshufd $78,%xmm2,%xmm4
|
||
|
pxor %xmm0,%xmm3
|
||
|
- pxor %xmm2,%xmm4
|
||
|
.byte 102,15,58,68,194,0
|
||
|
.byte 102,15,58,68,202,17
|
||
|
-.byte 102,15,58,68,220,0
|
||
|
+.byte 102,15,58,68,222,0
|
||
|
pxor %xmm0,%xmm3
|
||
|
pxor %xmm1,%xmm3
|
||
|
|
||
|
@@ -753,44 +758,137 @@ gcm_init_clmul:
|
||
|
pxor %xmm3,%xmm1
|
||
|
pxor %xmm4,%xmm0
|
||
|
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
movdqa %xmm0,%xmm3
|
||
|
+ psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
psllq $1,%xmm0
|
||
|
pxor %xmm3,%xmm0
|
||
|
+ psllq $57,%xmm0
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
+ pslldq $8,%xmm0
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
+ psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ psrlq $1,%xmm0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
+ pshufd $78,%xmm2,%xmm3
|
||
|
+ pshufd $78,%xmm0,%xmm4
|
||
|
+ pxor %xmm2,%xmm3
|
||
|
+ movdqu %xmm2,0(%rcx)
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ movdqu %xmm0,16(%rcx)
|
||
|
+.byte 102,15,58,15,227,8
|
||
|
+ movdqu %xmm4,32(%rcx)
|
||
|
+ movdqa %xmm0,%xmm1
|
||
|
+ pshufd $78,%xmm0,%xmm3
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+.byte 102,15,58,68,194,0
|
||
|
+.byte 102,15,58,68,202,17
|
||
|
+.byte 102,15,58,68,222,0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ pxor %xmm1,%xmm3
|
||
|
+
|
||
|
+ movdqa %xmm3,%xmm4
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pslldq $8,%xmm4
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ psllq $1,%xmm0
|
||
|
pxor %xmm3,%xmm0
|
||
|
psllq $57,%xmm0
|
||
|
- movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
pslldq $8,%xmm0
|
||
|
- psrldq $8,%xmm4
|
||
|
- pxor %xmm3,%xmm0
|
||
|
- pxor %xmm4,%xmm1
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
|
||
|
|
||
|
movdqa %xmm0,%xmm4
|
||
|
+ psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
psrlq $5,%xmm0
|
||
|
pxor %xmm4,%xmm0
|
||
|
psrlq $1,%xmm0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
+ movdqa %xmm0,%xmm5
|
||
|
+ movdqa %xmm0,%xmm1
|
||
|
+ pshufd $78,%xmm0,%xmm3
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+.byte 102,15,58,68,194,0
|
||
|
+.byte 102,15,58,68,202,17
|
||
|
+.byte 102,15,58,68,222,0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ pxor %xmm1,%xmm3
|
||
|
+
|
||
|
+ movdqa %xmm3,%xmm4
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pslldq $8,%xmm4
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
pxor %xmm4,%xmm0
|
||
|
- pxor %xmm1,%xmm4
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
+ psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ psllq $1,%xmm0
|
||
|
+ pxor %xmm3,%xmm0
|
||
|
+ psllq $57,%xmm0
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
+ pslldq $8,%xmm0
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
pxor %xmm4,%xmm0
|
||
|
- movdqu %xmm2,(%rcx)
|
||
|
- movdqu %xmm0,16(%rcx)
|
||
|
+ psrlq $1,%xmm0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
+ pshufd $78,%xmm5,%xmm3
|
||
|
+ pshufd $78,%xmm0,%xmm4
|
||
|
+ pxor %xmm5,%xmm3
|
||
|
+ movdqu %xmm5,48(%rcx)
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ movdqu %xmm0,64(%rcx)
|
||
|
+.byte 102,15,58,15,227,8
|
||
|
+ movdqu %xmm4,80(%rcx)
|
||
|
+ movaps (%rsp),%xmm6
|
||
|
+ leaq 24(%rsp),%rsp
|
||
|
+.LSEH_end_gcm_init_clmul:
|
||
|
.byte 0xf3,0xc3
|
||
|
|
||
|
.globl gcm_gmult_clmul
|
||
|
.def gcm_gmult_clmul; .scl 2; .type 32; .endef
|
||
|
.p2align 4
|
||
|
gcm_gmult_clmul:
|
||
|
+.L_gmult_clmul:
|
||
|
movdqu (%rcx),%xmm0
|
||
|
movdqa .Lbswap_mask(%rip),%xmm5
|
||
|
movdqu (%rdx),%xmm2
|
||
|
+ movdqu 32(%rdx),%xmm4
|
||
|
.byte 102,15,56,0,197
|
||
|
movdqa %xmm0,%xmm1
|
||
|
pshufd $78,%xmm0,%xmm3
|
||
|
- pshufd $78,%xmm2,%xmm4
|
||
|
pxor %xmm0,%xmm3
|
||
|
- pxor %xmm2,%xmm4
|
||
|
.byte 102,15,58,68,194,0
|
||
|
.byte 102,15,58,68,202,17
|
||
|
.byte 102,15,58,68,220,0
|
||
|
@@ -803,194 +901,372 @@ gcm_gmult_clmul:
|
||
|
pxor %xmm3,%xmm1
|
||
|
pxor %xmm4,%xmm0
|
||
|
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
movdqa %xmm0,%xmm3
|
||
|
- psllq $1,%xmm0
|
||
|
- pxor %xmm3,%xmm0
|
||
|
psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ psllq $1,%xmm0
|
||
|
pxor %xmm3,%xmm0
|
||
|
psllq $57,%xmm0
|
||
|
- movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
pslldq $8,%xmm0
|
||
|
- psrldq $8,%xmm4
|
||
|
- pxor %xmm3,%xmm0
|
||
|
- pxor %xmm4,%xmm1
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
|
||
|
|
||
|
movdqa %xmm0,%xmm4
|
||
|
- psrlq $5,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
pxor %xmm4,%xmm0
|
||
|
- pxor %xmm1,%xmm4
|
||
|
psrlq $1,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
.byte 102,15,56,0,197
|
||
|
movdqu %xmm0,(%rcx)
|
||
|
.byte 0xf3,0xc3
|
||
|
|
||
|
.globl gcm_ghash_clmul
|
||
|
.def gcm_ghash_clmul; .scl 2; .type 32; .endef
|
||
|
-.p2align 4
|
||
|
+.p2align 5
|
||
|
gcm_ghash_clmul:
|
||
|
+.L_ghash_clmul:
|
||
|
+ leaq -136(%rsp),%rax
|
||
|
.LSEH_begin_gcm_ghash_clmul:
|
||
|
|
||
|
-.byte 0x48,0x83,0xec,0x58
|
||
|
-.byte 0x0f,0x29,0x34,0x24
|
||
|
-.byte 0x0f,0x29,0x7c,0x24,0x10
|
||
|
-.byte 0x44,0x0f,0x29,0x44,0x24,0x20
|
||
|
-.byte 0x44,0x0f,0x29,0x4c,0x24,0x30
|
||
|
-.byte 0x44,0x0f,0x29,0x54,0x24,0x40
|
||
|
+.byte 0x48,0x8d,0x60,0xe0
|
||
|
+.byte 0x0f,0x29,0x70,0xe0
|
||
|
+.byte 0x0f,0x29,0x78,0xf0
|
||
|
+.byte 0x44,0x0f,0x29,0x00
|
||
|
+.byte 0x44,0x0f,0x29,0x48,0x10
|
||
|
+.byte 0x44,0x0f,0x29,0x50,0x20
|
||
|
+.byte 0x44,0x0f,0x29,0x58,0x30
|
||
|
+.byte 0x44,0x0f,0x29,0x60,0x40
|
||
|
+.byte 0x44,0x0f,0x29,0x68,0x50
|
||
|
+.byte 0x44,0x0f,0x29,0x70,0x60
|
||
|
+.byte 0x44,0x0f,0x29,0x78,0x70
|
||
|
movdqa .Lbswap_mask(%rip),%xmm5
|
||
|
+ movq $11547335547999543296,%rax
|
||
|
|
||
|
movdqu (%rcx),%xmm0
|
||
|
movdqu (%rdx),%xmm2
|
||
|
+ movdqu 32(%rdx),%xmm10
|
||
|
.byte 102,15,56,0,197
|
||
|
|
||
|
subq $16,%r9
|
||
|
jz .Lodd_tail
|
||
|
|
||
|
- movdqu 16(%rdx),%xmm8
|
||
|
+ movdqu 16(%rdx),%xmm9
|
||
|
+ cmpq $48,%r9
|
||
|
+ jb .Lskip4x
|
||
|
|
||
|
+ subq $48,%r9
|
||
|
+ movdqu 48(%rdx),%xmm14
|
||
|
+ movdqu 64(%rdx),%xmm15
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
- movdqu (%r8),%xmm3
|
||
|
- movdqu 16(%r8),%xmm6
|
||
|
-.byte 102,15,56,0,221
|
||
|
+ movdqu 48(%r8),%xmm6
|
||
|
+ movdqu 32(%r8),%xmm11
|
||
|
.byte 102,15,56,0,245
|
||
|
- pxor %xmm3,%xmm0
|
||
|
- movdqa %xmm6,%xmm7
|
||
|
- pshufd $78,%xmm6,%xmm3
|
||
|
- pshufd $78,%xmm2,%xmm4
|
||
|
- pxor %xmm6,%xmm3
|
||
|
- pxor %xmm2,%xmm4
|
||
|
+.byte 102,68,15,56,0,221
|
||
|
+ movdqa %xmm6,%xmm8
|
||
|
+ pshufd $78,%xmm6,%xmm7
|
||
|
+ pxor %xmm6,%xmm7
|
||
|
.byte 102,15,58,68,242,0
|
||
|
-.byte 102,15,58,68,250,17
|
||
|
-.byte 102,15,58,68,220,0
|
||
|
- pxor %xmm6,%xmm3
|
||
|
- pxor %xmm7,%xmm3
|
||
|
+.byte 102,68,15,58,68,194,17
|
||
|
+.byte 102,65,15,58,68,250,0
|
||
|
+
|
||
|
+ movdqa %xmm11,%xmm13
|
||
|
+ pshufd $78,%xmm11,%xmm12
|
||
|
+ pxor %xmm11,%xmm12
|
||
|
+.byte 102,69,15,58,68,217,0
|
||
|
+.byte 102,69,15,58,68,233,17
|
||
|
+ xorps %xmm11,%xmm6
|
||
|
+.byte 102,69,15,58,68,226,16
|
||
|
+ xorps %xmm13,%xmm8
|
||
|
+ movups 80(%rdx),%xmm10
|
||
|
+ xorps %xmm12,%xmm7
|
||
|
+
|
||
|
+ movdqu 16(%r8),%xmm11
|
||
|
+ movdqu 0(%r8),%xmm3
|
||
|
+.byte 102,68,15,56,0,221
|
||
|
+.byte 102,15,56,0,221
|
||
|
+ movdqa %xmm11,%xmm13
|
||
|
+ pshufd $78,%xmm11,%xmm12
|
||
|
+ pxor %xmm3,%xmm0
|
||
|
+ pxor %xmm11,%xmm12
|
||
|
+.byte 102,69,15,58,68,222,0
|
||
|
+ movdqa %xmm0,%xmm1
|
||
|
+ pshufd $78,%xmm0,%xmm3
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+.byte 102,69,15,58,68,238,17
|
||
|
+ xorps %xmm11,%xmm6
|
||
|
+.byte 102,69,15,58,68,226,0
|
||
|
+ xorps %xmm13,%xmm8
|
||
|
+
|
||
|
+ leaq 64(%r8),%r8
|
||
|
+ subq $64,%r9
|
||
|
+ jc .Ltail4x
|
||
|
+
|
||
|
+ jmp .Lmod4_loop
|
||
|
+.p2align 5
|
||
|
+.Lmod4_loop:
|
||
|
+.byte 102,65,15,58,68,199,0
|
||
|
+ xorps %xmm12,%xmm7
|
||
|
+ movdqu 48(%r8),%xmm11
|
||
|
+.byte 102,68,15,56,0,221
|
||
|
+.byte 102,65,15,58,68,207,17
|
||
|
+ xorps %xmm6,%xmm0
|
||
|
+ movdqu 32(%r8),%xmm6
|
||
|
+ movdqa %xmm11,%xmm13
|
||
|
+ pshufd $78,%xmm11,%xmm12
|
||
|
+.byte 102,65,15,58,68,218,16
|
||
|
+ xorps %xmm8,%xmm1
|
||
|
+ pxor %xmm11,%xmm12
|
||
|
+.byte 102,15,56,0,245
|
||
|
+ movups 32(%rdx),%xmm10
|
||
|
+.byte 102,68,15,58,68,218,0
|
||
|
+ xorps %xmm7,%xmm3
|
||
|
+ movdqa %xmm6,%xmm8
|
||
|
+ pshufd $78,%xmm6,%xmm7
|
||
|
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ pxor %xmm6,%xmm7
|
||
|
+ pxor %xmm1,%xmm3
|
||
|
movdqa %xmm3,%xmm4
|
||
|
- psrldq $8,%xmm3
|
||
|
+ pslldq $8,%xmm3
|
||
|
+.byte 102,68,15,58,68,234,17
|
||
|
+ psrldq $8,%xmm4
|
||
|
+ pxor %xmm3,%xmm0
|
||
|
+ movdqa .L7_mask(%rip),%xmm3
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+.byte 102,72,15,110,224
|
||
|
+
|
||
|
+ pand %xmm0,%xmm3
|
||
|
+.byte 102,15,56,0,227
|
||
|
+.byte 102,69,15,58,68,226,0
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psllq $57,%xmm4
|
||
|
+ movdqa %xmm4,%xmm3
|
||
|
pslldq $8,%xmm4
|
||
|
- pxor %xmm3,%xmm7
|
||
|
- pxor %xmm4,%xmm6
|
||
|
+.byte 102,65,15,58,68,241,0
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+ movdqu 0(%r8),%xmm3
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
+ psrlq $1,%xmm0
|
||
|
+.byte 102,69,15,58,68,193,17
|
||
|
+ xorps %xmm11,%xmm6
|
||
|
+ movdqu 16(%r8),%xmm11
|
||
|
+.byte 102,68,15,56,0,221
|
||
|
+.byte 102,65,15,58,68,250,16
|
||
|
+ xorps %xmm13,%xmm8
|
||
|
+ movups 80(%rdx),%xmm10
|
||
|
+.byte 102,15,56,0,221
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
+
|
||
|
+ movdqa %xmm11,%xmm13
|
||
|
+ pxor %xmm12,%xmm7
|
||
|
+ pshufd $78,%xmm11,%xmm12
|
||
|
+ pxor %xmm11,%xmm12
|
||
|
+.byte 102,69,15,58,68,222,0
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+ psrlq $1,%xmm0
|
||
|
+.byte 102,69,15,58,68,238,17
|
||
|
+ xorps %xmm11,%xmm6
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
+
|
||
|
+.byte 102,69,15,58,68,226,0
|
||
|
+ xorps %xmm13,%xmm8
|
||
|
+
|
||
|
movdqa %xmm0,%xmm1
|
||
|
pshufd $78,%xmm0,%xmm3
|
||
|
- pshufd $78,%xmm8,%xmm4
|
||
|
pxor %xmm0,%xmm3
|
||
|
- pxor %xmm8,%xmm4
|
||
|
|
||
|
- leaq 32(%r8),%r8
|
||
|
- subq $32,%r9
|
||
|
- jbe .Leven_tail
|
||
|
+ leaq 64(%r8),%r8
|
||
|
+ subq $64,%r9
|
||
|
+ jnc .Lmod4_loop
|
||
|
+
|
||
|
+.Ltail4x:
|
||
|
+.byte 102,65,15,58,68,199,0
|
||
|
+ xorps %xmm12,%xmm7
|
||
|
+.byte 102,65,15,58,68,207,17
|
||
|
+ xorps %xmm6,%xmm0
|
||
|
+.byte 102,65,15,58,68,218,16
|
||
|
+ xorps %xmm8,%xmm1
|
||
|
+ pxor %xmm0,%xmm1
|
||
|
+ pxor %xmm7,%xmm3
|
||
|
|
||
|
-.Lmod_loop:
|
||
|
-.byte 102,65,15,58,68,192,0
|
||
|
-.byte 102,65,15,58,68,200,17
|
||
|
-.byte 102,15,58,68,220,0
|
||
|
- pxor %xmm0,%xmm3
|
||
|
pxor %xmm1,%xmm3
|
||
|
+ pxor %xmm0,%xmm1
|
||
|
|
||
|
movdqa %xmm3,%xmm4
|
||
|
psrldq $8,%xmm3
|
||
|
pslldq $8,%xmm4
|
||
|
pxor %xmm3,%xmm1
|
||
|
pxor %xmm4,%xmm0
|
||
|
- movdqu (%r8),%xmm3
|
||
|
- pxor %xmm6,%xmm0
|
||
|
- pxor %xmm7,%xmm1
|
||
|
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
+ psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ psllq $1,%xmm0
|
||
|
+ pxor %xmm3,%xmm0
|
||
|
+ psllq $57,%xmm0
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
+ pslldq $8,%xmm0
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
+ psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ psrlq $1,%xmm0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
+ addq $64,%r9
|
||
|
+ jz .Ldone
|
||
|
+ movdqu 32(%rdx),%xmm10
|
||
|
+ subq $16,%r9
|
||
|
+ jz .Lodd_tail
|
||
|
+.Lskip4x:
|
||
|
+
|
||
|
+
|
||
|
+
|
||
|
+
|
||
|
+
|
||
|
+ movdqu (%r8),%xmm3
|
||
|
movdqu 16(%r8),%xmm6
|
||
|
.byte 102,15,56,0,221
|
||
|
.byte 102,15,56,0,245
|
||
|
+ pxor %xmm3,%xmm0
|
||
|
+
|
||
|
+ movdqa %xmm6,%xmm8
|
||
|
+ pshufd $78,%xmm6,%xmm3
|
||
|
+ pxor %xmm6,%xmm3
|
||
|
+.byte 102,15,58,68,242,0
|
||
|
+.byte 102,68,15,58,68,194,17
|
||
|
+.byte 102,65,15,58,68,218,0
|
||
|
+
|
||
|
+ leaq 32(%r8),%r8
|
||
|
+ subq $32,%r9
|
||
|
+ jbe .Leven_tail
|
||
|
+ jmp .Lmod_loop
|
||
|
|
||
|
- movdqa %xmm6,%xmm7
|
||
|
- pshufd $78,%xmm6,%xmm9
|
||
|
- pshufd $78,%xmm2,%xmm10
|
||
|
- pxor %xmm6,%xmm9
|
||
|
- pxor %xmm2,%xmm10
|
||
|
+.p2align 5
|
||
|
+.Lmod_loop:
|
||
|
+ movdqa %xmm0,%xmm1
|
||
|
+ pshufd $78,%xmm0,%xmm4
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+
|
||
|
+.byte 102,65,15,58,68,193,0
|
||
|
+.byte 102,65,15,58,68,201,17
|
||
|
+.byte 102,65,15,58,68,226,16
|
||
|
+
|
||
|
+ pxor %xmm6,%xmm0
|
||
|
+ pxor %xmm8,%xmm1
|
||
|
+ movdqu (%r8),%xmm8
|
||
|
+.byte 102,68,15,56,0,197
|
||
|
+ movdqu 16(%r8),%xmm6
|
||
|
+
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ pxor %xmm1,%xmm3
|
||
|
+ pxor %xmm8,%xmm1
|
||
|
+ pxor %xmm3,%xmm4
|
||
|
+.byte 102,15,56,0,245
|
||
|
+ movdqa %xmm4,%xmm3
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pslldq $8,%xmm4
|
||
|
pxor %xmm3,%xmm1
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
|
||
|
+ movdqa %xmm6,%xmm8
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
movdqa %xmm0,%xmm3
|
||
|
- psllq $1,%xmm0
|
||
|
- pxor %xmm3,%xmm0
|
||
|
psllq $5,%xmm0
|
||
|
- pxor %xmm3,%xmm0
|
||
|
.byte 102,15,58,68,242,0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ psllq $1,%xmm0
|
||
|
+ pxor %xmm3,%xmm0
|
||
|
psllq $57,%xmm0
|
||
|
- movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
pslldq $8,%xmm0
|
||
|
- psrldq $8,%xmm4
|
||
|
- pxor %xmm3,%xmm0
|
||
|
- pxor %xmm4,%xmm1
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+ pshufd $78,%xmm8,%xmm3
|
||
|
+ pxor %xmm8,%xmm3
|
||
|
|
||
|
-.byte 102,15,58,68,250,17
|
||
|
+.byte 102,68,15,58,68,194,17
|
||
|
movdqa %xmm0,%xmm4
|
||
|
- psrlq $5,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
pxor %xmm4,%xmm0
|
||
|
- pxor %xmm1,%xmm4
|
||
|
psrlq $1,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
-
|
||
|
-.byte 102,69,15,58,68,202,0
|
||
|
- movdqa %xmm0,%xmm1
|
||
|
- pshufd $78,%xmm0,%xmm3
|
||
|
- pshufd $78,%xmm8,%xmm4
|
||
|
- pxor %xmm0,%xmm3
|
||
|
- pxor %xmm8,%xmm4
|
||
|
-
|
||
|
- pxor %xmm6,%xmm9
|
||
|
- pxor %xmm7,%xmm9
|
||
|
- movdqa %xmm9,%xmm10
|
||
|
- psrldq $8,%xmm9
|
||
|
- pslldq $8,%xmm10
|
||
|
- pxor %xmm9,%xmm7
|
||
|
- pxor %xmm10,%xmm6
|
||
|
+.byte 102,65,15,58,68,218,0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
|
||
|
leaq 32(%r8),%r8
|
||
|
subq $32,%r9
|
||
|
ja .Lmod_loop
|
||
|
|
||
|
.Leven_tail:
|
||
|
-.byte 102,65,15,58,68,192,0
|
||
|
-.byte 102,65,15,58,68,200,17
|
||
|
-.byte 102,15,58,68,220,0
|
||
|
+ movdqa %xmm0,%xmm1
|
||
|
+ pshufd $78,%xmm0,%xmm4
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+
|
||
|
+.byte 102,65,15,58,68,193,0
|
||
|
+.byte 102,65,15,58,68,201,17
|
||
|
+.byte 102,65,15,58,68,226,16
|
||
|
+
|
||
|
+ pxor %xmm6,%xmm0
|
||
|
+ pxor %xmm8,%xmm1
|
||
|
pxor %xmm0,%xmm3
|
||
|
pxor %xmm1,%xmm3
|
||
|
-
|
||
|
- movdqa %xmm3,%xmm4
|
||
|
+ pxor %xmm3,%xmm4
|
||
|
+ movdqa %xmm4,%xmm3
|
||
|
psrldq $8,%xmm3
|
||
|
pslldq $8,%xmm4
|
||
|
pxor %xmm3,%xmm1
|
||
|
pxor %xmm4,%xmm0
|
||
|
- pxor %xmm6,%xmm0
|
||
|
- pxor %xmm7,%xmm1
|
||
|
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
movdqa %xmm0,%xmm3
|
||
|
- psllq $1,%xmm0
|
||
|
- pxor %xmm3,%xmm0
|
||
|
psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ psllq $1,%xmm0
|
||
|
pxor %xmm3,%xmm0
|
||
|
psllq $57,%xmm0
|
||
|
- movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
pslldq $8,%xmm0
|
||
|
- psrldq $8,%xmm4
|
||
|
- pxor %xmm3,%xmm0
|
||
|
- pxor %xmm4,%xmm1
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
|
||
|
|
||
|
movdqa %xmm0,%xmm4
|
||
|
- psrlq $5,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
pxor %xmm4,%xmm0
|
||
|
- pxor %xmm1,%xmm4
|
||
|
psrlq $1,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
testq %r9,%r9
|
||
|
jnz .Ldone
|
||
|
|
||
|
@@ -1000,12 +1276,10 @@ gcm_ghash_clmul:
|
||
|
pxor %xmm3,%xmm0
|
||
|
movdqa %xmm0,%xmm1
|
||
|
pshufd $78,%xmm0,%xmm3
|
||
|
- pshufd $78,%xmm2,%xmm4
|
||
|
pxor %xmm0,%xmm3
|
||
|
- pxor %xmm2,%xmm4
|
||
|
.byte 102,15,58,68,194,0
|
||
|
.byte 102,15,58,68,202,17
|
||
|
-.byte 102,15,58,68,220,0
|
||
|
+.byte 102,65,15,58,68,218,0
|
||
|
pxor %xmm0,%xmm3
|
||
|
pxor %xmm1,%xmm3
|
||
|
|
||
|
@@ -1015,27 +1289,28 @@ gcm_ghash_clmul:
|
||
|
pxor %xmm3,%xmm1
|
||
|
pxor %xmm4,%xmm0
|
||
|
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
movdqa %xmm0,%xmm3
|
||
|
- psllq $1,%xmm0
|
||
|
- pxor %xmm3,%xmm0
|
||
|
psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ psllq $1,%xmm0
|
||
|
pxor %xmm3,%xmm0
|
||
|
psllq $57,%xmm0
|
||
|
- movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
pslldq $8,%xmm0
|
||
|
- psrldq $8,%xmm4
|
||
|
- pxor %xmm3,%xmm0
|
||
|
- pxor %xmm4,%xmm1
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
|
||
|
|
||
|
movdqa %xmm0,%xmm4
|
||
|
- psrlq $5,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
pxor %xmm4,%xmm0
|
||
|
- pxor %xmm1,%xmm4
|
||
|
psrlq $1,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
.Ldone:
|
||
|
.byte 102,15,56,0,197
|
||
|
movdqu %xmm0,(%rcx)
|
||
|
@@ -1044,15 +1319,42 @@ gcm_ghash_clmul:
|
||
|
movaps 32(%rsp),%xmm8
|
||
|
movaps 48(%rsp),%xmm9
|
||
|
movaps 64(%rsp),%xmm10
|
||
|
- addq $88,%rsp
|
||
|
- .byte 0xf3,0xc3
|
||
|
+ movaps 80(%rsp),%xmm11
|
||
|
+ movaps 96(%rsp),%xmm12
|
||
|
+ movaps 112(%rsp),%xmm13
|
||
|
+ movaps 128(%rsp),%xmm14
|
||
|
+ movaps 144(%rsp),%xmm15
|
||
|
+ leaq 168(%rsp),%rsp
|
||
|
.LSEH_end_gcm_ghash_clmul:
|
||
|
+ .byte 0xf3,0xc3
|
||
|
+
|
||
|
+.globl gcm_init_avx
|
||
|
+.def gcm_init_avx; .scl 2; .type 32; .endef
|
||
|
+.p2align 5
|
||
|
+gcm_init_avx:
|
||
|
+ jmp .L_init_clmul
|
||
|
+
|
||
|
+.globl gcm_gmult_avx
|
||
|
+.def gcm_gmult_avx; .scl 2; .type 32; .endef
|
||
|
+.p2align 5
|
||
|
+gcm_gmult_avx:
|
||
|
+ jmp .L_gmult_clmul
|
||
|
+
|
||
|
+.globl gcm_ghash_avx
|
||
|
+.def gcm_ghash_avx; .scl 2; .type 32; .endef
|
||
|
+.p2align 5
|
||
|
+gcm_ghash_avx:
|
||
|
+ jmp .L_ghash_clmul
|
||
|
|
||
|
.p2align 6
|
||
|
.Lbswap_mask:
|
||
|
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
|
||
|
.L0x1c2_polynomial:
|
||
|
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
|
||
|
+.L7_mask:
|
||
|
+.long 7,0,7,0
|
||
|
+.L7_mask_poly:
|
||
|
+.long 7,0,450,0
|
||
|
.p2align 6
|
||
|
|
||
|
.Lrem_4bit:
|
||
|
@@ -1189,10 +1491,13 @@ se_handler:
|
||
|
.rva .LSEH_end_gcm_ghash_4bit
|
||
|
.rva .LSEH_info_gcm_ghash_4bit
|
||
|
|
||
|
+.rva .LSEH_begin_gcm_init_clmul
|
||
|
+.rva .LSEH_end_gcm_init_clmul
|
||
|
+.rva .LSEH_info_gcm_init_clmul
|
||
|
+
|
||
|
.rva .LSEH_begin_gcm_ghash_clmul
|
||
|
.rva .LSEH_end_gcm_ghash_clmul
|
||
|
.rva .LSEH_info_gcm_ghash_clmul
|
||
|
-
|
||
|
.section .xdata
|
||
|
.p2align 3
|
||
|
.LSEH_info_gcm_gmult_4bit:
|
||
|
@@ -1203,11 +1508,20 @@ se_handler:
|
||
|
.byte 9,0,0,0
|
||
|
.rva se_handler
|
||
|
.rva .Lghash_prologue,.Lghash_epilogue
|
||
|
+.LSEH_info_gcm_init_clmul:
|
||
|
+.byte 0x01,0x08,0x03,0x00
|
||
|
+.byte 0x08,0x68,0x00,0x00
|
||
|
+.byte 0x04,0x22,0x00,0x00
|
||
|
.LSEH_info_gcm_ghash_clmul:
|
||
|
-.byte 0x01,0x1f,0x0b,0x00
|
||
|
-.byte 0x1f,0xa8,0x04,0x00
|
||
|
-.byte 0x19,0x98,0x03,0x00
|
||
|
-.byte 0x13,0x88,0x02,0x00
|
||
|
-.byte 0x0d,0x78,0x01,0x00
|
||
|
+.byte 0x01,0x33,0x16,0x00
|
||
|
+.byte 0x33,0xf8,0x09,0x00
|
||
|
+.byte 0x2e,0xe8,0x08,0x00
|
||
|
+.byte 0x29,0xd8,0x07,0x00
|
||
|
+.byte 0x24,0xc8,0x06,0x00
|
||
|
+.byte 0x1f,0xb8,0x05,0x00
|
||
|
+.byte 0x1a,0xa8,0x04,0x00
|
||
|
+.byte 0x15,0x98,0x03,0x00
|
||
|
+.byte 0x10,0x88,0x02,0x00
|
||
|
+.byte 0x0c,0x78,0x01,0x00
|
||
|
.byte 0x08,0x68,0x00,0x00
|
||
|
-.byte 0x04,0xa2,0x00,0x00
|
||
|
+.byte 0x04,0x01,0x15,0x00
|
||
|
diff --git a/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s b/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s
|
||
|
index 7bd9665..224a226 100644
|
||
|
--- a/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s
|
||
|
+++ b/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s
|
||
|
@@ -997,211 +997,423 @@ aesni_ctr32_encrypt_blocks:
|
||
|
movq %r9,%rcx
|
||
|
movq 40(%rsp),%r8
|
||
|
|
||
|
- leaq -200(%rsp),%rsp
|
||
|
- movaps %xmm6,32(%rsp)
|
||
|
- movaps %xmm7,48(%rsp)
|
||
|
- movaps %xmm8,64(%rsp)
|
||
|
- movaps %xmm9,80(%rsp)
|
||
|
- movaps %xmm10,96(%rsp)
|
||
|
- movaps %xmm11,112(%rsp)
|
||
|
- movaps %xmm12,128(%rsp)
|
||
|
- movaps %xmm13,144(%rsp)
|
||
|
- movaps %xmm14,160(%rsp)
|
||
|
- movaps %xmm15,176(%rsp)
|
||
|
+ leaq (%rsp),%rax
|
||
|
+ pushq %rbp
|
||
|
+ subq $288,%rsp
|
||
|
+ andq $-16,%rsp
|
||
|
+ movaps %xmm6,-168(%rax)
|
||
|
+ movaps %xmm7,-152(%rax)
|
||
|
+ movaps %xmm8,-136(%rax)
|
||
|
+ movaps %xmm9,-120(%rax)
|
||
|
+ movaps %xmm10,-104(%rax)
|
||
|
+ movaps %xmm11,-88(%rax)
|
||
|
+ movaps %xmm12,-72(%rax)
|
||
|
+ movaps %xmm13,-56(%rax)
|
||
|
+ movaps %xmm14,-40(%rax)
|
||
|
+ movaps %xmm15,-24(%rax)
|
||
|
.Lctr32_body:
|
||
|
+ leaq -8(%rax),%rbp
|
||
|
+
|
||
|
cmpq $1,%rdx
|
||
|
je .Lctr32_one_shortcut
|
||
|
|
||
|
- movdqu (%r8),%xmm14
|
||
|
- movdqa .Lbswap_mask(%rip),%xmm15
|
||
|
- xorl %eax,%eax
|
||
|
-.byte 102,69,15,58,22,242,3
|
||
|
-.byte 102,68,15,58,34,240,3
|
||
|
+ movdqu (%r8),%xmm2
|
||
|
+ movdqu (%rcx),%xmm0
|
||
|
+ movl 12(%r8),%r8d
|
||
|
+ pxor %xmm0,%xmm2
|
||
|
+ movl 12(%rcx),%r11d
|
||
|
+ movdqa %xmm2,0(%rsp)
|
||
|
+ bswapl %r8d
|
||
|
+ movdqa %xmm2,%xmm3
|
||
|
+ movdqa %xmm2,%xmm4
|
||
|
+ movdqa %xmm2,%xmm5
|
||
|
+ movdqa %xmm2,64(%rsp)
|
||
|
+ movdqa %xmm2,80(%rsp)
|
||
|
+ movdqa %xmm2,96(%rsp)
|
||
|
+ movdqa %xmm2,112(%rsp)
|
||
|
|
||
|
movl 240(%rcx),%eax
|
||
|
+
|
||
|
+ leaq 1(%r8),%r9
|
||
|
+ leaq 2(%r8),%r10
|
||
|
+ bswapl %r9d
|
||
|
bswapl %r10d
|
||
|
- pxor %xmm12,%xmm12
|
||
|
- pxor %xmm13,%xmm13
|
||
|
-.byte 102,69,15,58,34,226,0
|
||
|
- leaq 3(%r10),%r11
|
||
|
-.byte 102,69,15,58,34,235,0
|
||
|
- incl %r10d
|
||
|
-.byte 102,69,15,58,34,226,1
|
||
|
- incq %r11
|
||
|
-.byte 102,69,15,58,34,235,1
|
||
|
- incl %r10d
|
||
|
-.byte 102,69,15,58,34,226,2
|
||
|
- incq %r11
|
||
|
-.byte 102,69,15,58,34,235,2
|
||
|
- movdqa %xmm12,0(%rsp)
|
||
|
-.byte 102,69,15,56,0,231
|
||
|
- movdqa %xmm13,16(%rsp)
|
||
|
-.byte 102,69,15,56,0,239
|
||
|
-
|
||
|
- pshufd $192,%xmm12,%xmm2
|
||
|
- pshufd $128,%xmm12,%xmm3
|
||
|
- pshufd $64,%xmm12,%xmm4
|
||
|
- cmpq $6,%rdx
|
||
|
- jb .Lctr32_tail
|
||
|
- shrl $1,%eax
|
||
|
- movq %rcx,%r11
|
||
|
- movl %eax,%r10d
|
||
|
- subq $6,%rdx
|
||
|
- jmp .Lctr32_loop6
|
||
|
+ xorl %r11d,%r9d
|
||
|
+ xorl %r11d,%r10d
|
||
|
+.byte 102,65,15,58,34,217,3
|
||
|
+ leaq 3(%r8),%r9
|
||
|
+ movdqa %xmm3,16(%rsp)
|
||
|
+.byte 102,65,15,58,34,226,3
|
||
|
+ bswapl %r9d
|
||
|
+ leaq 4(%r8),%r10
|
||
|
+ movdqa %xmm4,32(%rsp)
|
||
|
+ xorl %r11d,%r9d
|
||
|
+ bswapl %r10d
|
||
|
+.byte 102,65,15,58,34,233,3
|
||
|
+ xorl %r11d,%r10d
|
||
|
+ movdqa %xmm5,48(%rsp)
|
||
|
+ leaq 5(%r8),%r9
|
||
|
+ movl %r10d,64+12(%rsp)
|
||
|
+ bswapl %r9d
|
||
|
+ leaq 6(%r8),%r10
|
||
|
+ xorl %r11d,%r9d
|
||
|
+ bswapl %r10d
|
||
|
+ movl %r9d,80+12(%rsp)
|
||
|
+ xorl %r11d,%r10d
|
||
|
+ leaq 7(%r8),%r9
|
||
|
+ movl %r10d,96+12(%rsp)
|
||
|
+ bswapl %r9d
|
||
|
+ xorl %r11d,%r9d
|
||
|
+ movl %r9d,112+12(%rsp)
|
||
|
|
||
|
-.p2align 4
|
||
|
-.Lctr32_loop6:
|
||
|
- pshufd $192,%xmm13,%xmm5
|
||
|
- por %xmm14,%xmm2
|
||
|
- movups (%r11),%xmm0
|
||
|
- pshufd $128,%xmm13,%xmm6
|
||
|
- por %xmm14,%xmm3
|
||
|
- movups 16(%r11),%xmm1
|
||
|
- pshufd $64,%xmm13,%xmm7
|
||
|
- por %xmm14,%xmm4
|
||
|
- por %xmm14,%xmm5
|
||
|
- xorps %xmm0,%xmm2
|
||
|
- por %xmm14,%xmm6
|
||
|
- por %xmm14,%xmm7
|
||
|
+ movups 16(%rcx),%xmm1
|
||
|
|
||
|
+ movdqa 64(%rsp),%xmm6
|
||
|
+ movdqa 80(%rsp),%xmm7
|
||
|
|
||
|
+ cmpq $8,%rdx
|
||
|
+ jb .Lctr32_tail
|
||
|
|
||
|
+ leaq 128(%rcx),%rcx
|
||
|
+ subq $8,%rdx
|
||
|
+ jmp .Lctr32_loop8
|
||
|
|
||
|
- pxor %xmm0,%xmm3
|
||
|
+.p2align 5
|
||
|
+.Lctr32_loop8:
|
||
|
+ addl $8,%r8d
|
||
|
+ movdqa 96(%rsp),%xmm8
|
||
|
.byte 102,15,56,220,209
|
||
|
- leaq 32(%r11),%rcx
|
||
|
- pxor %xmm0,%xmm4
|
||
|
+ movl %r8d,%r9d
|
||
|
+ movdqa 112(%rsp),%xmm9
|
||
|
.byte 102,15,56,220,217
|
||
|
- movdqa .Lincrement32(%rip),%xmm13
|
||
|
- pxor %xmm0,%xmm5
|
||
|
+ bswapl %r9d
|
||
|
+ movups 32-128(%rcx),%xmm0
|
||
|
.byte 102,15,56,220,225
|
||
|
- movdqa 0(%rsp),%xmm12
|
||
|
- pxor %xmm0,%xmm6
|
||
|
+ xorl %r11d,%r9d
|
||
|
.byte 102,15,56,220,233
|
||
|
- pxor %xmm0,%xmm7
|
||
|
- movups (%rcx),%xmm0
|
||
|
- decl %eax
|
||
|
+ movl %r9d,0+12(%rsp)
|
||
|
+ leaq 1(%r8),%r9
|
||
|
.byte 102,15,56,220,241
|
||
|
.byte 102,15,56,220,249
|
||
|
- jmp .Lctr32_enc_loop6_enter
|
||
|
-.p2align 4
|
||
|
-.Lctr32_enc_loop6:
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 48-128(%rcx),%xmm1
|
||
|
+.byte 102,15,56,220,208
|
||
|
+.byte 102,15,56,220,216
|
||
|
+ bswapl %r9d
|
||
|
+.byte 102,15,56,220,224
|
||
|
+ xorl %r11d,%r9d
|
||
|
+.byte 102,15,56,220,232
|
||
|
+ movl %r9d,16+12(%rsp)
|
||
|
+ leaq 2(%r8),%r9
|
||
|
+.byte 102,15,56,220,240
|
||
|
+.byte 102,15,56,220,248
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 64-128(%rcx),%xmm0
|
||
|
.byte 102,15,56,220,209
|
||
|
.byte 102,15,56,220,217
|
||
|
- decl %eax
|
||
|
+ bswapl %r9d
|
||
|
.byte 102,15,56,220,225
|
||
|
+ xorl %r11d,%r9d
|
||
|
.byte 102,15,56,220,233
|
||
|
+ movl %r9d,32+12(%rsp)
|
||
|
+ leaq 3(%r8),%r9
|
||
|
.byte 102,15,56,220,241
|
||
|
.byte 102,15,56,220,249
|
||
|
-.Lctr32_enc_loop6_enter:
|
||
|
- movups 16(%rcx),%xmm1
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 80-128(%rcx),%xmm1
|
||
|
.byte 102,15,56,220,208
|
||
|
.byte 102,15,56,220,216
|
||
|
- leaq 32(%rcx),%rcx
|
||
|
+ bswapl %r9d
|
||
|
.byte 102,15,56,220,224
|
||
|
+ xorl %r11d,%r9d
|
||
|
.byte 102,15,56,220,232
|
||
|
+ movl %r9d,48+12(%rsp)
|
||
|
+ leaq 4(%r8),%r9
|
||
|
.byte 102,15,56,220,240
|
||
|
.byte 102,15,56,220,248
|
||
|
- movups (%rcx),%xmm0
|
||
|
- jnz .Lctr32_enc_loop6
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 96-128(%rcx),%xmm0
|
||
|
+.byte 102,15,56,220,209
|
||
|
+.byte 102,15,56,220,217
|
||
|
+ bswapl %r9d
|
||
|
+.byte 102,15,56,220,225
|
||
|
+ xorl %r11d,%r9d
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ movl %r9d,64+12(%rsp)
|
||
|
+ leaq 5(%r8),%r9
|
||
|
+.byte 102,15,56,220,241
|
||
|
+.byte 102,15,56,220,249
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 112-128(%rcx),%xmm1
|
||
|
+.byte 102,15,56,220,208
|
||
|
+.byte 102,15,56,220,216
|
||
|
+ bswapl %r9d
|
||
|
+.byte 102,15,56,220,224
|
||
|
+ xorl %r11d,%r9d
|
||
|
+.byte 102,15,56,220,232
|
||
|
+ movl %r9d,80+12(%rsp)
|
||
|
+ leaq 6(%r8),%r9
|
||
|
+.byte 102,15,56,220,240
|
||
|
+.byte 102,15,56,220,248
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 128-128(%rcx),%xmm0
|
||
|
+.byte 102,15,56,220,209
|
||
|
+.byte 102,15,56,220,217
|
||
|
+ bswapl %r9d
|
||
|
+.byte 102,15,56,220,225
|
||
|
+ xorl %r11d,%r9d
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ movl %r9d,96+12(%rsp)
|
||
|
+ leaq 7(%r8),%r9
|
||
|
+.byte 102,15,56,220,241
|
||
|
+.byte 102,15,56,220,249
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 144-128(%rcx),%xmm1
|
||
|
+.byte 102,15,56,220,208
|
||
|
+.byte 102,15,56,220,216
|
||
|
+ bswapl %r9d
|
||
|
+.byte 102,15,56,220,224
|
||
|
+ xorl %r11d,%r9d
|
||
|
+.byte 102,15,56,220,232
|
||
|
+ movl %r9d,112+12(%rsp)
|
||
|
+.byte 102,15,56,220,240
|
||
|
+.byte 102,15,56,220,248
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+ movdqu 0(%rdi),%xmm10
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 160-128(%rcx),%xmm0
|
||
|
+
|
||
|
+ cmpl $11,%eax
|
||
|
+ jb .Lctr32_enc_done
|
||
|
|
||
|
.byte 102,15,56,220,209
|
||
|
- paddd %xmm13,%xmm12
|
||
|
.byte 102,15,56,220,217
|
||
|
- paddd 16(%rsp),%xmm13
|
||
|
.byte 102,15,56,220,225
|
||
|
- movdqa %xmm12,0(%rsp)
|
||
|
.byte 102,15,56,220,233
|
||
|
- movdqa %xmm13,16(%rsp)
|
||
|
.byte 102,15,56,220,241
|
||
|
-.byte 102,69,15,56,0,231
|
||
|
.byte 102,15,56,220,249
|
||
|
-.byte 102,69,15,56,0,239
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 176-128(%rcx),%xmm1
|
||
|
|
||
|
-.byte 102,15,56,221,208
|
||
|
- movups (%rdi),%xmm8
|
||
|
-.byte 102,15,56,221,216
|
||
|
- movups 16(%rdi),%xmm9
|
||
|
-.byte 102,15,56,221,224
|
||
|
- movups 32(%rdi),%xmm10
|
||
|
-.byte 102,15,56,221,232
|
||
|
- movups 48(%rdi),%xmm11
|
||
|
-.byte 102,15,56,221,240
|
||
|
- movups 64(%rdi),%xmm1
|
||
|
-.byte 102,15,56,221,248
|
||
|
- movups 80(%rdi),%xmm0
|
||
|
- leaq 96(%rdi),%rdi
|
||
|
+.byte 102,15,56,220,208
|
||
|
+.byte 102,15,56,220,216
|
||
|
+.byte 102,15,56,220,224
|
||
|
+.byte 102,15,56,220,232
|
||
|
+.byte 102,15,56,220,240
|
||
|
+.byte 102,15,56,220,248
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 192-128(%rcx),%xmm0
|
||
|
+ je .Lctr32_enc_done
|
||
|
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- pshufd $192,%xmm12,%xmm2
|
||
|
- xorps %xmm3,%xmm9
|
||
|
- pshufd $128,%xmm12,%xmm3
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- xorps %xmm4,%xmm10
|
||
|
- pshufd $64,%xmm12,%xmm4
|
||
|
- movups %xmm9,16(%rsi)
|
||
|
- xorps %xmm5,%xmm11
|
||
|
- movups %xmm10,32(%rsi)
|
||
|
- xorps %xmm6,%xmm1
|
||
|
- movups %xmm11,48(%rsi)
|
||
|
- xorps %xmm7,%xmm0
|
||
|
- movups %xmm1,64(%rsi)
|
||
|
- movups %xmm0,80(%rsi)
|
||
|
- leaq 96(%rsi),%rsi
|
||
|
- movl %r10d,%eax
|
||
|
- subq $6,%rdx
|
||
|
- jnc .Lctr32_loop6
|
||
|
+.byte 102,15,56,220,209
|
||
|
+.byte 102,15,56,220,217
|
||
|
+.byte 102,15,56,220,225
|
||
|
+.byte 102,15,56,220,233
|
||
|
+.byte 102,15,56,220,241
|
||
|
+.byte 102,15,56,220,249
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 208-128(%rcx),%xmm1
|
||
|
|
||
|
- addq $6,%rdx
|
||
|
+.byte 102,15,56,220,208
|
||
|
+.byte 102,15,56,220,216
|
||
|
+.byte 102,15,56,220,224
|
||
|
+.byte 102,15,56,220,232
|
||
|
+.byte 102,15,56,220,240
|
||
|
+.byte 102,15,56,220,248
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 224-128(%rcx),%xmm0
|
||
|
+
|
||
|
+.Lctr32_enc_done:
|
||
|
+ movdqu 16(%rdi),%xmm11
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ movdqu 32(%rdi),%xmm12
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
+ movdqu 48(%rdi),%xmm13
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
+ movdqu 64(%rdi),%xmm14
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
+ movdqu 80(%rdi),%xmm15
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ pxor %xmm0,%xmm15
|
||
|
+.byte 102,15,56,220,217
|
||
|
+.byte 102,15,56,220,225
|
||
|
+.byte 102,15,56,220,233
|
||
|
+.byte 102,15,56,220,241
|
||
|
+.byte 102,15,56,220,249
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movdqu 96(%rdi),%xmm1
|
||
|
+
|
||
|
+.byte 102,65,15,56,221,210
|
||
|
+ pxor %xmm0,%xmm1
|
||
|
+ movdqu 112(%rdi),%xmm10
|
||
|
+ leaq 128(%rdi),%rdi
|
||
|
+.byte 102,65,15,56,221,219
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ movdqa 0(%rsp),%xmm11
|
||
|
+.byte 102,65,15,56,221,228
|
||
|
+ movdqa 16(%rsp),%xmm12
|
||
|
+.byte 102,65,15,56,221,237
|
||
|
+ movdqa 32(%rsp),%xmm13
|
||
|
+.byte 102,65,15,56,221,246
|
||
|
+ movdqa 48(%rsp),%xmm14
|
||
|
+.byte 102,65,15,56,221,255
|
||
|
+ movdqa 64(%rsp),%xmm15
|
||
|
+.byte 102,68,15,56,221,193
|
||
|
+ movdqa 80(%rsp),%xmm0
|
||
|
+.byte 102,69,15,56,221,202
|
||
|
+ movups 16-128(%rcx),%xmm1
|
||
|
+
|
||
|
+ movups %xmm2,(%rsi)
|
||
|
+ movdqa %xmm11,%xmm2
|
||
|
+ movups %xmm3,16(%rsi)
|
||
|
+ movdqa %xmm12,%xmm3
|
||
|
+ movups %xmm4,32(%rsi)
|
||
|
+ movdqa %xmm13,%xmm4
|
||
|
+ movups %xmm5,48(%rsi)
|
||
|
+ movdqa %xmm14,%xmm5
|
||
|
+ movups %xmm6,64(%rsi)
|
||
|
+ movdqa %xmm15,%xmm6
|
||
|
+ movups %xmm7,80(%rsi)
|
||
|
+ movdqa %xmm0,%xmm7
|
||
|
+ movups %xmm8,96(%rsi)
|
||
|
+ movups %xmm9,112(%rsi)
|
||
|
+ leaq 128(%rsi),%rsi
|
||
|
+
|
||
|
+ subq $8,%rdx
|
||
|
+ jnc .Lctr32_loop8
|
||
|
+
|
||
|
+ addq $8,%rdx
|
||
|
jz .Lctr32_done
|
||
|
- movq %r11,%rcx
|
||
|
- leal 1(%rax,%rax,1),%eax
|
||
|
+ leaq -128(%rcx),%rcx
|
||
|
|
||
|
.Lctr32_tail:
|
||
|
- por %xmm14,%xmm2
|
||
|
- movups (%rdi),%xmm8
|
||
|
- cmpq $2,%rdx
|
||
|
- jb .Lctr32_one
|
||
|
+ leaq 16(%rcx),%rcx
|
||
|
+ cmpq $4,%rdx
|
||
|
+ jb .Lctr32_loop3
|
||
|
+ je .Lctr32_loop4
|
||
|
|
||
|
- por %xmm14,%xmm3
|
||
|
- movups 16(%rdi),%xmm9
|
||
|
- je .Lctr32_two
|
||
|
+ movdqa 96(%rsp),%xmm8
|
||
|
+ pxor %xmm9,%xmm9
|
||
|
|
||
|
- pshufd $192,%xmm13,%xmm5
|
||
|
- por %xmm14,%xmm4
|
||
|
- movups 32(%rdi),%xmm10
|
||
|
- cmpq $4,%rdx
|
||
|
- jb .Lctr32_three
|
||
|
+ movups 16(%rcx),%xmm0
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ leaq 16(%rcx),%rcx
|
||
|
+.byte 102,15,56,220,217
|
||
|
+ shrl $1,%eax
|
||
|
+.byte 102,15,56,220,225
|
||
|
+ decl %eax
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ movups (%rdi),%xmm10
|
||
|
+.byte 102,15,56,220,241
|
||
|
+ movups 16(%rdi),%xmm11
|
||
|
+.byte 102,15,56,220,249
|
||
|
+ movups 32(%rdi),%xmm12
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+ movups 16(%rcx),%xmm1
|
||
|
|
||
|
- pshufd $128,%xmm13,%xmm6
|
||
|
- por %xmm14,%xmm5
|
||
|
- movups 48(%rdi),%xmm11
|
||
|
- je .Lctr32_four
|
||
|
+ call .Lenc_loop8_enter
|
||
|
|
||
|
- por %xmm14,%xmm6
|
||
|
- xorps %xmm7,%xmm7
|
||
|
+ movdqu 48(%rdi),%xmm13
|
||
|
+ pxor %xmm10,%xmm2
|
||
|
+ movdqu 64(%rdi),%xmm10
|
||
|
+ pxor %xmm11,%xmm3
|
||
|
+ movdqu %xmm2,(%rsi)
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm3,16(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ pxor %xmm10,%xmm6
|
||
|
+ movdqu %xmm5,48(%rsi)
|
||
|
+ movdqu %xmm6,64(%rsi)
|
||
|
+ cmpq $6,%rdx
|
||
|
+ jb .Lctr32_done
|
||
|
|
||
|
- call _aesni_encrypt6
|
||
|
+ movups 80(%rdi),%xmm11
|
||
|
+ xorps %xmm11,%xmm7
|
||
|
+ movups %xmm7,80(%rsi)
|
||
|
+ je .Lctr32_done
|
||
|
|
||
|
- movups 64(%rdi),%xmm1
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- xorps %xmm3,%xmm9
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- xorps %xmm4,%xmm10
|
||
|
- movups %xmm9,16(%rsi)
|
||
|
- xorps %xmm5,%xmm11
|
||
|
- movups %xmm10,32(%rsi)
|
||
|
- xorps %xmm6,%xmm1
|
||
|
- movups %xmm11,48(%rsi)
|
||
|
- movups %xmm1,64(%rsi)
|
||
|
+ movups 96(%rdi),%xmm12
|
||
|
+ xorps %xmm12,%xmm8
|
||
|
+ movups %xmm8,96(%rsi)
|
||
|
+ jmp .Lctr32_done
|
||
|
+
|
||
|
+.p2align 5
|
||
|
+.Lctr32_loop4:
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ leaq 16(%rcx),%rcx
|
||
|
+.byte 102,15,56,220,217
|
||
|
+.byte 102,15,56,220,225
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ movups (%rcx),%xmm1
|
||
|
+ decl %eax
|
||
|
+ jnz .Lctr32_loop4
|
||
|
+.byte 102,15,56,221,209
|
||
|
+ movups (%rdi),%xmm10
|
||
|
+.byte 102,15,56,221,217
|
||
|
+ movups 16(%rdi),%xmm11
|
||
|
+.byte 102,15,56,221,225
|
||
|
+ movups 32(%rdi),%xmm12
|
||
|
+.byte 102,15,56,221,233
|
||
|
+ movups 48(%rdi),%xmm13
|
||
|
+
|
||
|
+ xorps %xmm10,%xmm2
|
||
|
+ movups %xmm2,(%rsi)
|
||
|
+ xorps %xmm11,%xmm3
|
||
|
+ movups %xmm3,16(%rsi)
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm5,48(%rsi)
|
||
|
+ jmp .Lctr32_done
|
||
|
+
|
||
|
+.p2align 5
|
||
|
+.Lctr32_loop3:
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ leaq 16(%rcx),%rcx
|
||
|
+.byte 102,15,56,220,217
|
||
|
+.byte 102,15,56,220,225
|
||
|
+ movups (%rcx),%xmm1
|
||
|
+ decl %eax
|
||
|
+ jnz .Lctr32_loop3
|
||
|
+.byte 102,15,56,221,209
|
||
|
+.byte 102,15,56,221,217
|
||
|
+.byte 102,15,56,221,225
|
||
|
+
|
||
|
+ movups (%rdi),%xmm10
|
||
|
+ xorps %xmm10,%xmm2
|
||
|
+ movups %xmm2,(%rsi)
|
||
|
+ cmpq $2,%rdx
|
||
|
+ jb .Lctr32_done
|
||
|
+
|
||
|
+ movups 16(%rdi),%xmm11
|
||
|
+ xorps %xmm11,%xmm3
|
||
|
+ movups %xmm3,16(%rsi)
|
||
|
+ je .Lctr32_done
|
||
|
+
|
||
|
+ movups 32(%rdi),%xmm12
|
||
|
+ xorps %xmm12,%xmm4
|
||
|
+ movups %xmm4,32(%rsi)
|
||
|
jmp .Lctr32_done
|
||
|
|
||
|
.p2align 4
|
||
|
.Lctr32_one_shortcut:
|
||
|
movups (%r8),%xmm2
|
||
|
- movups (%rdi),%xmm8
|
||
|
+ movups (%rdi),%xmm10
|
||
|
movl 240(%rcx),%eax
|
||
|
-.Lctr32_one:
|
||
|
movups (%rcx),%xmm0
|
||
|
movups 16(%rcx),%xmm1
|
||
|
leaq 32(%rcx),%rcx
|
||
|
@@ -1213,56 +1425,25 @@ aesni_ctr32_encrypt_blocks:
|
||
|
leaq 16(%rcx),%rcx
|
||
|
jnz .Loop_enc1_7
|
||
|
.byte 102,15,56,221,209
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- jmp .Lctr32_done
|
||
|
-
|
||
|
-.p2align 4
|
||
|
-.Lctr32_two:
|
||
|
- xorps %xmm4,%xmm4
|
||
|
- call _aesni_encrypt3
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- xorps %xmm3,%xmm9
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- movups %xmm9,16(%rsi)
|
||
|
- jmp .Lctr32_done
|
||
|
-
|
||
|
-.p2align 4
|
||
|
-.Lctr32_three:
|
||
|
- call _aesni_encrypt3
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- xorps %xmm3,%xmm9
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- xorps %xmm4,%xmm10
|
||
|
- movups %xmm9,16(%rsi)
|
||
|
- movups %xmm10,32(%rsi)
|
||
|
+ xorps %xmm10,%xmm2
|
||
|
+ movups %xmm2,(%rsi)
|
||
|
jmp .Lctr32_done
|
||
|
|
||
|
.p2align 4
|
||
|
-.Lctr32_four:
|
||
|
- call _aesni_encrypt4
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- xorps %xmm3,%xmm9
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- xorps %xmm4,%xmm10
|
||
|
- movups %xmm9,16(%rsi)
|
||
|
- xorps %xmm5,%xmm11
|
||
|
- movups %xmm10,32(%rsi)
|
||
|
- movups %xmm11,48(%rsi)
|
||
|
-
|
||
|
.Lctr32_done:
|
||
|
- movaps 32(%rsp),%xmm6
|
||
|
- movaps 48(%rsp),%xmm7
|
||
|
- movaps 64(%rsp),%xmm8
|
||
|
- movaps 80(%rsp),%xmm9
|
||
|
- movaps 96(%rsp),%xmm10
|
||
|
- movaps 112(%rsp),%xmm11
|
||
|
- movaps 128(%rsp),%xmm12
|
||
|
- movaps 144(%rsp),%xmm13
|
||
|
- movaps 160(%rsp),%xmm14
|
||
|
- movaps 176(%rsp),%xmm15
|
||
|
- leaq 200(%rsp),%rsp
|
||
|
-.Lctr32_ret:
|
||
|
+ movaps -160(%rbp),%xmm6
|
||
|
+ movaps -144(%rbp),%xmm7
|
||
|
+ movaps -128(%rbp),%xmm8
|
||
|
+ movaps -112(%rbp),%xmm9
|
||
|
+ movaps -96(%rbp),%xmm10
|
||
|
+ movaps -80(%rbp),%xmm11
|
||
|
+ movaps -64(%rbp),%xmm12
|
||
|
+ movaps -48(%rbp),%xmm13
|
||
|
+ movaps -32(%rbp),%xmm14
|
||
|
+ movaps -16(%rbp),%xmm15
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ popq %rbp
|
||
|
+.Lctr32_epilogue:
|
||
|
movq 8(%rsp),%rdi
|
||
|
movq 16(%rsp),%rsi
|
||
|
.byte 0xf3,0xc3
|
||
|
@@ -1282,18 +1463,22 @@ aesni_xts_encrypt:
|
||
|
movq 40(%rsp),%r8
|
||
|
movq 48(%rsp),%r9
|
||
|
|
||
|
- leaq -264(%rsp),%rsp
|
||
|
- movaps %xmm6,96(%rsp)
|
||
|
- movaps %xmm7,112(%rsp)
|
||
|
- movaps %xmm8,128(%rsp)
|
||
|
- movaps %xmm9,144(%rsp)
|
||
|
- movaps %xmm10,160(%rsp)
|
||
|
- movaps %xmm11,176(%rsp)
|
||
|
- movaps %xmm12,192(%rsp)
|
||
|
- movaps %xmm13,208(%rsp)
|
||
|
- movaps %xmm14,224(%rsp)
|
||
|
- movaps %xmm15,240(%rsp)
|
||
|
+ leaq (%rsp),%rax
|
||
|
+ pushq %rbp
|
||
|
+ subq $272,%rsp
|
||
|
+ andq $-16,%rsp
|
||
|
+ movaps %xmm6,-168(%rax)
|
||
|
+ movaps %xmm7,-152(%rax)
|
||
|
+ movaps %xmm8,-136(%rax)
|
||
|
+ movaps %xmm9,-120(%rax)
|
||
|
+ movaps %xmm10,-104(%rax)
|
||
|
+ movaps %xmm11,-88(%rax)
|
||
|
+ movaps %xmm12,-72(%rax)
|
||
|
+ movaps %xmm13,-56(%rax)
|
||
|
+ movaps %xmm14,-40(%rax)
|
||
|
+ movaps %xmm15,-24(%rax)
|
||
|
.Lxts_enc_body:
|
||
|
+ leaq -8(%rax),%rbp
|
||
|
movups (%r9),%xmm15
|
||
|
movl 240(%r8),%eax
|
||
|
movl 240(%rcx),%r10d
|
||
|
@@ -1308,228 +1493,266 @@ aesni_xts_encrypt:
|
||
|
leaq 16(%r8),%r8
|
||
|
jnz .Loop_enc1_8
|
||
|
.byte 102,68,15,56,221,249
|
||
|
+ movups (%rcx),%xmm0
|
||
|
movq %rcx,%r11
|
||
|
movl %r10d,%eax
|
||
|
+ shll $4,%r10d
|
||
|
movq %rdx,%r9
|
||
|
andq $-16,%rdx
|
||
|
|
||
|
+ movups 16(%rcx,%r10,1),%xmm1
|
||
|
+ movl %eax,%r10d
|
||
|
+
|
||
|
movdqa .Lxts_magic(%rip),%xmm8
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pshufd $95,%xmm15,%xmm9
|
||
|
+ pxor %xmm0,%xmm1
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm10
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm11
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm12
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm13
|
||
|
+ psrad $31,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm15,%xmm14
|
||
|
+ psrad $31,%xmm9
|
||
|
paddq %xmm15,%xmm15
|
||
|
pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
pxor %xmm9,%xmm15
|
||
|
+ movaps %xmm1,96(%rsp)
|
||
|
+
|
||
|
subq $96,%rdx
|
||
|
jc .Lxts_enc_short
|
||
|
|
||
|
shrl $1,%eax
|
||
|
- subl $1,%eax
|
||
|
+ subl $3,%eax
|
||
|
+ movups 16(%r11),%xmm1
|
||
|
movl %eax,%r10d
|
||
|
+ leaq .Lxts_magic(%rip),%r8
|
||
|
jmp .Lxts_enc_grandloop
|
||
|
|
||
|
-.p2align 4
|
||
|
+.p2align 5
|
||
|
.Lxts_enc_grandloop:
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- movdqa %xmm15,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
movdqu 0(%rdi),%xmm2
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ movdqa %xmm0,%xmm8
|
||
|
movdqu 16(%rdi),%xmm3
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-
|
||
|
- movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm10,%xmm2
|
||
|
- movdqu 48(%rdi),%xmm5
|
||
|
+ movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm11,%xmm3
|
||
|
- movdqu 64(%rdi),%xmm6
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ movdqu 48(%rdi),%xmm5
|
||
|
pxor %xmm12,%xmm4
|
||
|
- movdqu 80(%rdi),%xmm7
|
||
|
- leaq 96(%rdi),%rdi
|
||
|
+.byte 102,15,56,220,217
|
||
|
+ movdqu 64(%rdi),%xmm6
|
||
|
pxor %xmm13,%xmm5
|
||
|
- movups (%r11),%xmm0
|
||
|
+.byte 102,15,56,220,225
|
||
|
+ movdqu 80(%rdi),%xmm7
|
||
|
+ pxor %xmm15,%xmm8
|
||
|
+ movdqa 96(%rsp),%xmm9
|
||
|
pxor %xmm14,%xmm6
|
||
|
- pxor %xmm15,%xmm7
|
||
|
-
|
||
|
-
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ movups 32(%r11),%xmm0
|
||
|
+ leaq 96(%rdi),%rdi
|
||
|
+ pxor %xmm8,%xmm7
|
||
|
|
||
|
- movups 16(%r11),%xmm1
|
||
|
- pxor %xmm0,%xmm2
|
||
|
- pxor %xmm0,%xmm3
|
||
|
+ pxor %xmm9,%xmm10
|
||
|
+.byte 102,15,56,220,241
|
||
|
+ pxor %xmm9,%xmm11
|
||
|
movdqa %xmm10,0(%rsp)
|
||
|
-.byte 102,15,56,220,209
|
||
|
- leaq 32(%r11),%rcx
|
||
|
- pxor %xmm0,%xmm4
|
||
|
+.byte 102,15,56,220,249
|
||
|
+ movups 48(%r11),%xmm1
|
||
|
+
|
||
|
+.byte 102,15,56,220,208
|
||
|
+ pxor %xmm9,%xmm12
|
||
|
movdqa %xmm11,16(%rsp)
|
||
|
-.byte 102,15,56,220,217
|
||
|
- pxor %xmm0,%xmm5
|
||
|
+.byte 102,15,56,220,216
|
||
|
+ pxor %xmm9,%xmm13
|
||
|
movdqa %xmm12,32(%rsp)
|
||
|
-.byte 102,15,56,220,225
|
||
|
- pxor %xmm0,%xmm6
|
||
|
- movdqa %xmm13,48(%rsp)
|
||
|
-.byte 102,15,56,220,233
|
||
|
- pxor %xmm0,%xmm7
|
||
|
- movups (%rcx),%xmm0
|
||
|
- decl %eax
|
||
|
+.byte 102,15,56,220,224
|
||
|
+ pxor %xmm9,%xmm14
|
||
|
+.byte 102,15,56,220,232
|
||
|
+ pxor %xmm9,%xmm8
|
||
|
movdqa %xmm14,64(%rsp)
|
||
|
-.byte 102,15,56,220,241
|
||
|
- movdqa %xmm15,80(%rsp)
|
||
|
-.byte 102,15,56,220,249
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- jmp .Lxts_enc_loop6_enter
|
||
|
-
|
||
|
-.p2align 4
|
||
|
+.byte 102,15,56,220,240
|
||
|
+ movdqa %xmm8,80(%rsp)
|
||
|
+.byte 102,15,56,220,248
|
||
|
+ movups 64(%r11),%xmm0
|
||
|
+ leaq 64(%r11),%rcx
|
||
|
+ pshufd $95,%xmm15,%xmm9
|
||
|
+ jmp .Lxts_enc_loop6
|
||
|
+.p2align 5
|
||
|
.Lxts_enc_loop6:
|
||
|
.byte 102,15,56,220,209
|
||
|
.byte 102,15,56,220,217
|
||
|
- decl %eax
|
||
|
.byte 102,15,56,220,225
|
||
|
.byte 102,15,56,220,233
|
||
|
.byte 102,15,56,220,241
|
||
|
.byte 102,15,56,220,249
|
||
|
-.Lxts_enc_loop6_enter:
|
||
|
movups 16(%rcx),%xmm1
|
||
|
+ leaq 32(%rcx),%rcx
|
||
|
+
|
||
|
.byte 102,15,56,220,208
|
||
|
.byte 102,15,56,220,216
|
||
|
- leaq 32(%rcx),%rcx
|
||
|
.byte 102,15,56,220,224
|
||
|
.byte 102,15,56,220,232
|
||
|
.byte 102,15,56,220,240
|
||
|
.byte 102,15,56,220,248
|
||
|
movups (%rcx),%xmm0
|
||
|
+ decl %eax
|
||
|
jnz .Lxts_enc_loop6
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa (%r8),%xmm8
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,220,209
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,220,217
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ movups (%r11),%xmm10
|
||
|
.byte 102,15,56,220,225
|
||
|
- pxor %xmm9,%xmm15
|
||
|
.byte 102,15,56,220,233
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,220,241
|
||
|
+ movaps %xmm10,%xmm11
|
||
|
.byte 102,15,56,220,249
|
||
|
movups 16(%rcx),%xmm1
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm10
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,220,208
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ pxor %xmm15,%xmm10
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,220,216
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
.byte 102,15,56,220,224
|
||
|
- pxor %xmm9,%xmm15
|
||
|
.byte 102,15,56,220,232
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,220,240
|
||
|
+ movaps %xmm11,%xmm12
|
||
|
.byte 102,15,56,220,248
|
||
|
movups 32(%rcx),%xmm0
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm11
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,220,209
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ pxor %xmm15,%xmm11
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,220,217
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
.byte 102,15,56,220,225
|
||
|
- pxor %xmm9,%xmm15
|
||
|
+ movdqa %xmm13,48(%rsp)
|
||
|
.byte 102,15,56,220,233
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,220,241
|
||
|
+ movaps %xmm12,%xmm13
|
||
|
.byte 102,15,56,220,249
|
||
|
+ movups 48(%rcx),%xmm1
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm12
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
+.byte 102,15,56,220,208
|
||
|
+ pxor %xmm15,%xmm12
|
||
|
+ psrad $31,%xmm14
|
||
|
+.byte 102,15,56,220,216
|
||
|
paddq %xmm15,%xmm15
|
||
|
-.byte 102,15,56,221,208
|
||
|
- pand %xmm8,%xmm9
|
||
|
-.byte 102,15,56,221,216
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
-.byte 102,15,56,221,224
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-.byte 102,15,56,221,232
|
||
|
-.byte 102,15,56,221,240
|
||
|
-.byte 102,15,56,221,248
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+.byte 102,15,56,220,224
|
||
|
+.byte 102,15,56,220,232
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+.byte 102,15,56,220,240
|
||
|
+ movaps %xmm13,%xmm14
|
||
|
+.byte 102,15,56,220,248
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm13
|
||
|
+ movdqa %xmm9,%xmm0
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ pxor %xmm15,%xmm13
|
||
|
+ psrad $31,%xmm0
|
||
|
+.byte 102,15,56,220,217
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm0
|
||
|
+.byte 102,15,56,220,225
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ pxor %xmm0,%xmm15
|
||
|
+ movups (%r11),%xmm0
|
||
|
+.byte 102,15,56,220,241
|
||
|
+.byte 102,15,56,220,249
|
||
|
+ movups 16(%r11),%xmm1
|
||
|
+
|
||
|
+ pxor %xmm15,%xmm14
|
||
|
+ psrad $31,%xmm9
|
||
|
+.byte 102,15,56,221,84,36,0
|
||
|
paddq %xmm15,%xmm15
|
||
|
- xorps 0(%rsp),%xmm2
|
||
|
pand %xmm8,%xmm9
|
||
|
- xorps 16(%rsp),%xmm3
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+.byte 102,15,56,221,92,36,16
|
||
|
+.byte 102,15,56,221,100,36,32
|
||
|
pxor %xmm9,%xmm15
|
||
|
-
|
||
|
- xorps 32(%rsp),%xmm4
|
||
|
- movups %xmm2,0(%rsi)
|
||
|
- xorps 48(%rsp),%xmm5
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- xorps 64(%rsp),%xmm6
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- xorps 80(%rsp),%xmm7
|
||
|
- movups %xmm5,48(%rsi)
|
||
|
+.byte 102,15,56,221,108,36,48
|
||
|
+.byte 102,15,56,221,116,36,64
|
||
|
+.byte 102,15,56,221,124,36,80
|
||
|
movl %r10d,%eax
|
||
|
- movups %xmm6,64(%rsi)
|
||
|
- movups %xmm7,80(%rsi)
|
||
|
+
|
||
|
leaq 96(%rsi),%rsi
|
||
|
+ movups %xmm2,-96(%rsi)
|
||
|
+ movups %xmm3,-80(%rsi)
|
||
|
+ movups %xmm4,-64(%rsi)
|
||
|
+ movups %xmm5,-48(%rsi)
|
||
|
+ movups %xmm6,-32(%rsi)
|
||
|
+ movups %xmm7,-16(%rsi)
|
||
|
subq $96,%rdx
|
||
|
jnc .Lxts_enc_grandloop
|
||
|
|
||
|
- leal 3(%rax,%rax,1),%eax
|
||
|
+ leal 7(%rax,%rax,1),%eax
|
||
|
movq %r11,%rcx
|
||
|
movl %eax,%r10d
|
||
|
|
||
|
.Lxts_enc_short:
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
addq $96,%rdx
|
||
|
jz .Lxts_enc_done
|
||
|
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
cmpq $32,%rdx
|
||
|
jb .Lxts_enc_one
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
je .Lxts_enc_two
|
||
|
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
cmpq $64,%rdx
|
||
|
jb .Lxts_enc_three
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
je .Lxts_enc_four
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- movdqa %xmm15,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
movdqu (%rdi),%xmm2
|
||
|
- pand %xmm8,%xmm9
|
||
|
movdqu 16(%rdi),%xmm3
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-
|
||
|
movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm10,%xmm2
|
||
|
movdqu 48(%rdi),%xmm5
|
||
|
@@ -1632,15 +1855,15 @@ aesni_xts_encrypt:
|
||
|
|
||
|
call _aesni_encrypt4
|
||
|
|
||
|
- xorps %xmm10,%xmm2
|
||
|
- movdqa %xmm15,%xmm10
|
||
|
- xorps %xmm11,%xmm3
|
||
|
- xorps %xmm12,%xmm4
|
||
|
- movups %xmm2,(%rsi)
|
||
|
- xorps %xmm13,%xmm5
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- movups %xmm5,48(%rsi)
|
||
|
+ pxor %xmm10,%xmm2
|
||
|
+ movdqa %xmm14,%xmm10
|
||
|
+ pxor %xmm11,%xmm3
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm2,(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm3,16(%rsi)
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ movdqu %xmm5,48(%rsi)
|
||
|
leaq 64(%rsi),%rsi
|
||
|
jmp .Lxts_enc_done
|
||
|
|
||
|
@@ -1681,17 +1904,18 @@ aesni_xts_encrypt:
|
||
|
movups %xmm2,-16(%rsi)
|
||
|
|
||
|
.Lxts_enc_ret:
|
||
|
- movaps 96(%rsp),%xmm6
|
||
|
- movaps 112(%rsp),%xmm7
|
||
|
- movaps 128(%rsp),%xmm8
|
||
|
- movaps 144(%rsp),%xmm9
|
||
|
- movaps 160(%rsp),%xmm10
|
||
|
- movaps 176(%rsp),%xmm11
|
||
|
- movaps 192(%rsp),%xmm12
|
||
|
- movaps 208(%rsp),%xmm13
|
||
|
- movaps 224(%rsp),%xmm14
|
||
|
- movaps 240(%rsp),%xmm15
|
||
|
- leaq 264(%rsp),%rsp
|
||
|
+ movaps -160(%rbp),%xmm6
|
||
|
+ movaps -144(%rbp),%xmm7
|
||
|
+ movaps -128(%rbp),%xmm8
|
||
|
+ movaps -112(%rbp),%xmm9
|
||
|
+ movaps -96(%rbp),%xmm10
|
||
|
+ movaps -80(%rbp),%xmm11
|
||
|
+ movaps -64(%rbp),%xmm12
|
||
|
+ movaps -48(%rbp),%xmm13
|
||
|
+ movaps -32(%rbp),%xmm14
|
||
|
+ movaps -16(%rbp),%xmm15
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ popq %rbp
|
||
|
.Lxts_enc_epilogue:
|
||
|
movq 8(%rsp),%rdi
|
||
|
movq 16(%rsp),%rsi
|
||
|
@@ -1712,18 +1936,22 @@ aesni_xts_decrypt:
|
||
|
movq 40(%rsp),%r8
|
||
|
movq 48(%rsp),%r9
|
||
|
|
||
|
- leaq -264(%rsp),%rsp
|
||
|
- movaps %xmm6,96(%rsp)
|
||
|
- movaps %xmm7,112(%rsp)
|
||
|
- movaps %xmm8,128(%rsp)
|
||
|
- movaps %xmm9,144(%rsp)
|
||
|
- movaps %xmm10,160(%rsp)
|
||
|
- movaps %xmm11,176(%rsp)
|
||
|
- movaps %xmm12,192(%rsp)
|
||
|
- movaps %xmm13,208(%rsp)
|
||
|
- movaps %xmm14,224(%rsp)
|
||
|
- movaps %xmm15,240(%rsp)
|
||
|
+ leaq (%rsp),%rax
|
||
|
+ pushq %rbp
|
||
|
+ subq $272,%rsp
|
||
|
+ andq $-16,%rsp
|
||
|
+ movaps %xmm6,-168(%rax)
|
||
|
+ movaps %xmm7,-152(%rax)
|
||
|
+ movaps %xmm8,-136(%rax)
|
||
|
+ movaps %xmm9,-120(%rax)
|
||
|
+ movaps %xmm10,-104(%rax)
|
||
|
+ movaps %xmm11,-88(%rax)
|
||
|
+ movaps %xmm12,-72(%rax)
|
||
|
+ movaps %xmm13,-56(%rax)
|
||
|
+ movaps %xmm14,-40(%rax)
|
||
|
+ movaps %xmm15,-24(%rax)
|
||
|
.Lxts_dec_body:
|
||
|
+ leaq -8(%rax),%rbp
|
||
|
movups (%r9),%xmm15
|
||
|
movl 240(%r8),%eax
|
||
|
movl 240(%rcx),%r10d
|
||
|
@@ -1744,228 +1972,266 @@ aesni_xts_decrypt:
|
||
|
shlq $4,%rax
|
||
|
subq %rax,%rdx
|
||
|
|
||
|
+ movups (%rcx),%xmm0
|
||
|
movq %rcx,%r11
|
||
|
movl %r10d,%eax
|
||
|
+ shll $4,%r10d
|
||
|
movq %rdx,%r9
|
||
|
andq $-16,%rdx
|
||
|
|
||
|
+ movups 16(%rcx,%r10,1),%xmm1
|
||
|
+ movl %eax,%r10d
|
||
|
+
|
||
|
movdqa .Lxts_magic(%rip),%xmm8
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pshufd $95,%xmm15,%xmm9
|
||
|
+ pxor %xmm0,%xmm1
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm10
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm11
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm12
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm13
|
||
|
+ psrad $31,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm15,%xmm14
|
||
|
+ psrad $31,%xmm9
|
||
|
paddq %xmm15,%xmm15
|
||
|
pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
pxor %xmm9,%xmm15
|
||
|
+ movaps %xmm1,96(%rsp)
+
subq $96,%rdx
jc .Lxts_dec_short
shrl $1,%eax
- subl $1,%eax
+ subl $3,%eax
+ movups 16(%r11),%xmm1
movl %eax,%r10d
+ leaq .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
-.p2align 4
+.p2align 5
.Lxts_dec_grandloop:
- pshufd $19,%xmm14,%xmm9
- movdqa %xmm15,%xmm14
- paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
- pand %xmm8,%xmm9
+ movdqa %xmm0,%xmm8
movdqu 16(%rdi),%xmm3
- pxor %xmm9,%xmm15
-
- movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
- movdqu 48(%rdi),%xmm5
+ movdqu 32(%rdi),%xmm4
pxor %xmm11,%xmm3
- movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ movdqu 48(%rdi),%xmm5
pxor %xmm12,%xmm4
- movdqu 80(%rdi),%xmm7
- leaq 96(%rdi),%rdi
+.byte 102,15,56,222,217
+ movdqu 64(%rdi),%xmm6
pxor %xmm13,%xmm5
- movups (%r11),%xmm0
+.byte 102,15,56,222,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
- pxor %xmm15,%xmm7
-
-
+.byte 102,15,56,222,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
- movups 16(%r11),%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm0,%xmm3
+ pxor %xmm9,%xmm10
+.byte 102,15,56,222,241
+ pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
-.byte 102,15,56,222,209
- leaq 32(%r11),%rcx
- pxor %xmm0,%xmm4
+.byte 102,15,56,222,249
+ movups 48(%r11),%xmm1
+
+.byte 102,15,56,222,208
+ pxor %xmm9,%xmm12
movdqa %xmm11,16(%rsp)
-.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
+.byte 102,15,56,222,216
+ pxor %xmm9,%xmm13
movdqa %xmm12,32(%rsp)
-.byte 102,15,56,222,225
- pxor %xmm0,%xmm6
- movdqa %xmm13,48(%rsp)
-.byte 102,15,56,222,233
- pxor %xmm0,%xmm7
- movups (%rcx),%xmm0
- decl %eax
+.byte 102,15,56,222,224
+ pxor %xmm9,%xmm14
+.byte 102,15,56,222,232
+ pxor %xmm9,%xmm8
movdqa %xmm14,64(%rsp)
-.byte 102,15,56,222,241
- movdqa %xmm15,80(%rsp)
-.byte 102,15,56,222,249
- pxor %xmm14,%xmm14
- pcmpgtd %xmm15,%xmm14
- jmp .Lxts_dec_loop6_enter
-
-.p2align 4
+.byte 102,15,56,222,240
+ movdqa %xmm8,80(%rsp)
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ leaq 64(%r11),%rcx
+ pshufd $95,%xmm15,%xmm9
+ jmp .Lxts_dec_loop6
+.p2align 5
.Lxts_dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
- decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
-.Lxts_dec_loop6_enter:
movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+
.byte 102,15,56,222,208
.byte 102,15,56,222,216
- leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
movups (%rcx),%xmm0
+ decl %eax
jnz .Lxts_dec_loop6
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa (%r8),%xmm8
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,222,209
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,222,217
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ movups (%r11),%xmm10
|
||
|
.byte 102,15,56,222,225
|
||
|
- pxor %xmm9,%xmm15
|
||
|
.byte 102,15,56,222,233
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,222,241
|
||
|
+ movaps %xmm10,%xmm11
|
||
|
.byte 102,15,56,222,249
|
||
|
movups 16(%rcx),%xmm1
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm10
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,222,208
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ pxor %xmm15,%xmm10
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,222,216
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
.byte 102,15,56,222,224
|
||
|
- pxor %xmm9,%xmm15
|
||
|
.byte 102,15,56,222,232
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,222,240
|
||
|
+ movaps %xmm11,%xmm12
|
||
|
.byte 102,15,56,222,248
|
||
|
movups 32(%rcx),%xmm0
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm11
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,222,209
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ pxor %xmm15,%xmm11
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,222,217
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
.byte 102,15,56,222,225
|
||
|
- pxor %xmm9,%xmm15
|
||
|
+ movdqa %xmm13,48(%rsp)
|
||
|
.byte 102,15,56,222,233
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,222,241
|
||
|
+ movaps %xmm12,%xmm13
|
||
|
.byte 102,15,56,222,249
|
||
|
+ movups 48(%rcx),%xmm1
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm12
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
+.byte 102,15,56,222,208
|
||
|
+ pxor %xmm15,%xmm12
|
||
|
+ psrad $31,%xmm14
|
||
|
+.byte 102,15,56,222,216
|
||
|
paddq %xmm15,%xmm15
|
||
|
-.byte 102,15,56,223,208
|
||
|
- pand %xmm8,%xmm9
|
||
|
-.byte 102,15,56,223,216
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
-.byte 102,15,56,223,224
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-.byte 102,15,56,223,232
|
||
|
-.byte 102,15,56,223,240
|
||
|
-.byte 102,15,56,223,248
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+.byte 102,15,56,222,224
|
||
|
+.byte 102,15,56,222,232
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+.byte 102,15,56,222,240
|
||
|
+ movaps %xmm13,%xmm14
|
||
|
+.byte 102,15,56,222,248
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm13
|
||
|
+ movdqa %xmm9,%xmm0
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
+.byte 102,15,56,222,209
|
||
|
+ pxor %xmm15,%xmm13
|
||
|
+ psrad $31,%xmm0
|
||
|
+.byte 102,15,56,222,217
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm0
|
||
|
+.byte 102,15,56,222,225
|
||
|
+.byte 102,15,56,222,233
|
||
|
+ pxor %xmm0,%xmm15
|
||
|
+ movups (%r11),%xmm0
|
||
|
+.byte 102,15,56,222,241
|
||
|
+.byte 102,15,56,222,249
|
||
|
+ movups 16(%r11),%xmm1
|
||
|
+
|
||
|
+ pxor %xmm15,%xmm14
|
||
|
+ psrad $31,%xmm9
|
||
|
+.byte 102,15,56,223,84,36,0
|
||
|
paddq %xmm15,%xmm15
|
||
|
- xorps 0(%rsp),%xmm2
|
||
|
pand %xmm8,%xmm9
|
||
|
- xorps 16(%rsp),%xmm3
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+.byte 102,15,56,223,92,36,16
|
||
|
+.byte 102,15,56,223,100,36,32
|
||
|
pxor %xmm9,%xmm15
|
||
|
-
|
||
|
- xorps 32(%rsp),%xmm4
|
||
|
- movups %xmm2,0(%rsi)
|
||
|
- xorps 48(%rsp),%xmm5
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- xorps 64(%rsp),%xmm6
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- xorps 80(%rsp),%xmm7
|
||
|
- movups %xmm5,48(%rsi)
|
||
|
+.byte 102,15,56,223,108,36,48
|
||
|
+.byte 102,15,56,223,116,36,64
|
||
|
+.byte 102,15,56,223,124,36,80
|
||
|
movl %r10d,%eax
|
||
|
- movups %xmm6,64(%rsi)
|
||
|
- movups %xmm7,80(%rsi)
|
||
|
+
|
||
|
leaq 96(%rsi),%rsi
|
||
|
+ movups %xmm2,-96(%rsi)
|
||
|
+ movups %xmm3,-80(%rsi)
|
||
|
+ movups %xmm4,-64(%rsi)
|
||
|
+ movups %xmm5,-48(%rsi)
|
||
|
+ movups %xmm6,-32(%rsi)
|
||
|
+ movups %xmm7,-16(%rsi)
|
||
|
subq $96,%rdx
|
||
|
jnc .Lxts_dec_grandloop
|
||
|
|
||
|
- leal 3(%rax,%rax,1),%eax
|
||
|
+ leal 7(%rax,%rax,1),%eax
|
||
|
movq %r11,%rcx
|
||
|
movl %eax,%r10d
|
||
|
|
||
|
.Lxts_dec_short:
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
addq $96,%rdx
|
||
|
jz .Lxts_dec_done
|
||
|
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
cmpq $32,%rdx
|
||
|
jb .Lxts_dec_one
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
je .Lxts_dec_two
|
||
|
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
cmpq $64,%rdx
|
||
|
jb .Lxts_dec_three
|
||
|
je .Lxts_dec_four
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- movdqa %xmm15,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
movdqu (%rdi),%xmm2
|
||
|
- pand %xmm8,%xmm9
|
||
|
movdqu 16(%rdi),%xmm3
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-
|
||
|
movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm10,%xmm2
|
||
|
movdqu 48(%rdi),%xmm5
|
||
|
@@ -2058,7 +2324,7 @@ aesni_xts_decrypt:
|
||
|
xorps %xmm10,%xmm2
|
||
|
movdqa %xmm13,%xmm10
|
||
|
xorps %xmm11,%xmm3
|
||
|
- movdqa %xmm15,%xmm11
|
||
|
+ movdqa %xmm14,%xmm11
|
||
|
xorps %xmm12,%xmm4
|
||
|
movups %xmm2,(%rsi)
|
||
|
movups %xmm3,16(%rsi)
|
||
|
@@ -2068,14 +2334,8 @@ aesni_xts_decrypt:
|
||
|
|
||
|
.p2align 4
|
||
|
.Lxts_dec_four:
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- movdqa %xmm15,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
movups (%rdi),%xmm2
|
||
|
- pand %xmm8,%xmm9
|
||
|
movups 16(%rdi),%xmm3
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-
|
||
|
movups 32(%rdi),%xmm4
|
||
|
xorps %xmm10,%xmm2
|
||
|
movups 48(%rdi),%xmm5
|
||
|
@@ -2086,16 +2346,16 @@ aesni_xts_decrypt:
|
||
|
|
||
|
call _aesni_decrypt4
|
||
|
|
||
|
- xorps %xmm10,%xmm2
|
||
|
+ pxor %xmm10,%xmm2
|
||
|
movdqa %xmm14,%xmm10
|
||
|
- xorps %xmm11,%xmm3
|
||
|
+ pxor %xmm11,%xmm3
|
||
|
movdqa %xmm15,%xmm11
|
||
|
- xorps %xmm12,%xmm4
|
||
|
- movups %xmm2,(%rsi)
|
||
|
- xorps %xmm13,%xmm5
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- movups %xmm5,48(%rsi)
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm2,(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm3,16(%rsi)
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ movdqu %xmm5,48(%rsi)
|
||
|
leaq 64(%rsi),%rsi
|
||
|
jmp .Lxts_dec_done
|
||
|
|
||
|
@@ -2155,17 +2415,18 @@ aesni_xts_decrypt:
|
||
|
movups %xmm2,(%rsi)
|
||
|
|
||
|
.Lxts_dec_ret:
|
||
|
- movaps 96(%rsp),%xmm6
|
||
|
- movaps 112(%rsp),%xmm7
|
||
|
- movaps 128(%rsp),%xmm8
|
||
|
- movaps 144(%rsp),%xmm9
|
||
|
- movaps 160(%rsp),%xmm10
|
||
|
- movaps 176(%rsp),%xmm11
|
||
|
- movaps 192(%rsp),%xmm12
|
||
|
- movaps 208(%rsp),%xmm13
|
||
|
- movaps 224(%rsp),%xmm14
|
||
|
- movaps 240(%rsp),%xmm15
|
||
|
- leaq 264(%rsp),%rsp
|
||
|
+ movaps -160(%rbp),%xmm6
|
||
|
+ movaps -144(%rbp),%xmm7
|
||
|
+ movaps -128(%rbp),%xmm8
|
||
|
+ movaps -112(%rbp),%xmm9
|
||
|
+ movaps -96(%rbp),%xmm10
|
||
|
+ movaps -80(%rbp),%xmm11
|
||
|
+ movaps -64(%rbp),%xmm12
|
||
|
+ movaps -48(%rbp),%xmm13
|
||
|
+ movaps -32(%rbp),%xmm14
|
||
|
+ movaps -16(%rbp),%xmm15
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ popq %rbp
|
||
|
.Lxts_dec_epilogue:
|
||
|
movq 8(%rsp),%rdi
|
||
|
movq 16(%rsp),%rsi
|
||
|
@@ -2245,155 +2506,335 @@ aesni_cbc_encrypt:
|
||
|
|
||
|
.p2align 4
|
||
|
.Lcbc_decrypt:
|
||
|
- leaq -88(%rsp),%rsp
|
||
|
- movaps %xmm6,(%rsp)
|
||
|
- movaps %xmm7,16(%rsp)
|
||
|
- movaps %xmm8,32(%rsp)
|
||
|
- movaps %xmm9,48(%rsp)
|
||
|
+ leaq (%rsp),%rax
|
||
|
+ pushq %rbp
|
||
|
+ subq $176,%rsp
|
||
|
+ andq $-16,%rsp
|
||
|
+ movaps %xmm6,16(%rsp)
|
||
|
+ movaps %xmm7,32(%rsp)
|
||
|
+ movaps %xmm8,48(%rsp)
|
||
|
+ movaps %xmm9,64(%rsp)
|
||
|
+ movaps %xmm10,80(%rsp)
|
||
|
+ movaps %xmm11,96(%rsp)
|
||
|
+ movaps %xmm12,112(%rsp)
|
||
|
+ movaps %xmm13,128(%rsp)
|
||
|
+ movaps %xmm14,144(%rsp)
|
||
|
+ movaps %xmm15,160(%rsp)
|
||
|
.Lcbc_decrypt_body:
|
||
|
- movups (%r8),%xmm9
|
||
|
+ leaq -8(%rax),%rbp
|
||
|
+ movups (%r8),%xmm10
|
||
|
movl %r10d,%eax
|
||
|
- cmpq $112,%rdx
|
||
|
+ cmpq $80,%rdx
|
||
|
jbe .Lcbc_dec_tail
|
||
|
- shrl $1,%r10d
|
||
|
+
|
||
|
+ movups (%rcx),%xmm0
|
||
|
+ movdqu 0(%rdi),%xmm2
|
||
|
+ movdqu 16(%rdi),%xmm3
|
||
|
+ movdqa %xmm2,%xmm11
|
||
|
+ movdqu 32(%rdi),%xmm4
|
||
|
+ movdqa %xmm3,%xmm12
|
||
|
+ movdqu 48(%rdi),%xmm5
|
||
|
+ movdqa %xmm4,%xmm13
|
||
|
+ movdqu 64(%rdi),%xmm6
|
||
|
+ movdqa %xmm5,%xmm14
|
||
|
+ movdqu 80(%rdi),%xmm7
|
||
|
+ movdqa %xmm6,%xmm15
|
||
|
+ cmpq $112,%rdx
|
||
|
+ jbe .Lcbc_dec_six_or_seven
|
||
|
+
|
||
|
subq $112,%rdx
|
||
|
- movl %r10d,%eax
|
||
|
- movaps %xmm9,64(%rsp)
|
||
|
+ leaq 112(%rcx),%rcx
|
||
|
jmp .Lcbc_dec_loop8_enter
|
||
|
.p2align 4
|
||
|
.Lcbc_dec_loop8:
|
||
|
- movaps %xmm0,64(%rsp)
|
||
|
movups %xmm9,(%rsi)
|
||
|
leaq 16(%rsi),%rsi
|
||
|
.Lcbc_dec_loop8_enter:
|
||
|
- movups (%rcx),%xmm0
|
||
|
- movups (%rdi),%xmm2
|
||
|
- movups 16(%rdi),%xmm3
|
||
|
- movups 16(%rcx),%xmm1
|
||
|
+ movdqu 96(%rdi),%xmm8
|
||
|
+ pxor %xmm0,%xmm2
|
||
|
+ movdqu 112(%rdi),%xmm9
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ movups 16-112(%rcx),%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ xorq %r11,%r11
|
||
|
+ cmpq $112,%rdx
|
||
|
+ pxor %xmm0,%xmm5
|
||
|
+ pxor %xmm0,%xmm6
|
||
|
+ pxor %xmm0,%xmm7
|
||
|
+ pxor %xmm0,%xmm8
|
||
|
|
||
|
- leaq 32(%rcx),%rcx
|
||
|
- movdqu 32(%rdi),%xmm4
|
||
|
- xorps %xmm0,%xmm2
|
||
|
- movdqu 48(%rdi),%xmm5
|
||
|
- xorps %xmm0,%xmm3
|
||
|
- movdqu 64(%rdi),%xmm6
|
||
|
.byte 102,15,56,222,209
|
||
|
- pxor %xmm0,%xmm4
|
||
|
- movdqu 80(%rdi),%xmm7
|
||
|
+ pxor %xmm0,%xmm9
|
||
|
+ movups 32-112(%rcx),%xmm0
|
||
|
+.byte 102,15,56,222,217
|
||
|
+.byte 102,15,56,222,225
|
||
|
+.byte 102,15,56,222,233
|
||
|
+.byte 102,15,56,222,241
|
||
|
+.byte 102,15,56,222,249
|
||
|
+ setnc %r11b
|
||
|
+.byte 102,68,15,56,222,193
|
||
|
+ shlq $7,%r11
|
||
|
+.byte 102,68,15,56,222,201
|
||
|
+ addq %rdi,%r11
|
||
|
+ movups 48-112(%rcx),%xmm1
|
||
|
+.byte 102,15,56,222,208
|
||
|
+.byte 102,15,56,222,216
|
||
|
+.byte 102,15,56,222,224
|
||
|
+.byte 102,15,56,222,232
|
||
|
+.byte 102,15,56,222,240
|
||
|
+.byte 102,15,56,222,248
|
||
|
+.byte 102,68,15,56,222,192
|
||
|
+.byte 102,68,15,56,222,200
|
||
|
+ movups 64-112(%rcx),%xmm0
|
||
|
+.byte 102,15,56,222,209
|
||
|
+.byte 102,15,56,222,217
|
||
|
+.byte 102,15,56,222,225
|
||
|
+.byte 102,15,56,222,233
|
||
|
+.byte 102,15,56,222,241
|
||
|
+.byte 102,15,56,222,249
|
||
|
+.byte 102,68,15,56,222,193
|
||
|
+.byte 102,68,15,56,222,201
|
||
|
+ movups 80-112(%rcx),%xmm1
|
||
|
+.byte 102,15,56,222,208
|
||
|
+.byte 102,15,56,222,216
|
||
|
+.byte 102,15,56,222,224
|
||
|
+.byte 102,15,56,222,232
|
||
|
+.byte 102,15,56,222,240
|
||
|
+.byte 102,15,56,222,248
|
||
|
+.byte 102,68,15,56,222,192
|
||
|
+.byte 102,68,15,56,222,200
|
||
|
+ movups 96-112(%rcx),%xmm0
|
||
|
+.byte 102,15,56,222,209
|
||
|
.byte 102,15,56,222,217
|
||
|
- pxor %xmm0,%xmm5
|
||
|
- movdqu 96(%rdi),%xmm8
|
||
|
.byte 102,15,56,222,225
|
||
|
- pxor %xmm0,%xmm6
|
||
|
- movdqu 112(%rdi),%xmm9
|
||
|
.byte 102,15,56,222,233
|
||
|
- pxor %xmm0,%xmm7
|
||
|
- decl %eax
|
||
|
.byte 102,15,56,222,241
|
||
|
- pxor %xmm0,%xmm8
|
||
|
.byte 102,15,56,222,249
|
||
|
- pxor %xmm0,%xmm9
|
||
|
- movups (%rcx),%xmm0
|
||
|
.byte 102,68,15,56,222,193
|
||
|
.byte 102,68,15,56,222,201
|
||
|
- movups 16(%rcx),%xmm1
|
||
|
-
|
||
|
- call .Ldec_loop8_enter
|
||
|
+ movups 112-112(%rcx),%xmm1
|
||
|
+.byte 102,15,56,222,208
|
||
|
+.byte 102,15,56,222,216
|
||
|
+.byte 102,15,56,222,224
|
||
|
+.byte 102,15,56,222,232
|
||
|
+.byte 102,15,56,222,240
|
||
|
+.byte 102,15,56,222,248
|
||
|
+.byte 102,68,15,56,222,192
|
||
|
+.byte 102,68,15,56,222,200
|
||
|
+ movups 128-112(%rcx),%xmm0
|
||
|
+.byte 102,15,56,222,209
|
||
|
+.byte 102,15,56,222,217
|
||
|
+.byte 102,15,56,222,225
|
||
|
+.byte 102,15,56,222,233
|
||
|
+.byte 102,15,56,222,241
|
||
|
+.byte 102,15,56,222,249
|
||
|
+.byte 102,68,15,56,222,193
|
||
|
+.byte 102,68,15,56,222,201
|
||
|
+ movups 144-112(%rcx),%xmm1
|
||
|
+.byte 102,15,56,222,208
|
||
|
+.byte 102,15,56,222,216
|
||
|
+.byte 102,15,56,222,224
|
||
|
+.byte 102,15,56,222,232
|
||
|
+.byte 102,15,56,222,240
|
||
|
+.byte 102,15,56,222,248
|
||
|
+.byte 102,68,15,56,222,192
|
||
|
+.byte 102,68,15,56,222,200
|
||
|
+ movups 160-112(%rcx),%xmm0
|
||
|
+ cmpl $11,%eax
|
||
|
+ jb .Lcbc_dec_done
|
||
|
+.byte 102,15,56,222,209
|
||
|
+.byte 102,15,56,222,217
|
||
|
+.byte 102,15,56,222,225
|
||
|
+.byte 102,15,56,222,233
|
||
|
+.byte 102,15,56,222,241
|
||
|
+.byte 102,15,56,222,249
|
||
|
+.byte 102,68,15,56,222,193
|
||
|
+.byte 102,68,15,56,222,201
|
||
|
+ movups 176-112(%rcx),%xmm1
|
||
|
+.byte 102,15,56,222,208
|
||
|
+.byte 102,15,56,222,216
|
||
|
+.byte 102,15,56,222,224
|
||
|
+.byte 102,15,56,222,232
|
||
|
+.byte 102,15,56,222,240
|
||
|
+.byte 102,15,56,222,248
|
||
|
+.byte 102,68,15,56,222,192
|
||
|
+.byte 102,68,15,56,222,200
|
||
|
+ movups 192-112(%rcx),%xmm0
|
||
|
+ je .Lcbc_dec_done
|
||
|
+.byte 102,15,56,222,209
|
||
|
+.byte 102,15,56,222,217
|
||
|
+.byte 102,15,56,222,225
|
||
|
+.byte 102,15,56,222,233
|
||
|
+.byte 102,15,56,222,241
|
||
|
+.byte 102,15,56,222,249
|
||
|
+.byte 102,68,15,56,222,193
|
||
|
+.byte 102,68,15,56,222,201
|
||
|
+ movups 208-112(%rcx),%xmm1
|
||
|
+.byte 102,15,56,222,208
|
||
|
+.byte 102,15,56,222,216
|
||
|
+.byte 102,15,56,222,224
|
||
|
+.byte 102,15,56,222,232
|
||
|
+.byte 102,15,56,222,240
|
||
|
+.byte 102,15,56,222,248
|
||
|
+.byte 102,68,15,56,222,192
|
||
|
+.byte 102,68,15,56,222,200
|
||
|
+ movups 224-112(%rcx),%xmm0
|
||
|
+.Lcbc_dec_done:
|
||
|
+.byte 102,15,56,222,209
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+.byte 102,15,56,222,217
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
+.byte 102,15,56,222,225
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
+.byte 102,15,56,222,233
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
+.byte 102,15,56,222,241
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
+.byte 102,15,56,222,249
|
||
|
+ pxor %xmm0,%xmm15
|
||
|
+.byte 102,68,15,56,222,193
|
||
|
+.byte 102,68,15,56,222,201
|
||
|
+ movdqu 80(%rdi),%xmm1
|
||
|
+
|
||
|
+.byte 102,65,15,56,223,210
|
||
|
+ movdqu 96(%rdi),%xmm10
|
||
|
+ pxor %xmm0,%xmm1
|
||
|
+.byte 102,65,15,56,223,219
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ movdqu 112(%rdi),%xmm0
|
||
|
+ leaq 128(%rdi),%rdi
|
||
|
+.byte 102,65,15,56,223,228
|
||
|
+ movdqu 0(%r11),%xmm11
|
||
|
+.byte 102,65,15,56,223,237
|
||
|
+ movdqu 16(%r11),%xmm12
|
||
|
+.byte 102,65,15,56,223,246
|
||
|
+ movdqu 32(%r11),%xmm13
|
||
|
+.byte 102,65,15,56,223,255
|
||
|
+ movdqu 48(%r11),%xmm14
|
||
|
+.byte 102,68,15,56,223,193
|
||
|
+ movdqu 64(%r11),%xmm15
|
||
|
+.byte 102,69,15,56,223,202
|
||
|
+ movdqa %xmm0,%xmm10
|
||
|
+ movdqu 80(%r11),%xmm1
|
||
|
+ movups -112(%rcx),%xmm0
|
||
|
|
||
|
- movups (%rdi),%xmm1
|
||
|
- movups 16(%rdi),%xmm0
|
||
|
- xorps 64(%rsp),%xmm2
|
||
|
- xorps %xmm1,%xmm3
|
||
|
- movups 32(%rdi),%xmm1
|
||
|
- xorps %xmm0,%xmm4
|
||
|
- movups 48(%rdi),%xmm0
|
||
|
- xorps %xmm1,%xmm5
|
||
|
- movups 64(%rdi),%xmm1
|
||
|
- xorps %xmm0,%xmm6
|
||
|
- movups 80(%rdi),%xmm0
|
||
|
- xorps %xmm1,%xmm7
|
||
|
- movups 96(%rdi),%xmm1
|
||
|
- xorps %xmm0,%xmm8
|
||
|
- movups 112(%rdi),%xmm0
|
||
|
- xorps %xmm1,%xmm9
|
||
|
movups %xmm2,(%rsi)
|
||
|
+ movdqa %xmm11,%xmm2
|
||
|
movups %xmm3,16(%rsi)
|
||
|
+ movdqa %xmm12,%xmm3
|
||
|
movups %xmm4,32(%rsi)
|
||
|
+ movdqa %xmm13,%xmm4
|
||
|
movups %xmm5,48(%rsi)
|
||
|
- movl %r10d,%eax
|
||
|
+ movdqa %xmm14,%xmm5
|
||
|
movups %xmm6,64(%rsi)
|
||
|
- movq %r11,%rcx
|
||
|
+ movdqa %xmm15,%xmm6
|
||
|
movups %xmm7,80(%rsi)
|
||
|
- leaq 128(%rdi),%rdi
|
||
|
+ movdqa %xmm1,%xmm7
|
||
|
movups %xmm8,96(%rsi)
|
||
|
leaq 112(%rsi),%rsi
|
||
|
+
|
||
|
subq $128,%rdx
|
||
|
ja .Lcbc_dec_loop8
|
||
|
|
||
|
movaps %xmm9,%xmm2
|
||
|
- movaps %xmm0,%xmm9
|
||
|
+ leaq -112(%rcx),%rcx
|
||
|
addq $112,%rdx
|
||
|
jle .Lcbc_dec_tail_collected
|
||
|
- movups %xmm2,(%rsi)
|
||
|
- leal 1(%r10,%r10,1),%eax
|
||
|
+ movups %xmm9,(%rsi)
|
||
|
leaq 16(%rsi),%rsi
|
||
|
+ cmpq $80,%rdx
|
||
|
+ jbe .Lcbc_dec_tail
|
||
|
+
|
||
|
+ movaps %xmm11,%xmm2
|
||
|
+.Lcbc_dec_six_or_seven:
|
||
|
+ cmpq $96,%rdx
|
||
|
+ ja .Lcbc_dec_seven
|
||
|
+
|
||
|
+ movaps %xmm7,%xmm8
|
||
|
+ call _aesni_decrypt6
|
||
|
+ pxor %xmm10,%xmm2
|
||
|
+ movaps %xmm8,%xmm10
|
||
|
+ pxor %xmm11,%xmm3
|
||
|
+ movdqu %xmm2,(%rsi)
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm3,16(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ pxor %xmm14,%xmm6
|
||
|
+ movdqu %xmm5,48(%rsi)
|
||
|
+ pxor %xmm15,%xmm7
|
||
|
+ movdqu %xmm6,64(%rsi)
|
||
|
+ leaq 80(%rsi),%rsi
|
||
|
+ movdqa %xmm7,%xmm2
|
||
|
+ jmp .Lcbc_dec_tail_collected
|
||
|
+
|
||
|
+.p2align 4
|
||
|
+.Lcbc_dec_seven:
|
||
|
+ movups 96(%rdi),%xmm8
|
||
|
+ xorps %xmm9,%xmm9
|
||
|
+ call _aesni_decrypt8
|
||
|
+ movups 80(%rdi),%xmm9
|
||
|
+ pxor %xmm10,%xmm2
|
||
|
+ movups 96(%rdi),%xmm10
|
||
|
+ pxor %xmm11,%xmm3
|
||
|
+ movdqu %xmm2,(%rsi)
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm3,16(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ pxor %xmm14,%xmm6
|
||
|
+ movdqu %xmm5,48(%rsi)
|
||
|
+ pxor %xmm15,%xmm7
|
||
|
+ movdqu %xmm6,64(%rsi)
|
||
|
+ pxor %xmm9,%xmm8
|
||
|
+ movdqu %xmm7,80(%rsi)
|
||
|
+ leaq 96(%rsi),%rsi
|
||
|
+ movdqa %xmm8,%xmm2
|
||
|
+ jmp .Lcbc_dec_tail_collected
|
||
|
+
|
||
|
.Lcbc_dec_tail:
|
||
|
movups (%rdi),%xmm2
|
||
|
- movaps %xmm2,%xmm8
|
||
|
- cmpq $16,%rdx
|
||
|
+ subq $16,%rdx
|
||
|
jbe .Lcbc_dec_one
|
||
|
|
||
|
movups 16(%rdi),%xmm3
|
||
|
- movaps %xmm3,%xmm7
|
||
|
- cmpq $32,%rdx
|
||
|
+ movaps %xmm2,%xmm11
|
||
|
+ subq $16,%rdx
|
||
|
jbe .Lcbc_dec_two
|
||
|
|
||
|
movups 32(%rdi),%xmm4
|
||
|
- movaps %xmm4,%xmm6
|
||
|
- cmpq $48,%rdx
|
||
|
+ movaps %xmm3,%xmm12
|
||
|
+ subq $16,%rdx
|
||
|
jbe .Lcbc_dec_three
|
||
|
|
||
|
movups 48(%rdi),%xmm5
|
||
|
- cmpq $64,%rdx
|
||
|
+ movaps %xmm4,%xmm13
|
||
|
+ subq $16,%rdx
|
||
|
jbe .Lcbc_dec_four
|
||
|
|
||
|
movups 64(%rdi),%xmm6
|
||
|
- cmpq $80,%rdx
|
||
|
- jbe .Lcbc_dec_five
|
||
|
-
|
||
|
- movups 80(%rdi),%xmm7
|
||
|
- cmpq $96,%rdx
|
||
|
- jbe .Lcbc_dec_six
|
||
|
-
|
||
|
- movups 96(%rdi),%xmm8
|
||
|
- movaps %xmm9,64(%rsp)
|
||
|
- call _aesni_decrypt8
|
||
|
- movups (%rdi),%xmm1
|
||
|
- movups 16(%rdi),%xmm0
|
||
|
- xorps 64(%rsp),%xmm2
|
||
|
- xorps %xmm1,%xmm3
|
||
|
- movups 32(%rdi),%xmm1
|
||
|
- xorps %xmm0,%xmm4
|
||
|
- movups 48(%rdi),%xmm0
|
||
|
- xorps %xmm1,%xmm5
|
||
|
- movups 64(%rdi),%xmm1
|
||
|
- xorps %xmm0,%xmm6
|
||
|
- movups 80(%rdi),%xmm0
|
||
|
- xorps %xmm1,%xmm7
|
||
|
- movups 96(%rdi),%xmm9
|
||
|
- xorps %xmm0,%xmm8
|
||
|
- movups %xmm2,(%rsi)
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- movups %xmm5,48(%rsi)
|
||
|
- movups %xmm6,64(%rsi)
|
||
|
- movups %xmm7,80(%rsi)
|
||
|
- leaq 96(%rsi),%rsi
|
||
|
- movaps %xmm8,%xmm2
|
||
|
- subq $112,%rdx
|
||
|
+ movaps %xmm5,%xmm14
|
||
|
+ movaps %xmm6,%xmm15
|
||
|
+ xorps %xmm7,%xmm7
|
||
|
+ call _aesni_decrypt6
|
||
|
+ pxor %xmm10,%xmm2
|
||
|
+ movaps %xmm15,%xmm10
|
||
|
+ pxor %xmm11,%xmm3
|
||
|
+ movdqu %xmm2,(%rsi)
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm3,16(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ pxor %xmm14,%xmm6
|
||
|
+ movdqu %xmm5,48(%rsi)
|
||
|
+ leaq 64(%rsi),%rsi
|
||
|
+ movdqa %xmm6,%xmm2
|
||
|
+ subq $16,%rdx
|
||
|
jmp .Lcbc_dec_tail_collected
|
||
|
+
|
||
|
.p2align 4
|
||
|
.Lcbc_dec_one:
|
||
|
+ movaps %xmm2,%xmm11
|
||
|
movups (%rcx),%xmm0
|
||
|
movups 16(%rcx),%xmm1
|
||
|
leaq 32(%rcx),%rcx
|
||
|
@@ -2405,116 +2846,79 @@ aesni_cbc_encrypt:
|
||
|
leaq 16(%rcx),%rcx
|
||
|
jnz .Loop_dec1_16
|
||
|
.byte 102,15,56,223,209
|
||
|
- xorps %xmm9,%xmm2
|
||
|
- movaps %xmm8,%xmm9
|
||
|
- subq $16,%rdx
|
||
|
+ xorps %xmm10,%xmm2
|
||
|
+ movaps %xmm11,%xmm10
|
||
|
jmp .Lcbc_dec_tail_collected
|
||
|
.p2align 4
|
||
|
.Lcbc_dec_two:
|
||
|
+ movaps %xmm3,%xmm12
|
||
|
xorps %xmm4,%xmm4
|
||
|
call _aesni_decrypt3
|
||
|
- xorps %xmm9,%xmm2
|
||
|
- xorps %xmm8,%xmm3
|
||
|
- movups %xmm2,(%rsi)
|
||
|
- movaps %xmm7,%xmm9
|
||
|
- movaps %xmm3,%xmm2
|
||
|
+ pxor %xmm10,%xmm2
|
||
|
+ movaps %xmm12,%xmm10
|
||
|
+ pxor %xmm11,%xmm3
|
||
|
+ movdqu %xmm2,(%rsi)
|
||
|
+ movdqa %xmm3,%xmm2
|
||
|
leaq 16(%rsi),%rsi
|
||
|
- subq $32,%rdx
|
||
|
jmp .Lcbc_dec_tail_collected
|
||
|
.p2align 4
|
||
|
.Lcbc_dec_three:
|
||
|
+ movaps %xmm4,%xmm13
|
||
|
call _aesni_decrypt3
|
||
|
- xorps %xmm9,%xmm2
|
||
|
- xorps %xmm8,%xmm3
|
||
|
- movups %xmm2,(%rsi)
|
||
|
- xorps %xmm7,%xmm4
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- movaps %xmm6,%xmm9
|
||
|
- movaps %xmm4,%xmm2
|
||
|
+ pxor %xmm10,%xmm2
|
||
|
+ movaps %xmm13,%xmm10
|
||
|
+ pxor %xmm11,%xmm3
|
||
|
+ movdqu %xmm2,(%rsi)
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm3,16(%rsi)
|
||
|
+ movdqa %xmm4,%xmm2
|
||
|
leaq 32(%rsi),%rsi
|
||
|
- subq $48,%rdx
|
||
|
jmp .Lcbc_dec_tail_collected
|
||
|
.p2align 4
|
||
|
.Lcbc_dec_four:
|
||
|
+ movaps %xmm5,%xmm14
|
||
|
call _aesni_decrypt4
|
||
|
- xorps %xmm9,%xmm2
|
||
|
- movups 48(%rdi),%xmm9
|
||
|
- xorps %xmm8,%xmm3
|
||
|
- movups %xmm2,(%rsi)
|
||
|
- xorps %xmm7,%xmm4
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- xorps %xmm6,%xmm5
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- movaps %xmm5,%xmm2
|
||
|
+ pxor %xmm10,%xmm2
|
||
|
+ movaps %xmm14,%xmm10
|
||
|
+ pxor %xmm11,%xmm3
|
||
|
+ movdqu %xmm2,(%rsi)
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm3,16(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ movdqa %xmm5,%xmm2
|
||
|
leaq 48(%rsi),%rsi
|
||
|
- subq $64,%rdx
|
||
|
- jmp .Lcbc_dec_tail_collected
|
||
|
-.p2align 4
|
||
|
-.Lcbc_dec_five:
|
||
|
- xorps %xmm7,%xmm7
|
||
|
- call _aesni_decrypt6
|
||
|
- movups 16(%rdi),%xmm1
|
||
|
- movups 32(%rdi),%xmm0
|
||
|
- xorps %xmm9,%xmm2
|
||
|
- xorps %xmm8,%xmm3
|
||
|
- xorps %xmm1,%xmm4
|
||
|
- movups 48(%rdi),%xmm1
|
||
|
- xorps %xmm0,%xmm5
|
||
|
- movups 64(%rdi),%xmm9
|
||
|
- xorps %xmm1,%xmm6
|
||
|
- movups %xmm2,(%rsi)
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- movups %xmm5,48(%rsi)
|
||
|
- leaq 64(%rsi),%rsi
|
||
|
- movaps %xmm6,%xmm2
|
||
|
- subq $80,%rdx
|
||
|
- jmp .Lcbc_dec_tail_collected
|
||
|
-.p2align 4
|
||
|
-.Lcbc_dec_six:
|
||
|
- call _aesni_decrypt6
|
||
|
- movups 16(%rdi),%xmm1
|
||
|
- movups 32(%rdi),%xmm0
|
||
|
- xorps %xmm9,%xmm2
|
||
|
- xorps %xmm8,%xmm3
|
||
|
- xorps %xmm1,%xmm4
|
||
|
- movups 48(%rdi),%xmm1
|
||
|
- xorps %xmm0,%xmm5
|
||
|
- movups 64(%rdi),%xmm0
|
||
|
- xorps %xmm1,%xmm6
|
||
|
- movups 80(%rdi),%xmm9
|
||
|
- xorps %xmm0,%xmm7
|
||
|
- movups %xmm2,(%rsi)
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- movups %xmm5,48(%rsi)
|
||
|
- movups %xmm6,64(%rsi)
|
||
|
- leaq 80(%rsi),%rsi
|
||
|
- movaps %xmm7,%xmm2
|
||
|
- subq $96,%rdx
|
||
|
jmp .Lcbc_dec_tail_collected
|
||
|
+
|
||
|
.p2align 4
|
||
|
.Lcbc_dec_tail_collected:
|
||
|
+ movups %xmm10,(%r8)
|
||
|
andq $15,%rdx
|
||
|
- movups %xmm9,(%r8)
|
||
|
jnz .Lcbc_dec_tail_partial
|
||
|
movups %xmm2,(%rsi)
|
||
|
jmp .Lcbc_dec_ret
|
||
|
.p2align 4
|
||
|
.Lcbc_dec_tail_partial:
|
||
|
- movaps %xmm2,64(%rsp)
|
||
|
+ movaps %xmm2,(%rsp)
|
||
|
movq $16,%rcx
|
||
|
movq %rsi,%rdi
|
||
|
subq %rdx,%rcx
|
||
|
- leaq 64(%rsp),%rsi
|
||
|
+ leaq (%rsp),%rsi
|
||
|
.long 0x9066A4F3
|
||
|
|
||
|
.Lcbc_dec_ret:
|
||
|
- movaps (%rsp),%xmm6
|
||
|
- movaps 16(%rsp),%xmm7
|
||
|
- movaps 32(%rsp),%xmm8
|
||
|
- movaps 48(%rsp),%xmm9
|
||
|
- leaq 88(%rsp),%rsp
|
||
|
+ movaps 16(%rsp),%xmm6
|
||
|
+ movaps 32(%rsp),%xmm7
|
||
|
+ movaps 48(%rsp),%xmm8
|
||
|
+ movaps 64(%rsp),%xmm9
|
||
|
+ movaps 80(%rsp),%xmm10
|
||
|
+ movaps 96(%rsp),%xmm11
|
||
|
+ movaps 112(%rsp),%xmm12
|
||
|
+ movaps 128(%rsp),%xmm13
|
||
|
+ movaps 144(%rsp),%xmm14
|
||
|
+ movaps 160(%rsp),%xmm15
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ popq %rbp
|
||
|
.Lcbc_ret:
|
||
|
movq 8(%rsp),%rdi
|
||
|
movq 16(%rsp),%rsi
|
||
|
@@ -2759,6 +3163,8 @@ __aesni_set_encrypt_key:
|
||
|
.long 1,0,0,0
|
||
|
.Lxts_magic:
|
||
|
.long 0x87,0,1,0
|
||
|
+.Lincrement1:
|
||
|
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
|
||
|
|
||
|
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
|
||
|
.p2align 6
|
||
|
@@ -2823,45 +3229,9 @@ ccm64_se_handler:
|
||
|
jmp .Lcommon_seh_tail
|
||
|
|
||
|
|
||
|
-.def ctr32_se_handler; .scl 3; .type 32; .endef
|
||
|
-.p2align 4
|
||
|
-ctr32_se_handler:
|
||
|
- pushq %rsi
|
||
|
- pushq %rdi
|
||
|
- pushq %rbx
|
||
|
- pushq %rbp
|
||
|
- pushq %r12
|
||
|
- pushq %r13
|
||
|
- pushq %r14
|
||
|
- pushq %r15
|
||
|
- pushfq
|
||
|
- subq $64,%rsp
|
||
|
-
|
||
|
- movq 120(%r8),%rax
|
||
|
- movq 248(%r8),%rbx
|
||
|
-
|
||
|
- leaq .Lctr32_body(%rip),%r10
|
||
|
- cmpq %r10,%rbx
|
||
|
- jb .Lcommon_seh_tail
|
||
|
-
|
||
|
- movq 152(%r8),%rax
|
||
|
-
|
||
|
- leaq .Lctr32_ret(%rip),%r10
|
||
|
- cmpq %r10,%rbx
|
||
|
- jae .Lcommon_seh_tail
|
||
|
-
|
||
|
- leaq 32(%rax),%rsi
|
||
|
- leaq 512(%r8),%rdi
|
||
|
- movl $20,%ecx
|
||
|
-.long 0xa548f3fc
|
||
|
- leaq 200(%rax),%rax
|
||
|
-
|
||
|
- jmp .Lcommon_seh_tail
|
||
|
-
|
||
|
-
|
||
|
-.def xts_se_handler; .scl 3; .type 32; .endef
|
||
|
+.def ctr_xts_se_handler; .scl 3; .type 32; .endef
|
||
|
.p2align 4
|
||
|
-xts_se_handler:
|
||
|
+ctr_xts_se_handler:
|
||
|
pushq %rsi
|
||
|
pushq %rdi
|
||
|
pushq %rbx
|
||
|
@@ -2891,13 +3261,13 @@ xts_se_handler:
|
||
|
cmpq %r10,%rbx
|
||
|
jae .Lcommon_seh_tail
|
||
|
|
||
|
- leaq 96(%rax),%rsi
|
||
|
+ movq 160(%r8),%rax
|
||
|
+ leaq -160(%rax),%rsi
|
||
|
leaq 512(%r8),%rdi
|
||
|
movl $20,%ecx
|
||
|
.long 0xa548f3fc
|
||
|
- leaq 104+160(%rax),%rax
|
||
|
|
||
|
- jmp .Lcommon_seh_tail
|
||
|
+ jmp .Lcommon_rbp_tail
|
||
|
|
||
|
.def cbc_se_handler; .scl 3; .type 32; .endef
|
||
|
.p2align 4
|
||
|
@@ -2928,11 +3298,16 @@ cbc_se_handler:
|
||
|
cmpq %r10,%rbx
|
||
|
jae .Lcommon_seh_tail
|
||
|
|
||
|
- leaq 0(%rax),%rsi
|
||
|
+ leaq 16(%rax),%rsi
|
||
|
leaq 512(%r8),%rdi
|
||
|
- movl $8,%ecx
|
||
|
+ movl $20,%ecx
|
||
|
.long 0xa548f3fc
|
||
|
- leaq 88(%rax),%rax
|
||
|
+
|
||
|
+.Lcommon_rbp_tail:
|
||
|
+ movq 160(%r8),%rax
|
||
|
+ movq (%rax),%rbp
|
||
|
+ leaq 8(%rax),%rax
|
||
|
+ movq %rbp,160(%r8)
|
||
|
jmp .Lcommon_seh_tail
|
||
|
|
||
|
.Lrestore_cbc_rax:
|
||
|
@@ -3029,14 +3404,15 @@ cbc_se_handler:
|
||
|
.rva .Lccm64_dec_body,.Lccm64_dec_ret
|
||
|
.LSEH_info_ctr32:
|
||
|
.byte 9,0,0,0
|
||
|
-.rva ctr32_se_handler
|
||
|
+.rva ctr_xts_se_handler
|
||
|
+.rva .Lctr32_body,.Lctr32_epilogue
|
||
|
.LSEH_info_xts_enc:
|
||
|
.byte 9,0,0,0
|
||
|
-.rva xts_se_handler
|
||
|
+.rva ctr_xts_se_handler
|
||
|
.rva .Lxts_enc_body,.Lxts_enc_epilogue
|
||
|
.LSEH_info_xts_dec:
|
||
|
.byte 9,0,0,0
|
||
|
-.rva xts_se_handler
|
||
|
+.rva ctr_xts_se_handler
|
||
|
.rva .Lxts_dec_body,.Lxts_dec_epilogue
|
||
|
.LSEH_info_cbc:
|
||
|
.byte 9,0,0,0
|
||
|
diff --git a/lib/accelerated/x86/coff/padlock-x86-64-coff.s b/lib/accelerated/x86/coff/padlock-x86-64-coff.s
|
||
|
index 9f658ee..a3a0e30 100644
|
||
|
--- a/lib/accelerated/x86/coff/padlock-x86-64-coff.s
|
||
|
+++ b/lib/accelerated/x86/coff/padlock-x86-64-coff.s
|
||
|
@@ -686,6 +686,501 @@ padlock_cbc_encrypt:
|
||
|
movq 16(%rsp),%rsi
|
||
|
.byte 0xf3,0xc3
|
||
|
.LSEH_end_padlock_cbc_encrypt:
|
||
|
+.globl padlock_cfb_encrypt
|
||
|
+.def padlock_cfb_encrypt; .scl 2; .type 32; .endef
|
||
|
+.p2align 4
|
||
|
+padlock_cfb_encrypt:
|
||
|
+ movq %rdi,8(%rsp)
|
||
|
+ movq %rsi,16(%rsp)
|
||
|
+ movq %rsp,%rax
|
||
|
+.LSEH_begin_padlock_cfb_encrypt:
|
||
|
+ movq %rcx,%rdi
|
||
|
+ movq %rdx,%rsi
|
||
|
+ movq %r8,%rdx
|
||
|
+ movq %r9,%rcx
|
||
|
+
|
||
|
+ pushq %rbp
|
||
|
+ pushq %rbx
|
||
|
+
|
||
|
+ xorl %eax,%eax
|
||
|
+ testq $15,%rdx
|
||
|
+ jnz .Lcfb_abort
|
||
|
+ testq $15,%rcx
|
||
|
+ jnz .Lcfb_abort
|
||
|
+ leaq .Lpadlock_saved_context(%rip),%rax
|
||
|
+ pushf
|
||
|
+ cld
|
||
|
+ call _padlock_verify_ctx
|
||
|
+ leaq 16(%rdx),%rdx
|
||
|
+ xorl %eax,%eax
|
||
|
+ xorl %ebx,%ebx
|
||
|
+ testl $32,(%rdx)
|
||
|
+ jnz .Lcfb_aligned
|
||
|
+ testq $15,%rdi
|
||
|
+ setz %al
|
||
|
+ testq $15,%rsi
|
||
|
+ setz %bl
|
||
|
+ testl %ebx,%eax
|
||
|
+ jnz .Lcfb_aligned
|
||
|
+ negq %rax
|
||
|
+ movq $512,%rbx
|
||
|
+ notq %rax
|
||
|
+ leaq (%rsp),%rbp
|
||
|
+ cmpq %rbx,%rcx
|
||
|
+ cmovcq %rcx,%rbx
|
||
|
+ andq %rbx,%rax
|
||
|
+ movq %rcx,%rbx
|
||
|
+ negq %rax
|
||
|
+ andq $512-1,%rbx
|
||
|
+ leaq (%rax,%rbp,1),%rsp
|
||
|
+ movq $512,%rax
|
||
|
+ cmovzq %rax,%rbx
|
||
|
+ jmp .Lcfb_loop
|
||
|
+.p2align 4
|
||
|
+.Lcfb_loop:
|
||
|
+ cmpq %rcx,%rbx
|
||
|
+ cmovaq %rcx,%rbx
|
||
|
+ movq %rdi,%r8
|
||
|
+ movq %rsi,%r9
|
||
|
+ movq %rcx,%r10
|
||
|
+ movq %rbx,%rcx
|
||
|
+ movq %rbx,%r11
|
||
|
+ testq $15,%rdi
|
||
|
+ cmovnzq %rsp,%rdi
|
||
|
+ testq $15,%rsi
|
||
|
+ jz .Lcfb_inp_aligned
|
||
|
+ shrq $3,%rcx
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ subq %rbx,%rdi
|
||
|
+ movq %rbx,%rcx
|
||
|
+ movq %rdi,%rsi
|
||
|
+.Lcfb_inp_aligned:
|
||
|
+ leaq -16(%rdx),%rax
|
||
|
+ leaq 16(%rdx),%rbx
|
||
|
+ shrq $4,%rcx
|
||
|
+.byte 0xf3,0x0f,0xa7,224
|
||
|
+ movdqa (%rax),%xmm0
|
||
|
+ movdqa %xmm0,-16(%rdx)
|
||
|
+ movq %r8,%rdi
|
||
|
+ movq %r11,%rbx
|
||
|
+ testq $15,%rdi
|
||
|
+ jz .Lcfb_out_aligned
|
||
|
+ movq %rbx,%rcx
|
||
|
+ leaq (%rsp),%rsi
|
||
|
+ shrq $3,%rcx
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ subq %rbx,%rdi
|
||
|
+.Lcfb_out_aligned:
|
||
|
+ movq %r9,%rsi
|
||
|
+ movq %r10,%rcx
|
||
|
+ addq %rbx,%rdi
|
||
|
+ addq %rbx,%rsi
|
||
|
+ subq %rbx,%rcx
|
||
|
+ movq $512,%rbx
|
||
|
+ jnz .Lcfb_loop
|
||
|
+ cmpq %rbp,%rsp
|
||
|
+ je .Lcfb_done
|
||
|
+
|
||
|
+ pxor %xmm0,%xmm0
|
||
|
+ leaq (%rsp),%rax
|
||
|
+.Lcfb_bzero:
|
||
|
+ movaps %xmm0,(%rax)
|
||
|
+ leaq 16(%rax),%rax
|
||
|
+ cmpq %rax,%rbp
|
||
|
+ ja .Lcfb_bzero
|
||
|
+
|
||
|
+.Lcfb_done:
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ jmp .Lcfb_exit
|
||
|
+
|
||
|
+.p2align 4
|
||
|
+.Lcfb_aligned:
|
||
|
+ leaq -16(%rdx),%rax
|
||
|
+ leaq 16(%rdx),%rbx
|
||
|
+ shrq $4,%rcx
|
||
|
+.byte 0xf3,0x0f,0xa7,224
|
||
|
+ movdqa (%rax),%xmm0
|
||
|
+ movdqa %xmm0,-16(%rdx)
|
||
|
+.Lcfb_exit:
|
||
|
+ movl $1,%eax
|
||
|
+ leaq 8(%rsp),%rsp
|
||
|
+.Lcfb_abort:
|
||
|
+ popq %rbx
|
||
|
+ popq %rbp
|
||
|
+ movq 8(%rsp),%rdi
|
||
|
+ movq 16(%rsp),%rsi
|
||
|
+ .byte 0xf3,0xc3
|
||
|
+.LSEH_end_padlock_cfb_encrypt:
|
||
|
+.globl padlock_ofb_encrypt
|
||
|
+.def padlock_ofb_encrypt; .scl 2; .type 32; .endef
|
||
|
+.p2align 4
|
||
|
+padlock_ofb_encrypt:
|
||
|
+ movq %rdi,8(%rsp)
|
||
|
+ movq %rsi,16(%rsp)
|
||
|
+ movq %rsp,%rax
|
||
|
+.LSEH_begin_padlock_ofb_encrypt:
|
||
|
+ movq %rcx,%rdi
|
||
|
+ movq %rdx,%rsi
|
||
|
+ movq %r8,%rdx
|
||
|
+ movq %r9,%rcx
|
||
|
+
|
||
|
+ pushq %rbp
|
||
|
+ pushq %rbx
|
||
|
+
|
||
|
+ xorl %eax,%eax
|
||
|
+ testq $15,%rdx
|
||
|
+ jnz .Lofb_abort
|
||
|
+ testq $15,%rcx
|
||
|
+ jnz .Lofb_abort
|
||
|
+ leaq .Lpadlock_saved_context(%rip),%rax
|
||
|
+ pushf
|
||
|
+ cld
|
||
|
+ call _padlock_verify_ctx
|
||
|
+ leaq 16(%rdx),%rdx
|
||
|
+ xorl %eax,%eax
|
||
|
+ xorl %ebx,%ebx
|
||
|
+ testl $32,(%rdx)
|
||
|
+ jnz .Lofb_aligned
|
||
|
+ testq $15,%rdi
|
||
|
+ setz %al
|
||
|
+ testq $15,%rsi
|
||
|
+ setz %bl
|
||
|
+ testl %ebx,%eax
|
||
|
+ jnz .Lofb_aligned
|
||
|
+ negq %rax
|
||
|
+ movq $512,%rbx
|
||
|
+ notq %rax
|
||
|
+ leaq (%rsp),%rbp
|
||
|
+ cmpq %rbx,%rcx
|
||
|
+ cmovcq %rcx,%rbx
|
||
|
+ andq %rbx,%rax
|
||
|
+ movq %rcx,%rbx
|
||
|
+ negq %rax
|
||
|
+ andq $512-1,%rbx
|
||
|
+ leaq (%rax,%rbp,1),%rsp
|
||
|
+ movq $512,%rax
|
||
|
+ cmovzq %rax,%rbx
|
||
|
+ jmp .Lofb_loop
|
||
|
+.p2align 4
|
||
|
+.Lofb_loop:
|
||
|
+ cmpq %rcx,%rbx
|
||
|
+ cmovaq %rcx,%rbx
|
||
|
+ movq %rdi,%r8
|
||
|
+ movq %rsi,%r9
|
||
|
+ movq %rcx,%r10
|
||
|
+ movq %rbx,%rcx
|
||
|
+ movq %rbx,%r11
|
||
|
+ testq $15,%rdi
|
||
|
+ cmovnzq %rsp,%rdi
|
||
|
+ testq $15,%rsi
|
||
|
+ jz .Lofb_inp_aligned
|
||
|
+ shrq $3,%rcx
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ subq %rbx,%rdi
|
||
|
+ movq %rbx,%rcx
|
||
|
+ movq %rdi,%rsi
|
||
|
+.Lofb_inp_aligned:
|
||
|
+ leaq -16(%rdx),%rax
|
||
|
+ leaq 16(%rdx),%rbx
|
||
|
+ shrq $4,%rcx
|
||
|
+.byte 0xf3,0x0f,0xa7,232
|
||
|
+ movdqa (%rax),%xmm0
|
||
|
+ movdqa %xmm0,-16(%rdx)
|
||
|
+ movq %r8,%rdi
|
||
|
+ movq %r11,%rbx
|
||
|
+ testq $15,%rdi
|
||
|
+ jz .Lofb_out_aligned
|
||
|
+ movq %rbx,%rcx
|
||
|
+ leaq (%rsp),%rsi
|
||
|
+ shrq $3,%rcx
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ subq %rbx,%rdi
|
||
|
+.Lofb_out_aligned:
|
||
|
+ movq %r9,%rsi
|
||
|
+ movq %r10,%rcx
|
||
|
+ addq %rbx,%rdi
|
||
|
+ addq %rbx,%rsi
|
||
|
+ subq %rbx,%rcx
|
||
|
+ movq $512,%rbx
|
||
|
+ jnz .Lofb_loop
|
||
|
+ cmpq %rbp,%rsp
|
||
|
+ je .Lofb_done
|
||
|
+
|
||
|
+ pxor %xmm0,%xmm0
|
||
|
+ leaq (%rsp),%rax
|
||
|
+.Lofb_bzero:
|
||
|
+ movaps %xmm0,(%rax)
|
||
|
+ leaq 16(%rax),%rax
|
||
|
+ cmpq %rax,%rbp
|
||
|
+ ja .Lofb_bzero
|
||
|
+
|
||
|
+.Lofb_done:
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ jmp .Lofb_exit
|
||
|
+
|
||
|
+.p2align 4
|
||
|
+.Lofb_aligned:
|
||
|
+ leaq -16(%rdx),%rax
|
||
|
+ leaq 16(%rdx),%rbx
|
||
|
+ shrq $4,%rcx
|
||
|
+.byte 0xf3,0x0f,0xa7,232
|
||
|
+ movdqa (%rax),%xmm0
|
||
|
+ movdqa %xmm0,-16(%rdx)
|
||
|
+.Lofb_exit:
|
||
|
+ movl $1,%eax
|
||
|
+ leaq 8(%rsp),%rsp
|
||
|
+.Lofb_abort:
|
||
|
+ popq %rbx
|
||
|
+ popq %rbp
|
||
|
+ movq 8(%rsp),%rdi
|
||
|
+ movq 16(%rsp),%rsi
|
||
|
+ .byte 0xf3,0xc3
|
||
|
+.LSEH_end_padlock_ofb_encrypt:
|
||
|
+.globl padlock_ctr32_encrypt
|
||
|
+.def padlock_ctr32_encrypt; .scl 2; .type 32; .endef
|
||
|
+.p2align 4
|
||
|
+padlock_ctr32_encrypt:
|
||
|
+ movq %rdi,8(%rsp)
|
||
|
+ movq %rsi,16(%rsp)
|
||
|
+ movq %rsp,%rax
|
||
|
+.LSEH_begin_padlock_ctr32_encrypt:
|
||
|
+ movq %rcx,%rdi
|
||
|
+ movq %rdx,%rsi
|
||
|
+ movq %r8,%rdx
|
||
|
+ movq %r9,%rcx
|
||
|
+
|
||
|
+ pushq %rbp
|
||
|
+ pushq %rbx
|
||
|
+
|
||
|
+ xorl %eax,%eax
|
||
|
+ testq $15,%rdx
|
||
|
+ jnz .Lctr32_abort
|
||
|
+ testq $15,%rcx
|
||
|
+ jnz .Lctr32_abort
|
||
|
+ leaq .Lpadlock_saved_context(%rip),%rax
|
||
|
+ pushf
|
||
|
+ cld
|
||
|
+ call _padlock_verify_ctx
|
||
|
+ leaq 16(%rdx),%rdx
|
||
|
+ xorl %eax,%eax
|
||
|
+ xorl %ebx,%ebx
|
||
|
+ testl $32,(%rdx)
|
||
|
+ jnz .Lctr32_aligned
|
||
|
+ testq $15,%rdi
|
||
|
+ setz %al
|
||
|
+ testq $15,%rsi
|
||
|
+ setz %bl
|
||
|
+ testl %ebx,%eax
|
||
|
+ jnz .Lctr32_aligned
|
||
|
+ negq %rax
|
||
|
+ movq $512,%rbx
|
||
|
+ notq %rax
|
||
|
+ leaq (%rsp),%rbp
|
||
|
+ cmpq %rbx,%rcx
|
||
|
+ cmovcq %rcx,%rbx
|
||
|
+ andq %rbx,%rax
|
||
|
+ movq %rcx,%rbx
|
||
|
+ negq %rax
|
||
|
+ andq $512-1,%rbx
|
||
|
+ leaq (%rax,%rbp,1),%rsp
|
||
|
+ movq $512,%rax
|
||
|
+ cmovzq %rax,%rbx
|
||
|
+.Lctr32_reenter:
|
||
|
+ movl -4(%rdx),%eax
|
||
|
+ bswapl %eax
|
||
|
+ negl %eax
|
||
|
+ andl $31,%eax
|
||
|
+ movq $512,%rbx
|
||
|
+ shll $4,%eax
|
||
|
+ cmovzq %rbx,%rax
|
||
|
+ cmpq %rax,%rcx
|
||
|
+ cmovaq %rax,%rbx
|
||
|
+ cmovbeq %rcx,%rbx
|
||
|
+ cmpq %rbx,%rcx
|
||
|
+ ja .Lctr32_loop
|
||
|
+ movq %rsi,%rax
|
||
|
+ cmpq %rsp,%rbp
|
||
|
+ cmoveq %rdi,%rax
|
||
|
+ addq %rcx,%rax
|
||
|
+ negq %rax
|
||
|
+ andq $4095,%rax
|
||
|
+ cmpq $32,%rax
|
||
|
+ movq $-32,%rax
|
||
|
+ cmovaeq %rbx,%rax
|
||
|
+ andq %rax,%rbx
|
||
|
+ jz .Lctr32_unaligned_tail
|
||
|
+ jmp .Lctr32_loop
|
||
|
+.p2align 4
|
||
|
+.Lctr32_loop:
|
||
|
+ cmpq %rcx,%rbx
|
||
|
+ cmovaq %rcx,%rbx
|
||
|
+ movq %rdi,%r8
|
||
|
+ movq %rsi,%r9
|
||
|
+ movq %rcx,%r10
|
||
|
+ movq %rbx,%rcx
|
||
|
+ movq %rbx,%r11
|
||
|
+ testq $15,%rdi
|
||
|
+ cmovnzq %rsp,%rdi
|
||
|
+ testq $15,%rsi
|
||
|
+ jz .Lctr32_inp_aligned
|
||
|
+ shrq $3,%rcx
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ subq %rbx,%rdi
|
||
|
+ movq %rbx,%rcx
|
||
|
+ movq %rdi,%rsi
|
||
|
+.Lctr32_inp_aligned:
|
||
|
+ leaq -16(%rdx),%rax
|
||
|
+ leaq 16(%rdx),%rbx
|
||
|
+ shrq $4,%rcx
|
||
|
+.byte 0xf3,0x0f,0xa7,216
|
||
|
+ movl -4(%rdx),%eax
|
||
|
+ testl $4294901760,%eax
|
||
|
+ jnz .Lctr32_no_carry
|
||
|
+ bswapl %eax
|
||
|
+ addl $65536,%eax
|
||
|
+ bswapl %eax
|
||
|
+ movl %eax,-4(%rdx)
|
||
|
+.Lctr32_no_carry:
|
||
|
+ movq %r8,%rdi
|
||
|
+ movq %r11,%rbx
|
||
|
+ testq $15,%rdi
|
||
|
+ jz .Lctr32_out_aligned
|
||
|
+ movq %rbx,%rcx
|
||
|
+ leaq (%rsp),%rsi
|
||
|
+ shrq $3,%rcx
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ subq %rbx,%rdi
|
||
|
+.Lctr32_out_aligned:
|
||
|
+ movq %r9,%rsi
|
||
|
+ movq %r10,%rcx
|
||
|
+ addq %rbx,%rdi
|
||
|
+ addq %rbx,%rsi
|
||
|
+ subq %rbx,%rcx
|
||
|
+ movq $512,%rbx
|
||
|
+ jz .Lctr32_break
|
||
|
+ cmpq %rbx,%rcx
|
||
|
+ jae .Lctr32_loop
|
||
|
+ movq %rcx,%rbx
|
||
|
+ movq %rsi,%rax
|
||
|
+ cmpq %rsp,%rbp
|
||
|
+ cmoveq %rdi,%rax
|
||
|
+ addq %rcx,%rax
|
||
|
+ negq %rax
|
||
|
+ andq $4095,%rax
|
||
|
+ cmpq $32,%rax
|
||
|
+ movq $-32,%rax
|
||
|
+ cmovaeq %rbx,%rax
|
||
|
+ andq %rax,%rbx
|
||
|
+ jnz .Lctr32_loop
|
||
|
+.Lctr32_unaligned_tail:
|
||
|
+ xorl %eax,%eax
|
||
|
+ cmpq %rsp,%rbp
|
||
|
+ cmoveq %rcx,%rax
|
||
|
+ movq %rdi,%r8
|
||
|
+ movq %rcx,%rbx
|
||
|
+ subq %rax,%rsp
|
||
|
+ shrq $3,%rcx
|
||
|
+ leaq (%rsp),%rdi
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ movq %rsp,%rsi
|
||
|
+ movq %r8,%rdi
|
||
|
+ movq %rbx,%rcx
|
||
|
+ jmp .Lctr32_loop
|
||
|
+.p2align 4
|
||
|
+.Lctr32_break:
|
||
|
+ cmpq %rbp,%rsp
|
||
|
+ je .Lctr32_done
|
||
|
+
|
||
|
+ pxor %xmm0,%xmm0
|
||
|
+ leaq (%rsp),%rax
|
||
|
+.Lctr32_bzero:
|
||
|
+ movaps %xmm0,(%rax)
|
||
|
+ leaq 16(%rax),%rax
|
||
|
+ cmpq %rax,%rbp
|
||
|
+ ja .Lctr32_bzero
|
||
|
+
|
||
|
+.Lctr32_done:
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ jmp .Lctr32_exit
|
||
|
+
|
||
|
+.p2align 4
|
||
|
+.Lctr32_aligned:
|
||
|
+ movl -4(%rdx),%eax
|
||
|
+ bswapl %eax
|
||
|
+ negl %eax
|
||
|
+ andl $65535,%eax
|
||
|
+ movq $1048576,%rbx
|
||
|
+ shll $4,%eax
|
||
|
+ cmovzq %rbx,%rax
|
||
|
+ cmpq %rax,%rcx
|
||
|
+ cmovaq %rax,%rbx
|
||
|
+ cmovbeq %rcx,%rbx
|
||
|
+ jbe .Lctr32_aligned_skip
|
||
|
+
|
||
|
+.Lctr32_aligned_loop:
|
||
|
+ movq %rcx,%r10
|
||
|
+ movq %rbx,%rcx
|
||
|
+ movq %rbx,%r11
|
||
|
+
|
||
|
+ leaq -16(%rdx),%rax
|
||
|
+ leaq 16(%rdx),%rbx
|
||
|
+ shrq $4,%rcx
|
||
|
+.byte 0xf3,0x0f,0xa7,216
|
||
|
+
|
||
|
+ movl -4(%rdx),%eax
|
||
|
+ bswapl %eax
|
||
|
+ addl $65536,%eax
|
||
|
+ bswapl %eax
|
||
|
+ movl %eax,-4(%rdx)
|
||
|
+
|
||
|
+ movq %r10,%rcx
|
||
|
+ subq %r11,%rcx
|
||
|
+ movq $1048576,%rbx
|
||
|
+ jz .Lctr32_exit
|
||
|
+ cmpq %rbx,%rcx
|
||
|
+ jae .Lctr32_aligned_loop
|
||
|
+
|
||
|
+.Lctr32_aligned_skip:
|
||
|
+ leaq (%rsi,%rcx,1),%rbp
|
||
|
+ negq %rbp
|
||
|
+ andq $4095,%rbp
|
||
|
+ xorl %eax,%eax
|
||
|
+ cmpq $32,%rbp
|
||
|
+ movq $32-1,%rbp
|
||
|
+ cmovaeq %rax,%rbp
|
||
|
+ andq %rcx,%rbp
|
||
|
+ subq %rbp,%rcx
|
||
|
+ jz .Lctr32_aligned_tail
|
||
|
+ leaq -16(%rdx),%rax
|
||
|
+ leaq 16(%rdx),%rbx
|
||
|
+ shrq $4,%rcx
|
||
|
+.byte 0xf3,0x0f,0xa7,216
|
||
|
+ testq %rbp,%rbp
|
||
|
+ jz .Lctr32_exit
|
||
|
+
|
||
|
+.Lctr32_aligned_tail:
|
||
|
+ movq %rdi,%r8
|
||
|
+ movq %rbp,%rbx
|
||
|
+ movq %rbp,%rcx
|
||
|
+ leaq (%rsp),%rbp
|
||
|
+ subq %rcx,%rsp
|
||
|
+ shrq $3,%rcx
|
||
|
+ leaq (%rsp),%rdi
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ leaq (%r8),%rdi
|
||
|
+ leaq (%rsp),%rsi
|
||
|
+ movq %rbx,%rcx
|
||
|
+ jmp .Lctr32_loop
|
||
|
+.Lctr32_exit:
|
||
|
+ movl $1,%eax
|
||
|
+ leaq 8(%rsp),%rsp
|
||
|
+.Lctr32_abort:
|
||
|
+ popq %rbx
|
||
|
+ popq %rbp
|
||
|
+ movq 8(%rsp),%rdi
|
||
|
+ movq 16(%rsp),%rsi
|
||
|
+ .byte 0xf3,0xc3
|
||
|
+.LSEH_end_padlock_ctr32_encrypt:
|
||
|
.byte 86,73,65,32,80,97,100,108,111,99,107,32,120,56,54,95,54,52,32,109,111,100,117,108,101,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
|
||
|
.p2align 4
|
||
|
.data
|
||
|
diff --git a/lib/accelerated/x86/coff/padlock-x86-coff.s b/lib/accelerated/x86/coff/padlock-x86-coff.s
|
||
|
index 69eb468..d969f30 100644
|
||
|
--- a/lib/accelerated/x86/coff/padlock-x86-coff.s
|
||
|
+++ b/lib/accelerated/x86/coff/padlock-x86-coff.s
|
||
|
@@ -515,6 +515,354 @@ _padlock_cbc_encrypt:
|
||
|
popl %ebx
|
||
|
popl %ebp
|
||
|
ret
|
||
|
+.globl _padlock_cfb_encrypt
|
||
|
+.def _padlock_cfb_encrypt; .scl 2; .type 32; .endef
|
||
|
+.align 16
|
||
|
+_padlock_cfb_encrypt:
|
||
|
+.L_padlock_cfb_encrypt_begin:
|
||
|
+ pushl %ebp
|
||
|
+ pushl %ebx
|
||
|
+ pushl %esi
|
||
|
+ pushl %edi
|
||
|
+ movl 20(%esp),%edi
|
||
|
+ movl 24(%esp),%esi
|
||
|
+ movl 28(%esp),%edx
|
||
|
+ movl 32(%esp),%ecx
|
||
|
+ testl $15,%edx
|
||
|
+ jnz .L028cfb_abort
|
||
|
+ testl $15,%ecx
|
||
|
+ jnz .L028cfb_abort
|
||
|
+ leal .Lpadlock_saved_context,%eax
|
||
|
+ pushfl
|
||
|
+ cld
|
||
|
+ call __padlock_verify_ctx
|
||
|
+.L029cfb_pic_point:
|
||
|
+ leal 16(%edx),%edx
|
||
|
+ xorl %eax,%eax
|
||
|
+ xorl %ebx,%ebx
|
||
|
+ testl $32,(%edx)
|
||
|
+ jnz .L030cfb_aligned
|
||
|
+ testl $15,%edi
|
||
|
+ setz %al
|
||
|
+ testl $15,%esi
|
||
|
+ setz %bl
|
||
|
+ testl %ebx,%eax
|
||
|
+ jnz .L030cfb_aligned
|
||
|
+ negl %eax
|
||
|
+ movl $512,%ebx
|
||
|
+ notl %eax
|
||
|
+ leal -24(%esp),%ebp
|
||
|
+ cmpl %ebx,%ecx
|
||
|
+ cmovcl %ecx,%ebx
|
||
|
+ andl %ebx,%eax
|
||
|
+ movl %ecx,%ebx
|
||
|
+ negl %eax
|
||
|
+ andl $511,%ebx
|
||
|
+ leal (%eax,%ebp,1),%esp
|
||
|
+ movl $512,%eax
|
||
|
+ cmovzl %eax,%ebx
|
||
|
+ movl %ebp,%eax
|
||
|
+ andl $-16,%ebp
|
||
|
+ andl $-16,%esp
|
||
|
+ movl %eax,16(%ebp)
|
||
|
+ jmp .L031cfb_loop
|
||
|
+.align 16
|
||
|
+.L031cfb_loop:
|
||
|
+ movl %edi,(%ebp)
|
||
|
+ movl %esi,4(%ebp)
|
||
|
+ movl %ecx,8(%ebp)
|
||
|
+ movl %ebx,%ecx
|
||
|
+ movl %ebx,12(%ebp)
|
||
|
+ testl $15,%edi
|
||
|
+ cmovnzl %esp,%edi
|
||
|
+ testl $15,%esi
|
||
|
+ jz .L032cfb_inp_aligned
|
||
|
+ shrl $2,%ecx
|
||
|
+.byte 243,165
|
||
|
+ subl %ebx,%edi
|
||
|
+ movl %ebx,%ecx
|
||
|
+ movl %edi,%esi
|
||
|
+.L032cfb_inp_aligned:
|
||
|
+ leal -16(%edx),%eax
|
||
|
+ leal 16(%edx),%ebx
|
||
|
+ shrl $4,%ecx
|
||
|
+.byte 243,15,167,224
|
||
|
+ movaps (%eax),%xmm0
|
||
|
+ movaps %xmm0,-16(%edx)
|
||
|
+ movl (%ebp),%edi
|
||
|
+ movl 12(%ebp),%ebx
|
||
|
+ testl $15,%edi
|
||
|
+ jz .L033cfb_out_aligned
|
||
|
+ movl %ebx,%ecx
|
||
|
+ leal (%esp),%esi
|
||
|
+ shrl $2,%ecx
|
||
|
+.byte 243,165
|
||
|
+ subl %ebx,%edi
|
||
|
+.L033cfb_out_aligned:
|
||
|
+ movl 4(%ebp),%esi
|
||
|
+ movl 8(%ebp),%ecx
|
||
|
+ addl %ebx,%edi
|
||
|
+ addl %ebx,%esi
|
||
|
+ subl %ebx,%ecx
|
||
|
+ movl $512,%ebx
|
||
|
+ jnz .L031cfb_loop
|
||
|
+ cmpl %ebp,%esp
|
||
|
+ je .L034cfb_done
|
||
|
+ pxor %xmm0,%xmm0
|
||
|
+ leal (%esp),%eax
|
||
|
+.L035cfb_bzero:
|
||
|
+ movaps %xmm0,(%eax)
|
||
|
+ leal 16(%eax),%eax
|
||
|
+ cmpl %eax,%ebp
|
||
|
+ ja .L035cfb_bzero
|
||
|
+.L034cfb_done:
|
||
|
+ movl 16(%ebp),%ebp
|
||
|
+ leal 24(%ebp),%esp
|
||
|
+ jmp .L036cfb_exit
|
||
|
+.align 16
|
||
|
+.L030cfb_aligned:
|
||
|
+ leal -16(%edx),%eax
|
||
|
+ leal 16(%edx),%ebx
|
||
|
+ shrl $4,%ecx
|
||
|
+.byte 243,15,167,224
|
||
|
+ movaps (%eax),%xmm0
|
||
|
+ movaps %xmm0,-16(%edx)
|
||
|
+.L036cfb_exit:
|
||
|
+ movl $1,%eax
|
||
|
+ leal 4(%esp),%esp
|
||
|
+.L028cfb_abort:
|
||
|
+ popl %edi
|
||
|
+ popl %esi
|
||
|
+ popl %ebx
|
||
|
+ popl %ebp
|
||
|
+ ret
|
||
|
+.globl _padlock_ofb_encrypt
|
||
|
+.def _padlock_ofb_encrypt; .scl 2; .type 32; .endef
|
||
|
+.align 16
|
||
|
+_padlock_ofb_encrypt:
|
||
|
+.L_padlock_ofb_encrypt_begin:
|
||
|
+ pushl %ebp
|
||
|
+ pushl %ebx
|
||
|
+ pushl %esi
|
||
|
+ pushl %edi
|
||
|
+ movl 20(%esp),%edi
|
||
|
+ movl 24(%esp),%esi
|
||
|
+ movl 28(%esp),%edx
|
||
|
+ movl 32(%esp),%ecx
|
||
|
+ testl $15,%edx
|
||
|
+ jnz .L037ofb_abort
|
||
|
+ testl $15,%ecx
|
||
|
+ jnz .L037ofb_abort
|
||
|
+ leal .Lpadlock_saved_context,%eax
|
||
|
+ pushfl
|
||
|
+ cld
|
||
|
+ call __padlock_verify_ctx
|
||
|
+.L038ofb_pic_point:
|
||
|
+ leal 16(%edx),%edx
|
||
|
+ xorl %eax,%eax
|
||
|
+ xorl %ebx,%ebx
|
||
|
+ testl $32,(%edx)
|
||
|
+ jnz .L039ofb_aligned
|
||
|
+ testl $15,%edi
|
||
|
+ setz %al
|
||
|
+ testl $15,%esi
|
||
|
+ setz %bl
|
||
|
+ testl %ebx,%eax
|
||
|
+ jnz .L039ofb_aligned
|
||
|
+ negl %eax
|
||
|
+ movl $512,%ebx
|
||
|
+ notl %eax
|
||
|
+ leal -24(%esp),%ebp
|
||
|
+ cmpl %ebx,%ecx
|
||
|
+ cmovcl %ecx,%ebx
|
||
|
+ andl %ebx,%eax
|
||
|
+ movl %ecx,%ebx
|
||
|
+ negl %eax
|
||
|
+ andl $511,%ebx
|
||
|
+ leal (%eax,%ebp,1),%esp
|
||
|
+ movl $512,%eax
|
||
|
+ cmovzl %eax,%ebx
|
||
|
+ movl %ebp,%eax
|
||
|
+ andl $-16,%ebp
|
||
|
+ andl $-16,%esp
|
||
|
+ movl %eax,16(%ebp)
|
||
|
+ jmp .L040ofb_loop
|
||
|
+.align 16
|
||
|
+.L040ofb_loop:
|
||
|
+ movl %edi,(%ebp)
|
||
|
+ movl %esi,4(%ebp)
|
||
|
+ movl %ecx,8(%ebp)
|
||
|
+ movl %ebx,%ecx
|
||
|
+ movl %ebx,12(%ebp)
|
||
|
+ testl $15,%edi
|
||
|
+ cmovnzl %esp,%edi
|
||
|
+ testl $15,%esi
|
||
|
+ jz .L041ofb_inp_aligned
|
||
|
+ shrl $2,%ecx
|
||
|
+.byte 243,165
|
||
|
+ subl %ebx,%edi
|
||
|
+ movl %ebx,%ecx
|
||
|
+ movl %edi,%esi
|
||
|
+.L041ofb_inp_aligned:
|
||
|
+ leal -16(%edx),%eax
|
||
|
+ leal 16(%edx),%ebx
|
||
|
+ shrl $4,%ecx
|
||
|
+.byte 243,15,167,232
|
||
|
+ movaps (%eax),%xmm0
|
||
|
+ movaps %xmm0,-16(%edx)
|
||
|
+ movl (%ebp),%edi
|
||
|
+ movl 12(%ebp),%ebx
|
||
|
+ testl $15,%edi
|
||
|
+ jz .L042ofb_out_aligned
|
||
|
+ movl %ebx,%ecx
|
||
|
+ leal (%esp),%esi
|
||
|
+ shrl $2,%ecx
|
||
|
+.byte 243,165
|
||
|
+ subl %ebx,%edi
|
||
|
+.L042ofb_out_aligned:
|
||
|
+ movl 4(%ebp),%esi
|
||
|
+ movl 8(%ebp),%ecx
|
||
|
+ addl %ebx,%edi
|
||
|
+ addl %ebx,%esi
|
||
|
+ subl %ebx,%ecx
|
||
|
+ movl $512,%ebx
|
||
|
+ jnz .L040ofb_loop
|
||
|
+ cmpl %ebp,%esp
|
||
|
+ je .L043ofb_done
|
||
|
+ pxor %xmm0,%xmm0
|
||
|
+ leal (%esp),%eax
|
||
|
+.L044ofb_bzero:
|
||
|
+ movaps %xmm0,(%eax)
|
||
|
+ leal 16(%eax),%eax
|
||
|
+ cmpl %eax,%ebp
|
||
|
+ ja .L044ofb_bzero
|
||
|
+.L043ofb_done:
|
||
|
+ movl 16(%ebp),%ebp
|
||
|
+ leal 24(%ebp),%esp
|
||
|
+ jmp .L045ofb_exit
|
||
|
+.align 16
|
||
|
+.L039ofb_aligned:
|
||
|
+ leal -16(%edx),%eax
|
||
|
+ leal 16(%edx),%ebx
|
||
|
+ shrl $4,%ecx
|
||
|
+.byte 243,15,167,232
|
||
|
+ movaps (%eax),%xmm0
|
||
|
+ movaps %xmm0,-16(%edx)
|
||
|
+.L045ofb_exit:
|
||
|
+ movl $1,%eax
|
||
|
+ leal 4(%esp),%esp
|
||
|
+.L037ofb_abort:
|
||
|
+ popl %edi
|
||
|
+ popl %esi
|
||
|
+ popl %ebx
|
||
|
+ popl %ebp
|
||
|
+ ret
|
||
|
+.globl _padlock_ctr32_encrypt
|
||
|
+.def _padlock_ctr32_encrypt; .scl 2; .type 32; .endef
|
||
|
+.align 16
|
||
|
+_padlock_ctr32_encrypt:
|
||
|
+.L_padlock_ctr32_encrypt_begin:
|
||
|
+ pushl %ebp
|
||
|
+ pushl %ebx
|
||
|
+ pushl %esi
|
||
|
+ pushl %edi
|
||
|
+ movl 20(%esp),%edi
|
||
|
+ movl 24(%esp),%esi
|
||
|
+ movl 28(%esp),%edx
|
||
|
+ movl 32(%esp),%ecx
|
||
|
+ testl $15,%edx
|
||
|
+ jnz .L046ctr32_abort
|
||
|
+ testl $15,%ecx
|
||
|
+ jnz .L046ctr32_abort
|
||
|
+ leal .Lpadlock_saved_context,%eax
|
||
|
+ pushfl
|
||
|
+ cld
|
||
|
+ call __padlock_verify_ctx
|
||
|
+.L047ctr32_pic_point:
|
||
|
+ leal 16(%edx),%edx
|
||
|
+ xorl %eax,%eax
|
||
|
+ movq -16(%edx),%mm0
|
||
|
+ movl $512,%ebx
|
||
|
+ notl %eax
|
||
|
+ leal -24(%esp),%ebp
|
||
|
+ cmpl %ebx,%ecx
|
||
|
+ cmovcl %ecx,%ebx
|
||
|
+ andl %ebx,%eax
|
||
|
+ movl %ecx,%ebx
|
||
|
+ negl %eax
|
||
|
+ andl $511,%ebx
|
||
|
+ leal (%eax,%ebp,1),%esp
|
||
|
+ movl $512,%eax
|
||
|
+ cmovzl %eax,%ebx
|
||
|
+ movl %ebp,%eax
|
||
|
+ andl $-16,%ebp
|
||
|
+ andl $-16,%esp
|
||
|
+ movl %eax,16(%ebp)
|
||
|
+ jmp .L048ctr32_loop
|
||
|
+.align 16
|
||
|
+.L048ctr32_loop:
|
||
|
+ movl %edi,(%ebp)
|
||
|
+ movl %esi,4(%ebp)
|
||
|
+ movl %ecx,8(%ebp)
|
||
|
+ movl %ebx,%ecx
|
||
|
+ movl %ebx,12(%ebp)
|
||
|
+ movl -4(%edx),%ecx
|
||
|
+ xorl %edi,%edi
|
||
|
+ movl -8(%edx),%eax
|
||
|
+.L049ctr32_prepare:
|
||
|
+ movl %ecx,12(%esp,%edi,1)
|
||
|
+ bswap %ecx
|
||
|
+ movq %mm0,(%esp,%edi,1)
|
||
|
+ incl %ecx
|
||
|
+ movl %eax,8(%esp,%edi,1)
|
||
|
+ bswap %ecx
|
||
|
+ leal 16(%edi),%edi
|
||
|
+ cmpl %ebx,%edi
|
||
|
+ jb .L049ctr32_prepare
|
||
|
+ movl %ecx,-4(%edx)
|
||
|
+ leal (%esp),%esi
|
||
|
+ leal (%esp),%edi
|
||
|
+ movl %ebx,%ecx
|
||
|
+ leal -16(%edx),%eax
|
||
|
+ leal 16(%edx),%ebx
|
||
|
+ shrl $4,%ecx
|
||
|
+.byte 243,15,167,200
|
||
|
+ movl (%ebp),%edi
|
||
|
+ movl 12(%ebp),%ebx
|
||
|
+ movl 4(%ebp),%esi
|
||
|
+ xorl %ecx,%ecx
|
||
|
+.L050ctr32_xor:
|
||
|
+ movups (%esi,%ecx,1),%xmm1
|
||
|
+ leal 16(%ecx),%ecx
|
||
|
+ pxor -16(%esp,%ecx,1),%xmm1
|
||
|
+ movups %xmm1,-16(%edi,%ecx,1)
|
||
|
+ cmpl %ebx,%ecx
|
||
|
+ jb .L050ctr32_xor
|
||
|
+ movl 8(%ebp),%ecx
|
||
|
+ addl %ebx,%edi
|
||
|
+ addl %ebx,%esi
|
||
|
+ subl %ebx,%ecx
|
||
|
+ movl $512,%ebx
|
||
|
+ jnz .L048ctr32_loop
|
||
|
+ pxor %xmm0,%xmm0
|
||
|
+ leal (%esp),%eax
|
||
|
+.L051ctr32_bzero:
|
||
|
+ movaps %xmm0,(%eax)
|
||
|
+ leal 16(%eax),%eax
|
||
|
+ cmpl %eax,%ebp
|
||
|
+ ja .L051ctr32_bzero
|
||
|
+.L052ctr32_done:
|
||
|
+ movl 16(%ebp),%ebp
|
||
|
+ leal 24(%ebp),%esp
|
||
|
+ movl $1,%eax
|
||
|
+ leal 4(%esp),%esp
|
||
|
+ emms
|
||
|
+.L046ctr32_abort:
|
||
|
+ popl %edi
|
||
|
+ popl %esi
|
||
|
+ popl %ebx
|
||
|
+ popl %ebp
|
||
|
+ ret
|
||
|
.globl _padlock_xstore
|
||
|
.def _padlock_xstore; .scl 2; .type 32; .endef
|
||
|
.align 16
|
||
|
@@ -533,10 +881,10 @@ __win32_segv_handler:
|
||
|
movl 4(%esp),%edx
|
||
|
movl 12(%esp),%ecx
|
||
|
cmpl $3221225477,(%edx)
|
||
|
- jne .L028ret
|
||
|
+ jne .L053ret
|
||
|
addl $4,184(%ecx)
|
||
|
movl $0,%eax
|
||
|
-.L028ret:
|
||
|
+.L053ret:
|
||
|
ret
|
||
|
.globl _padlock_sha1_oneshot
|
||
|
.def _padlock_sha1_oneshot; .scl 2; .type 32; .endef
|
||
|
diff --git a/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s b/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s
|
||
|
index 8f2b96f..9755951 100644
|
||
|
--- a/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s
|
||
|
+++ b/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s
|
||
|
@@ -697,6 +697,7 @@ gcm_ghash_4bit:
|
||
|
.type gcm_init_clmul,@function
|
||
|
.align 16
|
||
|
gcm_init_clmul:
|
||
|
+.L_init_clmul:
|
||
|
movdqu (%rsi),%xmm2
|
||
|
pshufd $78,%xmm2,%xmm2
|
||
|
|
||
|
@@ -715,15 +716,15 @@ gcm_init_clmul:
|
||
|
pxor %xmm5,%xmm2
|
||
|
|
||
|
|
||
|
+ pshufd $78,%xmm2,%xmm6
|
||
|
movdqa %xmm2,%xmm0
|
||
|
+ pxor %xmm2,%xmm6
|
||
|
movdqa %xmm0,%xmm1
|
||
|
pshufd $78,%xmm0,%xmm3
|
||
|
- pshufd $78,%xmm2,%xmm4
|
||
|
pxor %xmm0,%xmm3
|
||
|
- pxor %xmm2,%xmm4
|
||
|
.byte 102,15,58,68,194,0
|
||
|
.byte 102,15,58,68,202,17
|
||
|
-.byte 102,15,58,68,220,0
|
||
|
+.byte 102,15,58,68,222,0
|
||
|
pxor %xmm0,%xmm3
|
||
|
pxor %xmm1,%xmm3
|
||
|
|
||
|
@@ -733,44 +734,134 @@ gcm_init_clmul:
|
||
|
pxor %xmm3,%xmm1
|
||
|
pxor %xmm4,%xmm0
|
||
|
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
movdqa %xmm0,%xmm3
|
||
|
+ psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
psllq $1,%xmm0
|
||
|
pxor %xmm3,%xmm0
|
||
|
+ psllq $57,%xmm0
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
+ pslldq $8,%xmm0
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
+ psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ psrlq $1,%xmm0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
+ pshufd $78,%xmm2,%xmm3
|
||
|
+ pshufd $78,%xmm0,%xmm4
|
||
|
+ pxor %xmm2,%xmm3
|
||
|
+ movdqu %xmm2,0(%rdi)
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ movdqu %xmm0,16(%rdi)
|
||
|
+.byte 102,15,58,15,227,8
|
||
|
+ movdqu %xmm4,32(%rdi)
|
||
|
+ movdqa %xmm0,%xmm1
|
||
|
+ pshufd $78,%xmm0,%xmm3
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+.byte 102,15,58,68,194,0
|
||
|
+.byte 102,15,58,68,202,17
|
||
|
+.byte 102,15,58,68,222,0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ pxor %xmm1,%xmm3
|
||
|
+
|
||
|
+ movdqa %xmm3,%xmm4
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pslldq $8,%xmm4
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ psllq $1,%xmm0
|
||
|
pxor %xmm3,%xmm0
|
||
|
psllq $57,%xmm0
|
||
|
- movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
pslldq $8,%xmm0
|
||
|
- psrldq $8,%xmm4
|
||
|
- pxor %xmm3,%xmm0
|
||
|
- pxor %xmm4,%xmm1
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
|
||
|
|
||
|
movdqa %xmm0,%xmm4
|
||
|
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
psrlq $5,%xmm0
pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- movdqu %xmm2,(%rdi)
- movdqu %xmm0,16(%rdi)
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm5,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm5,%xmm3
+ movdqu %xmm5,48(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,64(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,80(%rdi)
.byte 0xf3,0xc3
.size gcm_init_clmul,.-gcm_init_clmul
.globl gcm_gmult_clmul
.type gcm_gmult_clmul,@function
.align 16
gcm_gmult_clmul:
+.L_gmult_clmul:
movdqu (%rdi),%xmm0
movdqa .Lbswap_mask(%rip),%xmm5
movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
@@ -783,186 +874,358 @@ gcm_gmult_clmul:
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0

+ movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1


movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
.byte 102,15,56,0,197
movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
.size gcm_gmult_clmul,.-gcm_gmult_clmul
.globl gcm_ghash_clmul
|
||
|
.type gcm_ghash_clmul,@function
|
||
|
-.align 16
|
||
|
+.align 32
|
||
|
gcm_ghash_clmul:
|
||
|
+.L_ghash_clmul:
|
||
|
movdqa .Lbswap_mask(%rip),%xmm5
|
||
|
+ movq $11547335547999543296,%rax
|
||
|
|
||
|
movdqu (%rdi),%xmm0
|
||
|
movdqu (%rsi),%xmm2
|
||
|
+ movdqu 32(%rsi),%xmm10
|
||
|
.byte 102,15,56,0,197
|
||
|
|
||
|
subq $16,%rcx
|
||
|
jz .Lodd_tail
|
||
|
|
||
|
- movdqu 16(%rsi),%xmm8
|
||
|
+ movdqu 16(%rsi),%xmm9
|
||
|
+ cmpq $48,%rcx
|
||
|
+ jb .Lskip4x
|
||
|
|
||
|
+ subq $48,%rcx
|
||
|
+ movdqu 48(%rsi),%xmm14
|
||
|
+ movdqu 64(%rsi),%xmm15
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
- movdqu (%rdx),%xmm3
|
||
|
- movdqu 16(%rdx),%xmm6
|
||
|
-.byte 102,15,56,0,221
|
||
|
+ movdqu 48(%rdx),%xmm6
|
||
|
+ movdqu 32(%rdx),%xmm11
|
||
|
.byte 102,15,56,0,245
|
||
|
- pxor %xmm3,%xmm0
|
||
|
- movdqa %xmm6,%xmm7
|
||
|
- pshufd $78,%xmm6,%xmm3
|
||
|
- pshufd $78,%xmm2,%xmm4
|
||
|
- pxor %xmm6,%xmm3
|
||
|
- pxor %xmm2,%xmm4
|
||
|
+.byte 102,68,15,56,0,221
|
||
|
+ movdqa %xmm6,%xmm8
|
||
|
+ pshufd $78,%xmm6,%xmm7
|
||
|
+ pxor %xmm6,%xmm7
|
||
|
.byte 102,15,58,68,242,0
|
||
|
-.byte 102,15,58,68,250,17
|
||
|
-.byte 102,15,58,68,220,0
|
||
|
- pxor %xmm6,%xmm3
|
||
|
- pxor %xmm7,%xmm3
|
||
|
+.byte 102,68,15,58,68,194,17
|
||
|
+.byte 102,65,15,58,68,250,0
|
||
|
|
||
|
+ movdqa %xmm11,%xmm13
|
||
|
+ pshufd $78,%xmm11,%xmm12
|
||
|
+ pxor %xmm11,%xmm12
|
||
|
+.byte 102,69,15,58,68,217,0
|
||
|
+.byte 102,69,15,58,68,233,17
|
||
|
+ xorps %xmm11,%xmm6
|
||
|
+.byte 102,69,15,58,68,226,16
|
||
|
+ xorps %xmm13,%xmm8
|
||
|
+ movups 80(%rsi),%xmm10
|
||
|
+ xorps %xmm12,%xmm7
|
||
|
+
|
||
|
+ movdqu 16(%rdx),%xmm11
|
||
|
+ movdqu 0(%rdx),%xmm3
|
||
|
+.byte 102,68,15,56,0,221
|
||
|
+.byte 102,15,56,0,221
|
||
|
+ movdqa %xmm11,%xmm13
|
||
|
+ pshufd $78,%xmm11,%xmm12
|
||
|
+ pxor %xmm3,%xmm0
|
||
|
+ pxor %xmm11,%xmm12
|
||
|
+.byte 102,69,15,58,68,222,0
|
||
|
+ movdqa %xmm0,%xmm1
|
||
|
+ pshufd $78,%xmm0,%xmm3
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+.byte 102,69,15,58,68,238,17
|
||
|
+ xorps %xmm11,%xmm6
|
||
|
+.byte 102,69,15,58,68,226,0
|
||
|
+ xorps %xmm13,%xmm8
|
||
|
+
|
||
|
+ leaq 64(%rdx),%rdx
|
||
|
+ subq $64,%rcx
|
||
|
+ jc .Ltail4x
|
||
|
+
|
||
|
+ jmp .Lmod4_loop
|
||
|
+.align 32
|
||
|
+.Lmod4_loop:
|
||
|
+.byte 102,65,15,58,68,199,0
|
||
|
+ xorps %xmm12,%xmm7
|
||
|
+ movdqu 48(%rdx),%xmm11
|
||
|
+.byte 102,68,15,56,0,221
|
||
|
+.byte 102,65,15,58,68,207,17
|
||
|
+ xorps %xmm6,%xmm0
|
||
|
+ movdqu 32(%rdx),%xmm6
|
||
|
+ movdqa %xmm11,%xmm13
|
||
|
+ pshufd $78,%xmm11,%xmm12
|
||
|
+.byte 102,65,15,58,68,218,16
|
||
|
+ xorps %xmm8,%xmm1
|
||
|
+ pxor %xmm11,%xmm12
|
||
|
+.byte 102,15,56,0,245
|
||
|
+ movups 32(%rsi),%xmm10
|
||
|
+.byte 102,68,15,58,68,218,0
|
||
|
+ xorps %xmm7,%xmm3
|
||
|
+ movdqa %xmm6,%xmm8
|
||
|
+ pshufd $78,%xmm6,%xmm7
|
||
|
+
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ pxor %xmm6,%xmm7
|
||
|
+ pxor %xmm1,%xmm3
|
||
|
movdqa %xmm3,%xmm4
|
||
|
- psrldq $8,%xmm3
|
||
|
+ pslldq $8,%xmm3
|
||
|
+.byte 102,68,15,58,68,234,17
|
||
|
+ psrldq $8,%xmm4
|
||
|
+ pxor %xmm3,%xmm0
|
||
|
+ movdqa .L7_mask(%rip),%xmm3
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+.byte 102,72,15,110,224
|
||
|
+
|
||
|
+ pand %xmm0,%xmm3
|
||
|
+.byte 102,15,56,0,227
|
||
|
+.byte 102,69,15,58,68,226,0
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psllq $57,%xmm4
|
||
|
+ movdqa %xmm4,%xmm3
|
||
|
pslldq $8,%xmm4
|
||
|
- pxor %xmm3,%xmm7
|
||
|
- pxor %xmm4,%xmm6
|
||
|
+.byte 102,65,15,58,68,241,0
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+ movdqu 0(%rdx),%xmm3
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
+ psrlq $1,%xmm0
|
||
|
+.byte 102,69,15,58,68,193,17
|
||
|
+ xorps %xmm11,%xmm6
|
||
|
+ movdqu 16(%rdx),%xmm11
|
||
|
+.byte 102,68,15,56,0,221
|
||
|
+.byte 102,65,15,58,68,250,16
|
||
|
+ xorps %xmm13,%xmm8
|
||
|
+ movups 80(%rsi),%xmm10
|
||
|
+.byte 102,15,56,0,221
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
+
|
||
|
+ movdqa %xmm11,%xmm13
|
||
|
+ pxor %xmm12,%xmm7
|
||
|
+ pshufd $78,%xmm11,%xmm12
|
||
|
+ pxor %xmm11,%xmm12
|
||
|
+.byte 102,69,15,58,68,222,0
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+ psrlq $1,%xmm0
|
||
|
+.byte 102,69,15,58,68,238,17
|
||
|
+ xorps %xmm11,%xmm6
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
+
|
||
|
+.byte 102,69,15,58,68,226,0
|
||
|
+ xorps %xmm13,%xmm8
|
||
|
+
|
||
|
movdqa %xmm0,%xmm1
|
||
|
pshufd $78,%xmm0,%xmm3
|
||
|
- pshufd $78,%xmm8,%xmm4
|
||
|
pxor %xmm0,%xmm3
|
||
|
- pxor %xmm8,%xmm4
|
||
|
|
||
|
- leaq 32(%rdx),%rdx
|
||
|
- subq $32,%rcx
|
||
|
- jbe .Leven_tail
|
||
|
+ leaq 64(%rdx),%rdx
|
||
|
+ subq $64,%rcx
|
||
|
+ jnc .Lmod4_loop
|
||
|
+
|
||
|
+.Ltail4x:
|
||
|
+.byte 102,65,15,58,68,199,0
|
||
|
+ xorps %xmm12,%xmm7
|
||
|
+.byte 102,65,15,58,68,207,17
|
||
|
+ xorps %xmm6,%xmm0
|
||
|
+.byte 102,65,15,58,68,218,16
|
||
|
+ xorps %xmm8,%xmm1
|
||
|
+ pxor %xmm0,%xmm1
|
||
|
+ pxor %xmm7,%xmm3
|
||
|
|
||
|
-.Lmod_loop:
|
||
|
-.byte 102,65,15,58,68,192,0
|
||
|
-.byte 102,65,15,58,68,200,17
|
||
|
-.byte 102,15,58,68,220,0
|
||
|
- pxor %xmm0,%xmm3
|
||
|
pxor %xmm1,%xmm3
|
||
|
+ pxor %xmm0,%xmm1
|
||
|
|
||
|
movdqa %xmm3,%xmm4
|
||
|
psrldq $8,%xmm3
|
||
|
pslldq $8,%xmm4
|
||
|
pxor %xmm3,%xmm1
|
||
|
pxor %xmm4,%xmm0
|
||
|
- movdqu (%rdx),%xmm3
|
||
|
- pxor %xmm6,%xmm0
|
||
|
- pxor %xmm7,%xmm1
|
||
|
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
+ psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ psllq $1,%xmm0
|
||
|
+ pxor %xmm3,%xmm0
|
||
|
+ psllq $57,%xmm0
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
+ pslldq $8,%xmm0
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
+ psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ psrlq $1,%xmm0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
+ addq $64,%rcx
|
||
|
+ jz .Ldone
|
||
|
+ movdqu 32(%rsi),%xmm10
|
||
|
+ subq $16,%rcx
|
||
|
+ jz .Lodd_tail
|
||
|
+.Lskip4x:
|
||
|
+
|
||
|
+
|
||
|
+
|
||
|
+
|
||
|
+
|
||
|
+ movdqu (%rdx),%xmm3
|
||
|
movdqu 16(%rdx),%xmm6
|
||
|
.byte 102,15,56,0,221
|
||
|
.byte 102,15,56,0,245
|
||
|
+ pxor %xmm3,%xmm0
|
||
|
+
|
||
|
+ movdqa %xmm6,%xmm8
|
||
|
+ pshufd $78,%xmm6,%xmm3
|
||
|
+ pxor %xmm6,%xmm3
|
||
|
+.byte 102,15,58,68,242,0
|
||
|
+.byte 102,68,15,58,68,194,17
|
||
|
+.byte 102,65,15,58,68,218,0
|
||
|
+
|
||
|
+ leaq 32(%rdx),%rdx
|
||
|
+ subq $32,%rcx
|
||
|
+ jbe .Leven_tail
|
||
|
+ jmp .Lmod_loop
|
||
|
|
||
|
- movdqa %xmm6,%xmm7
|
||
|
- pshufd $78,%xmm6,%xmm9
|
||
|
- pshufd $78,%xmm2,%xmm10
|
||
|
- pxor %xmm6,%xmm9
|
||
|
- pxor %xmm2,%xmm10
|
||
|
+.align 32
|
||
|
+.Lmod_loop:
|
||
|
+ movdqa %xmm0,%xmm1
|
||
|
+ pshufd $78,%xmm0,%xmm4
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+
|
||
|
+.byte 102,65,15,58,68,193,0
|
||
|
+.byte 102,65,15,58,68,201,17
|
||
|
+.byte 102,65,15,58,68,226,16
|
||
|
+
|
||
|
+ pxor %xmm6,%xmm0
|
||
|
+ pxor %xmm8,%xmm1
|
||
|
+ movdqu (%rdx),%xmm8
|
||
|
+.byte 102,68,15,56,0,197
|
||
|
+ movdqu 16(%rdx),%xmm6
|
||
|
+
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ pxor %xmm1,%xmm3
|
||
|
+ pxor %xmm8,%xmm1
|
||
|
+ pxor %xmm3,%xmm4
|
||
|
+.byte 102,15,56,0,245
|
||
|
+ movdqa %xmm4,%xmm3
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pslldq $8,%xmm4
|
||
|
pxor %xmm3,%xmm1
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
|
||
|
+ movdqa %xmm6,%xmm8
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
movdqa %xmm0,%xmm3
|
||
|
- psllq $1,%xmm0
|
||
|
- pxor %xmm3,%xmm0
|
||
|
psllq $5,%xmm0
|
||
|
- pxor %xmm3,%xmm0
|
||
|
.byte 102,15,58,68,242,0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ psllq $1,%xmm0
|
||
|
+ pxor %xmm3,%xmm0
|
||
|
psllq $57,%xmm0
|
||
|
- movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
pslldq $8,%xmm0
|
||
|
- psrldq $8,%xmm4
|
||
|
- pxor %xmm3,%xmm0
|
||
|
- pxor %xmm4,%xmm1
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+ pshufd $78,%xmm8,%xmm3
|
||
|
+ pxor %xmm8,%xmm3
|
||
|
|
||
|
-.byte 102,15,58,68,250,17
|
||
|
+.byte 102,68,15,58,68,194,17
|
||
|
movdqa %xmm0,%xmm4
|
||
|
- psrlq $5,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
pxor %xmm4,%xmm0
|
||
|
- pxor %xmm1,%xmm4
|
||
|
psrlq $1,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
-
|
||
|
-.byte 102,69,15,58,68,202,0
|
||
|
- movdqa %xmm0,%xmm1
|
||
|
- pshufd $78,%xmm0,%xmm3
|
||
|
- pshufd $78,%xmm8,%xmm4
|
||
|
- pxor %xmm0,%xmm3
|
||
|
- pxor %xmm8,%xmm4
|
||
|
-
|
||
|
- pxor %xmm6,%xmm9
|
||
|
- pxor %xmm7,%xmm9
|
||
|
- movdqa %xmm9,%xmm10
|
||
|
- psrldq $8,%xmm9
|
||
|
- pslldq $8,%xmm10
|
||
|
- pxor %xmm9,%xmm7
|
||
|
- pxor %xmm10,%xmm6
|
||
|
+.byte 102,65,15,58,68,218,0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
|
||
|
leaq 32(%rdx),%rdx
|
||
|
subq $32,%rcx
|
||
|
ja .Lmod_loop
|
||
|
|
||
|
.Leven_tail:
|
||
|
-.byte 102,65,15,58,68,192,0
|
||
|
-.byte 102,65,15,58,68,200,17
|
||
|
-.byte 102,15,58,68,220,0
|
||
|
+ movdqa %xmm0,%xmm1
|
||
|
+ pshufd $78,%xmm0,%xmm4
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+
|
||
|
+.byte 102,65,15,58,68,193,0
|
||
|
+.byte 102,65,15,58,68,201,17
|
||
|
+.byte 102,65,15,58,68,226,16
|
||
|
+
|
||
|
+ pxor %xmm6,%xmm0
|
||
|
+ pxor %xmm8,%xmm1
|
||
|
pxor %xmm0,%xmm3
|
||
|
pxor %xmm1,%xmm3
|
||
|
-
|
||
|
- movdqa %xmm3,%xmm4
|
||
|
+ pxor %xmm3,%xmm4
|
||
|
+ movdqa %xmm4,%xmm3
|
||
|
psrldq $8,%xmm3
|
||
|
pslldq $8,%xmm4
|
||
|
pxor %xmm3,%xmm1
|
||
|
pxor %xmm4,%xmm0
|
||
|
- pxor %xmm6,%xmm0
|
||
|
- pxor %xmm7,%xmm1
|
||
|
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
movdqa %xmm0,%xmm3
|
||
|
- psllq $1,%xmm0
|
||
|
- pxor %xmm3,%xmm0
|
||
|
psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ psllq $1,%xmm0
|
||
|
pxor %xmm3,%xmm0
|
||
|
psllq $57,%xmm0
|
||
|
- movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
pslldq $8,%xmm0
|
||
|
- psrldq $8,%xmm4
|
||
|
- pxor %xmm3,%xmm0
|
||
|
- pxor %xmm4,%xmm1
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
|
||
|
|
||
|
movdqa %xmm0,%xmm4
|
||
|
- psrlq $5,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
pxor %xmm4,%xmm0
|
||
|
- pxor %xmm1,%xmm4
|
||
|
psrlq $1,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
testq %rcx,%rcx
|
||
|
jnz .Ldone
|
||
|
|
||
|
@@ -972,12 +1235,10 @@ gcm_ghash_clmul:
|
||
|
pxor %xmm3,%xmm0
|
||
|
movdqa %xmm0,%xmm1
|
||
|
pshufd $78,%xmm0,%xmm3
|
||
|
- pshufd $78,%xmm2,%xmm4
|
||
|
pxor %xmm0,%xmm3
|
||
|
- pxor %xmm2,%xmm4
|
||
|
.byte 102,15,58,68,194,0
|
||
|
.byte 102,15,58,68,202,17
|
||
|
-.byte 102,15,58,68,220,0
|
||
|
+.byte 102,65,15,58,68,218,0
|
||
|
pxor %xmm0,%xmm3
|
||
|
pxor %xmm1,%xmm3
|
||
|
|
||
|
@@ -987,38 +1248,60 @@ gcm_ghash_clmul:
|
||
|
pxor %xmm3,%xmm1
|
||
|
pxor %xmm4,%xmm0
|
||
|
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
movdqa %xmm0,%xmm3
|
||
|
- psllq $1,%xmm0
|
||
|
- pxor %xmm3,%xmm0
|
||
|
psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ psllq $1,%xmm0
|
||
|
pxor %xmm3,%xmm0
|
||
|
psllq $57,%xmm0
|
||
|
- movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
pslldq $8,%xmm0
|
||
|
- psrldq $8,%xmm4
|
||
|
- pxor %xmm3,%xmm0
|
||
|
- pxor %xmm4,%xmm1
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
|
||
|
|
||
|
movdqa %xmm0,%xmm4
|
||
|
- psrlq $5,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
pxor %xmm4,%xmm0
|
||
|
- pxor %xmm1,%xmm4
|
||
|
psrlq $1,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
.Ldone:
|
||
|
.byte 102,15,56,0,197
|
||
|
movdqu %xmm0,(%rdi)
|
||
|
.byte 0xf3,0xc3
-.LSEH_end_gcm_ghash_clmul:
.size gcm_ghash_clmul,.-gcm_ghash_clmul
+.globl gcm_init_avx
+.type gcm_init_avx,@function
+.align 32
+gcm_init_avx:
+ jmp .L_init_clmul
+.size gcm_init_avx,.-gcm_init_avx
+.globl gcm_gmult_avx
+.type gcm_gmult_avx,@function
+.align 32
+gcm_gmult_avx:
+ jmp .L_gmult_clmul
+.size gcm_gmult_avx,.-gcm_gmult_avx
+.globl gcm_ghash_avx
+.type gcm_ghash_avx,@function
+.align 32
+gcm_ghash_avx:
+ jmp .L_ghash_clmul
+.size gcm_ghash_avx,.-gcm_ghash_avx
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+.long 7,0,7,0
+.L7_mask_poly:
+.long 7,0,450,0
.align 64
.type .Lrem_4bit,@object
.Lrem_4bit:
diff --git a/lib/accelerated/x86/elf/appro-aes-x86-64.s b/lib/accelerated/x86/elf/appro-aes-x86-64.s
index f48666f..d3734a6 100644
--- a/lib/accelerated/x86/elf/appro-aes-x86-64.s
+++ b/lib/accelerated/x86/elf/appro-aes-x86-64.s
@@ -925,199 +925,412 @@ aesni_ccm64_decrypt_blocks:
.type aesni_ctr32_encrypt_blocks,@function
|
||
|
.align 16
|
||
|
aesni_ctr32_encrypt_blocks:
|
||
|
+ leaq (%rsp),%rax
|
||
|
+ pushq %rbp
|
||
|
+ subq $128,%rsp
|
||
|
+ andq $-16,%rsp
|
||
|
+ leaq -8(%rax),%rbp
|
||
|
+
|
||
|
cmpq $1,%rdx
|
||
|
je .Lctr32_one_shortcut
|
||
|
|
||
|
- movdqu (%r8),%xmm14
|
||
|
- movdqa .Lbswap_mask(%rip),%xmm15
|
||
|
- xorl %eax,%eax
|
||
|
-.byte 102,69,15,58,22,242,3
|
||
|
-.byte 102,68,15,58,34,240,3
|
||
|
+ movdqu (%r8),%xmm2
|
||
|
+ movdqu (%rcx),%xmm0
|
||
|
+ movl 12(%r8),%r8d
|
||
|
+ pxor %xmm0,%xmm2
|
||
|
+ movl 12(%rcx),%r11d
|
||
|
+ movdqa %xmm2,0(%rsp)
|
||
|
+ bswapl %r8d
|
||
|
+ movdqa %xmm2,%xmm3
|
||
|
+ movdqa %xmm2,%xmm4
|
||
|
+ movdqa %xmm2,%xmm5
|
||
|
+ movdqa %xmm2,64(%rsp)
|
||
|
+ movdqa %xmm2,80(%rsp)
|
||
|
+ movdqa %xmm2,96(%rsp)
|
||
|
+ movdqa %xmm2,112(%rsp)
|
||
|
|
||
|
movl 240(%rcx),%eax
|
||
|
+
|
||
|
+ leaq 1(%r8),%r9
|
||
|
+ leaq 2(%r8),%r10
|
||
|
+ bswapl %r9d
|
||
|
bswapl %r10d
|
||
|
- pxor %xmm12,%xmm12
|
||
|
- pxor %xmm13,%xmm13
|
||
|
-.byte 102,69,15,58,34,226,0
|
||
|
- leaq 3(%r10),%r11
|
||
|
-.byte 102,69,15,58,34,235,0
|
||
|
- incl %r10d
|
||
|
-.byte 102,69,15,58,34,226,1
|
||
|
- incq %r11
|
||
|
-.byte 102,69,15,58,34,235,1
|
||
|
- incl %r10d
|
||
|
-.byte 102,69,15,58,34,226,2
|
||
|
- incq %r11
|
||
|
-.byte 102,69,15,58,34,235,2
|
||
|
- movdqa %xmm12,-40(%rsp)
|
||
|
-.byte 102,69,15,56,0,231
|
||
|
- movdqa %xmm13,-24(%rsp)
|
||
|
-.byte 102,69,15,56,0,239
|
||
|
-
|
||
|
- pshufd $192,%xmm12,%xmm2
|
||
|
- pshufd $128,%xmm12,%xmm3
|
||
|
- pshufd $64,%xmm12,%xmm4
|
||
|
- cmpq $6,%rdx
|
||
|
- jb .Lctr32_tail
|
||
|
- shrl $1,%eax
|
||
|
- movq %rcx,%r11
|
||
|
- movl %eax,%r10d
|
||
|
- subq $6,%rdx
|
||
|
- jmp .Lctr32_loop6
|
||
|
+ xorl %r11d,%r9d
|
||
|
+ xorl %r11d,%r10d
|
||
|
+.byte 102,65,15,58,34,217,3
|
||
|
+ leaq 3(%r8),%r9
|
||
|
+ movdqa %xmm3,16(%rsp)
|
||
|
+.byte 102,65,15,58,34,226,3
|
||
|
+ bswapl %r9d
|
||
|
+ leaq 4(%r8),%r10
|
||
|
+ movdqa %xmm4,32(%rsp)
|
||
|
+ xorl %r11d,%r9d
|
||
|
+ bswapl %r10d
|
||
|
+.byte 102,65,15,58,34,233,3
|
||
|
+ xorl %r11d,%r10d
|
||
|
+ movdqa %xmm5,48(%rsp)
|
||
|
+ leaq 5(%r8),%r9
|
||
|
+ movl %r10d,64+12(%rsp)
|
||
|
+ bswapl %r9d
|
||
|
+ leaq 6(%r8),%r10
|
||
|
+ xorl %r11d,%r9d
|
||
|
+ bswapl %r10d
|
||
|
+ movl %r9d,80+12(%rsp)
|
||
|
+ xorl %r11d,%r10d
|
||
|
+ leaq 7(%r8),%r9
|
||
|
+ movl %r10d,96+12(%rsp)
|
||
|
+ bswapl %r9d
|
||
|
+ xorl %r11d,%r9d
|
||
|
+ movl %r9d,112+12(%rsp)
|
||
|
|
||
|
-.align 16
|
||
|
-.Lctr32_loop6:
|
||
|
- pshufd $192,%xmm13,%xmm5
|
||
|
- por %xmm14,%xmm2
|
||
|
- movups (%r11),%xmm0
|
||
|
- pshufd $128,%xmm13,%xmm6
|
||
|
- por %xmm14,%xmm3
|
||
|
- movups 16(%r11),%xmm1
|
||
|
- pshufd $64,%xmm13,%xmm7
|
||
|
- por %xmm14,%xmm4
|
||
|
- por %xmm14,%xmm5
|
||
|
- xorps %xmm0,%xmm2
|
||
|
- por %xmm14,%xmm6
|
||
|
- por %xmm14,%xmm7
|
||
|
+ movups 16(%rcx),%xmm1
|
||
|
|
||
|
+ movdqa 64(%rsp),%xmm6
|
||
|
+ movdqa 80(%rsp),%xmm7
|
||
|
|
||
|
+ cmpq $8,%rdx
|
||
|
+ jb .Lctr32_tail
|
||
|
|
||
|
+ leaq 128(%rcx),%rcx
|
||
|
+ subq $8,%rdx
|
||
|
+ jmp .Lctr32_loop8
|
||
|
|
||
|
- pxor %xmm0,%xmm3
|
||
|
+.align 32
|
||
|
+.Lctr32_loop8:
|
||
|
+ addl $8,%r8d
|
||
|
+ movdqa 96(%rsp),%xmm8
|
||
|
.byte 102,15,56,220,209
|
||
|
- leaq 32(%r11),%rcx
|
||
|
- pxor %xmm0,%xmm4
|
||
|
+ movl %r8d,%r9d
|
||
|
+ movdqa 112(%rsp),%xmm9
|
||
|
.byte 102,15,56,220,217
|
||
|
- movdqa .Lincrement32(%rip),%xmm13
|
||
|
- pxor %xmm0,%xmm5
|
||
|
+ bswapl %r9d
|
||
|
+ movups 32-128(%rcx),%xmm0
|
||
|
.byte 102,15,56,220,225
|
||
|
- movdqa -40(%rsp),%xmm12
|
||
|
- pxor %xmm0,%xmm6
|
||
|
+ xorl %r11d,%r9d
|
||
|
.byte 102,15,56,220,233
|
||
|
- pxor %xmm0,%xmm7
|
||
|
- movups (%rcx),%xmm0
|
||
|
- decl %eax
|
||
|
+ movl %r9d,0+12(%rsp)
|
||
|
+ leaq 1(%r8),%r9
|
||
|
.byte 102,15,56,220,241
|
||
|
.byte 102,15,56,220,249
|
||
|
- jmp .Lctr32_enc_loop6_enter
|
||
|
-.align 16
|
||
|
-.Lctr32_enc_loop6:
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 48-128(%rcx),%xmm1
|
||
|
+.byte 102,15,56,220,208
|
||
|
+.byte 102,15,56,220,216
|
||
|
+ bswapl %r9d
|
||
|
+.byte 102,15,56,220,224
|
||
|
+ xorl %r11d,%r9d
|
||
|
+.byte 102,15,56,220,232
|
||
|
+ movl %r9d,16+12(%rsp)
|
||
|
+ leaq 2(%r8),%r9
|
||
|
+.byte 102,15,56,220,240
|
||
|
+.byte 102,15,56,220,248
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 64-128(%rcx),%xmm0
|
||
|
.byte 102,15,56,220,209
|
||
|
.byte 102,15,56,220,217
|
||
|
- decl %eax
|
||
|
+ bswapl %r9d
|
||
|
.byte 102,15,56,220,225
|
||
|
+ xorl %r11d,%r9d
|
||
|
.byte 102,15,56,220,233
|
||
|
+ movl %r9d,32+12(%rsp)
|
||
|
+ leaq 3(%r8),%r9
|
||
|
.byte 102,15,56,220,241
|
||
|
.byte 102,15,56,220,249
|
||
|
-.Lctr32_enc_loop6_enter:
|
||
|
- movups 16(%rcx),%xmm1
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 80-128(%rcx),%xmm1
|
||
|
.byte 102,15,56,220,208
|
||
|
.byte 102,15,56,220,216
|
||
|
- leaq 32(%rcx),%rcx
|
||
|
+ bswapl %r9d
|
||
|
.byte 102,15,56,220,224
|
||
|
+ xorl %r11d,%r9d
|
||
|
.byte 102,15,56,220,232
|
||
|
+ movl %r9d,48+12(%rsp)
|
||
|
+ leaq 4(%r8),%r9
|
||
|
.byte 102,15,56,220,240
|
||
|
.byte 102,15,56,220,248
|
||
|
- movups (%rcx),%xmm0
|
||
|
- jnz .Lctr32_enc_loop6
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 96-128(%rcx),%xmm0
|
||
|
+.byte 102,15,56,220,209
|
||
|
+.byte 102,15,56,220,217
|
||
|
+ bswapl %r9d
|
||
|
+.byte 102,15,56,220,225
|
||
|
+ xorl %r11d,%r9d
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ movl %r9d,64+12(%rsp)
|
||
|
+ leaq 5(%r8),%r9
|
||
|
+.byte 102,15,56,220,241
|
||
|
+.byte 102,15,56,220,249
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 112-128(%rcx),%xmm1
|
||
|
+.byte 102,15,56,220,208
|
||
|
+.byte 102,15,56,220,216
|
||
|
+ bswapl %r9d
|
||
|
+.byte 102,15,56,220,224
|
||
|
+ xorl %r11d,%r9d
|
||
|
+.byte 102,15,56,220,232
|
||
|
+ movl %r9d,80+12(%rsp)
|
||
|
+ leaq 6(%r8),%r9
|
||
|
+.byte 102,15,56,220,240
|
||
|
+.byte 102,15,56,220,248
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 128-128(%rcx),%xmm0
|
||
|
+.byte 102,15,56,220,209
|
||
|
+.byte 102,15,56,220,217
|
||
|
+ bswapl %r9d
|
||
|
+.byte 102,15,56,220,225
|
||
|
+ xorl %r11d,%r9d
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ movl %r9d,96+12(%rsp)
|
||
|
+ leaq 7(%r8),%r9
|
||
|
+.byte 102,15,56,220,241
|
||
|
+.byte 102,15,56,220,249
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 144-128(%rcx),%xmm1
|
||
|
+.byte 102,15,56,220,208
|
||
|
+.byte 102,15,56,220,216
|
||
|
+ bswapl %r9d
|
||
|
+.byte 102,15,56,220,224
|
||
|
+ xorl %r11d,%r9d
|
||
|
+.byte 102,15,56,220,232
|
||
|
+ movl %r9d,112+12(%rsp)
|
||
|
+.byte 102,15,56,220,240
|
||
|
+.byte 102,15,56,220,248
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+ movdqu 0(%rdi),%xmm10
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 160-128(%rcx),%xmm0
|
||
|
+
|
||
|
+ cmpl $11,%eax
|
||
|
+ jb .Lctr32_enc_done
|
||
|
|
||
|
.byte 102,15,56,220,209
|
||
|
- paddd %xmm13,%xmm12
|
||
|
.byte 102,15,56,220,217
|
||
|
- paddd -24(%rsp),%xmm13
|
||
|
.byte 102,15,56,220,225
|
||
|
- movdqa %xmm12,-40(%rsp)
|
||
|
.byte 102,15,56,220,233
|
||
|
- movdqa %xmm13,-24(%rsp)
|
||
|
.byte 102,15,56,220,241
|
||
|
-.byte 102,69,15,56,0,231
|
||
|
.byte 102,15,56,220,249
|
||
|
-.byte 102,69,15,56,0,239
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 176-128(%rcx),%xmm1
|
||
|
|
||
|
-.byte 102,15,56,221,208
|
||
|
- movups (%rdi),%xmm8
|
||
|
-.byte 102,15,56,221,216
|
||
|
- movups 16(%rdi),%xmm9
|
||
|
-.byte 102,15,56,221,224
|
||
|
- movups 32(%rdi),%xmm10
|
||
|
-.byte 102,15,56,221,232
|
||
|
- movups 48(%rdi),%xmm11
|
||
|
-.byte 102,15,56,221,240
|
||
|
- movups 64(%rdi),%xmm1
|
||
|
-.byte 102,15,56,221,248
|
||
|
- movups 80(%rdi),%xmm0
|
||
|
- leaq 96(%rdi),%rdi
|
||
|
+.byte 102,15,56,220,208
|
||
|
+.byte 102,15,56,220,216
|
||
|
+.byte 102,15,56,220,224
|
||
|
+.byte 102,15,56,220,232
|
||
|
+.byte 102,15,56,220,240
|
||
|
+.byte 102,15,56,220,248
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 192-128(%rcx),%xmm0
|
||
|
+ je .Lctr32_enc_done
|
||
|
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- pshufd $192,%xmm12,%xmm2
|
||
|
- xorps %xmm3,%xmm9
|
||
|
- pshufd $128,%xmm12,%xmm3
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- xorps %xmm4,%xmm10
|
||
|
- pshufd $64,%xmm12,%xmm4
|
||
|
- movups %xmm9,16(%rsi)
|
||
|
- xorps %xmm5,%xmm11
|
||
|
- movups %xmm10,32(%rsi)
|
||
|
- xorps %xmm6,%xmm1
|
||
|
- movups %xmm11,48(%rsi)
|
||
|
- xorps %xmm7,%xmm0
|
||
|
- movups %xmm1,64(%rsi)
|
||
|
- movups %xmm0,80(%rsi)
|
||
|
- leaq 96(%rsi),%rsi
|
||
|
- movl %r10d,%eax
|
||
|
- subq $6,%rdx
|
||
|
- jnc .Lctr32_loop6
|
||
|
+.byte 102,15,56,220,209
|
||
|
+.byte 102,15,56,220,217
|
||
|
+.byte 102,15,56,220,225
|
||
|
+.byte 102,15,56,220,233
|
||
|
+.byte 102,15,56,220,241
|
||
|
+.byte 102,15,56,220,249
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 208-128(%rcx),%xmm1
|
||
|
+
|
||
|
+.byte 102,15,56,220,208
|
||
|
+.byte 102,15,56,220,216
|
||
|
+.byte 102,15,56,220,224
|
||
|
+.byte 102,15,56,220,232
|
||
|
+.byte 102,15,56,220,240
|
||
|
+.byte 102,15,56,220,248
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 224-128(%rcx),%xmm0
|
||
|
+
|
||
|
+.Lctr32_enc_done:
|
||
|
+ movdqu 16(%rdi),%xmm11
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ movdqu 32(%rdi),%xmm12
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
+ movdqu 48(%rdi),%xmm13
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
+ movdqu 64(%rdi),%xmm14
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
+ movdqu 80(%rdi),%xmm15
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ pxor %xmm0,%xmm15
|
||
|
+.byte 102,15,56,220,217
|
||
|
+.byte 102,15,56,220,225
|
||
|
+.byte 102,15,56,220,233
|
||
|
+.byte 102,15,56,220,241
|
||
|
+.byte 102,15,56,220,249
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movdqu 96(%rdi),%xmm1
|
||
|
+
|
||
|
+.byte 102,65,15,56,221,210
|
||
|
+ pxor %xmm0,%xmm1
|
||
|
+ movdqu 112(%rdi),%xmm10
|
||
|
+ leaq 128(%rdi),%rdi
|
||
|
+.byte 102,65,15,56,221,219
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ movdqa 0(%rsp),%xmm11
|
||
|
+.byte 102,65,15,56,221,228
|
||
|
+ movdqa 16(%rsp),%xmm12
|
||
|
+.byte 102,65,15,56,221,237
|
||
|
+ movdqa 32(%rsp),%xmm13
|
||
|
+.byte 102,65,15,56,221,246
|
||
|
+ movdqa 48(%rsp),%xmm14
|
||
|
+.byte 102,65,15,56,221,255
|
||
|
+ movdqa 64(%rsp),%xmm15
|
||
|
+.byte 102,68,15,56,221,193
|
||
|
+ movdqa 80(%rsp),%xmm0
|
||
|
+.byte 102,69,15,56,221,202
|
||
|
+ movups 16-128(%rcx),%xmm1
|
||
|
+
|
||
|
+ movups %xmm2,(%rsi)
|
||
|
+ movdqa %xmm11,%xmm2
|
||
|
+ movups %xmm3,16(%rsi)
|
||
|
+ movdqa %xmm12,%xmm3
|
||
|
+ movups %xmm4,32(%rsi)
|
||
|
+ movdqa %xmm13,%xmm4
|
||
|
+ movups %xmm5,48(%rsi)
|
||
|
+ movdqa %xmm14,%xmm5
|
||
|
+ movups %xmm6,64(%rsi)
|
||
|
+ movdqa %xmm15,%xmm6
|
||
|
+ movups %xmm7,80(%rsi)
|
||
|
+ movdqa %xmm0,%xmm7
|
||
|
+ movups %xmm8,96(%rsi)
|
||
|
+ movups %xmm9,112(%rsi)
|
||
|
+ leaq 128(%rsi),%rsi
|
||
|
+
|
||
|
+ subq $8,%rdx
|
||
|
+ jnc .Lctr32_loop8
|
||
|
|
||
|
- addq $6,%rdx
|
||
|
+ addq $8,%rdx
|
||
|
jz .Lctr32_done
|
||
|
- movq %r11,%rcx
|
||
|
- leal 1(%rax,%rax,1),%eax
|
||
|
+ leaq -128(%rcx),%rcx
|
||
|
|
||
|
.Lctr32_tail:
|
||
|
- por %xmm14,%xmm2
|
||
|
- movups (%rdi),%xmm8
|
||
|
- cmpq $2,%rdx
|
||
|
- jb .Lctr32_one
|
||
|
+ leaq 16(%rcx),%rcx
|
||
|
+ cmpq $4,%rdx
|
||
|
+ jb .Lctr32_loop3
|
||
|
+ je .Lctr32_loop4
|
||
|
|
||
|
- por %xmm14,%xmm3
|
||
|
- movups 16(%rdi),%xmm9
|
||
|
- je .Lctr32_two
|
||
|
+ movdqa 96(%rsp),%xmm8
|
||
|
+ pxor %xmm9,%xmm9
|
||
|
|
||
|
- pshufd $192,%xmm13,%xmm5
|
||
|
- por %xmm14,%xmm4
|
||
|
- movups 32(%rdi),%xmm10
|
||
|
- cmpq $4,%rdx
|
||
|
- jb .Lctr32_three
|
||
|
+ movups 16(%rcx),%xmm0
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ leaq 16(%rcx),%rcx
|
||
|
+.byte 102,15,56,220,217
|
||
|
+ shrl $1,%eax
|
||
|
+.byte 102,15,56,220,225
|
||
|
+ decl %eax
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ movups (%rdi),%xmm10
|
||
|
+.byte 102,15,56,220,241
|
||
|
+ movups 16(%rdi),%xmm11
|
||
|
+.byte 102,15,56,220,249
|
||
|
+ movups 32(%rdi),%xmm12
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+ movups 16(%rcx),%xmm1
|
||
|
|
||
|
- pshufd $128,%xmm13,%xmm6
|
||
|
- por %xmm14,%xmm5
|
||
|
- movups 48(%rdi),%xmm11
|
||
|
- je .Lctr32_four
|
||
|
+ call .Lenc_loop8_enter
|
||
|
|
||
|
- por %xmm14,%xmm6
|
||
|
- xorps %xmm7,%xmm7
|
||
|
+ movdqu 48(%rdi),%xmm13
|
||
|
+ pxor %xmm10,%xmm2
|
||
|
+ movdqu 64(%rdi),%xmm10
|
||
|
+ pxor %xmm11,%xmm3
|
||
|
+ movdqu %xmm2,(%rsi)
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm3,16(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ pxor %xmm10,%xmm6
|
||
|
+ movdqu %xmm5,48(%rsi)
|
||
|
+ movdqu %xmm6,64(%rsi)
|
||
|
+ cmpq $6,%rdx
|
||
|
+ jb .Lctr32_done
|
||
|
|
||
|
- call _aesni_encrypt6
|
||
|
+ movups 80(%rdi),%xmm11
|
||
|
+ xorps %xmm11,%xmm7
|
||
|
+ movups %xmm7,80(%rsi)
|
||
|
+ je .Lctr32_done
|
||
|
|
||
|
- movups 64(%rdi),%xmm1
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- xorps %xmm3,%xmm9
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- xorps %xmm4,%xmm10
|
||
|
- movups %xmm9,16(%rsi)
|
||
|
- xorps %xmm5,%xmm11
|
||
|
- movups %xmm10,32(%rsi)
|
||
|
- xorps %xmm6,%xmm1
|
||
|
- movups %xmm11,48(%rsi)
|
||
|
- movups %xmm1,64(%rsi)
|
||
|
+ movups 96(%rdi),%xmm12
|
||
|
+ xorps %xmm12,%xmm8
|
||
|
+ movups %xmm8,96(%rsi)
|
||
|
+ jmp .Lctr32_done
|
||
|
+
|
||
|
+.align 32
|
||
|
+.Lctr32_loop4:
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ leaq 16(%rcx),%rcx
|
||
|
+.byte 102,15,56,220,217
|
||
|
+.byte 102,15,56,220,225
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ movups (%rcx),%xmm1
|
||
|
+ decl %eax
|
||
|
+ jnz .Lctr32_loop4
|
||
|
+.byte 102,15,56,221,209
|
||
|
+ movups (%rdi),%xmm10
|
||
|
+.byte 102,15,56,221,217
|
||
|
+ movups 16(%rdi),%xmm11
|
||
|
+.byte 102,15,56,221,225
|
||
|
+ movups 32(%rdi),%xmm12
|
||
|
+.byte 102,15,56,221,233
|
||
|
+ movups 48(%rdi),%xmm13
|
||
|
+
|
||
|
+ xorps %xmm10,%xmm2
|
||
|
+ movups %xmm2,(%rsi)
|
||
|
+ xorps %xmm11,%xmm3
|
||
|
+ movups %xmm3,16(%rsi)
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm5,48(%rsi)
|
||
|
+ jmp .Lctr32_done
|
||
|
+
|
||
|
+.align 32
|
||
|
+.Lctr32_loop3:
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ leaq 16(%rcx),%rcx
|
||
|
+.byte 102,15,56,220,217
|
||
|
+.byte 102,15,56,220,225
|
||
|
+ movups (%rcx),%xmm1
|
||
|
+ decl %eax
|
||
|
+ jnz .Lctr32_loop3
|
||
|
+.byte 102,15,56,221,209
|
||
|
+.byte 102,15,56,221,217
|
||
|
+.byte 102,15,56,221,225
|
||
|
+
|
||
|
+ movups (%rdi),%xmm10
|
||
|
+ xorps %xmm10,%xmm2
|
||
|
+ movups %xmm2,(%rsi)
|
||
|
+ cmpq $2,%rdx
|
||
|
+ jb .Lctr32_done
|
||
|
+
|
||
|
+ movups 16(%rdi),%xmm11
|
||
|
+ xorps %xmm11,%xmm3
|
||
|
+ movups %xmm3,16(%rsi)
|
||
|
+ je .Lctr32_done
|
||
|
+
|
||
|
+ movups 32(%rdi),%xmm12
|
||
|
+ xorps %xmm12,%xmm4
|
||
|
+ movups %xmm4,32(%rsi)
|
||
|
jmp .Lctr32_done
|
||
|
|
||
|
.align 16
|
||
|
.Lctr32_one_shortcut:
|
||
|
movups (%r8),%xmm2
|
||
|
- movups (%rdi),%xmm8
|
||
|
+ movups (%rdi),%xmm10
|
||
|
movl 240(%rcx),%eax
|
||
|
-.Lctr32_one:
|
||
|
movups (%rcx),%xmm0
|
||
|
movups 16(%rcx),%xmm1
|
||
|
leaq 32(%rcx),%rcx
|
||
|
@@ -1129,51 +1342,26 @@ aesni_ctr32_encrypt_blocks:
|
||
|
leaq 16(%rcx),%rcx
|
||
|
jnz .Loop_enc1_7
|
||
|
.byte 102,15,56,221,209
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- jmp .Lctr32_done
|
||
|
-
|
||
|
-.align 16
|
||
|
-.Lctr32_two:
|
||
|
- xorps %xmm4,%xmm4
|
||
|
- call _aesni_encrypt3
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- xorps %xmm3,%xmm9
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- movups %xmm9,16(%rsi)
|
||
|
- jmp .Lctr32_done
|
||
|
-
|
||
|
-.align 16
|
||
|
-.Lctr32_three:
|
||
|
- call _aesni_encrypt3
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- xorps %xmm3,%xmm9
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- xorps %xmm4,%xmm10
|
||
|
- movups %xmm9,16(%rsi)
|
||
|
- movups %xmm10,32(%rsi)
|
||
|
+ xorps %xmm10,%xmm2
|
||
|
+ movups %xmm2,(%rsi)
|
||
|
jmp .Lctr32_done
|
||
|
|
||
|
.align 16
|
||
|
-.Lctr32_four:
|
||
|
- call _aesni_encrypt4
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- xorps %xmm3,%xmm9
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- xorps %xmm4,%xmm10
|
||
|
- movups %xmm9,16(%rsi)
|
||
|
- xorps %xmm5,%xmm11
|
||
|
- movups %xmm10,32(%rsi)
|
||
|
- movups %xmm11,48(%rsi)
|
||
|
-
|
||
|
.Lctr32_done:
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ popq %rbp
|
||
|
+.Lctr32_epilogue:
|
||
|
.byte 0xf3,0xc3
|
||
|
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
|
||
|
.globl aesni_xts_encrypt
|
||
|
.type aesni_xts_encrypt,@function
|
||
|
.align 16
|
||
|
aesni_xts_encrypt:
|
||
|
- leaq -104(%rsp),%rsp
|
||
|
+ leaq (%rsp),%rax
|
||
|
+ pushq %rbp
|
||
|
+ subq $112,%rsp
|
||
|
+ andq $-16,%rsp
|
||
|
+ leaq -8(%rax),%rbp
|
||
|
movups (%r9),%xmm15
|
||
|
movl 240(%r8),%eax
|
||
|
movl 240(%rcx),%r10d
|
||
|
@@ -1188,228 +1376,266 @@ aesni_xts_encrypt:
|
||
|
leaq 16(%r8),%r8
|
||
|
jnz .Loop_enc1_8
|
||
|
.byte 102,68,15,56,221,249
|
||
|
+ movups (%rcx),%xmm0
|
||
|
movq %rcx,%r11
|
||
|
movl %r10d,%eax
|
||
|
+ shll $4,%r10d
|
||
|
movq %rdx,%r9
|
||
|
andq $-16,%rdx
|
||
|
|
||
|
+ movups 16(%rcx,%r10,1),%xmm1
|
||
|
+ movl %eax,%r10d
|
||
|
+
|
||
|
movdqa .Lxts_magic(%rip),%xmm8
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pshufd $95,%xmm15,%xmm9
|
||
|
+ pxor %xmm0,%xmm1
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm10
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm11
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm12
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm13
|
||
|
+ psrad $31,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm15,%xmm14
|
||
|
+ psrad $31,%xmm9
|
||
|
paddq %xmm15,%xmm15
|
||
|
pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
pxor %xmm9,%xmm15
|
||
|
+ movaps %xmm1,96(%rsp)
|
||
|
+
|
||
|
subq $96,%rdx
|
||
|
jc .Lxts_enc_short
|
||
|
|
||
|
shrl $1,%eax
|
||
|
- subl $1,%eax
|
||
|
+ subl $3,%eax
|
||
|
+ movups 16(%r11),%xmm1
|
||
|
movl %eax,%r10d
|
||
|
+ leaq .Lxts_magic(%rip),%r8
|
||
|
jmp .Lxts_enc_grandloop
|
||
|
|
||
|
-.align 16
|
||
|
+.align 32
|
||
|
.Lxts_enc_grandloop:
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- movdqa %xmm15,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
movdqu 0(%rdi),%xmm2
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ movdqa %xmm0,%xmm8
|
||
|
movdqu 16(%rdi),%xmm3
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-
|
||
|
- movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm10,%xmm2
|
||
|
- movdqu 48(%rdi),%xmm5
|
||
|
+ movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm11,%xmm3
|
||
|
- movdqu 64(%rdi),%xmm6
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ movdqu 48(%rdi),%xmm5
|
||
|
pxor %xmm12,%xmm4
|
||
|
- movdqu 80(%rdi),%xmm7
|
||
|
- leaq 96(%rdi),%rdi
|
||
|
+.byte 102,15,56,220,217
|
||
|
+ movdqu 64(%rdi),%xmm6
|
||
|
pxor %xmm13,%xmm5
|
||
|
- movups (%r11),%xmm0
|
||
|
+.byte 102,15,56,220,225
|
||
|
+ movdqu 80(%rdi),%xmm7
|
||
|
+ pxor %xmm15,%xmm8
|
||
|
+ movdqa 96(%rsp),%xmm9
|
||
|
pxor %xmm14,%xmm6
|
||
|
- pxor %xmm15,%xmm7
|
||
|
-
|
||
|
-
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ movups 32(%r11),%xmm0
|
||
|
+ leaq 96(%rdi),%rdi
|
||
|
+ pxor %xmm8,%xmm7
|
||
|
|
||
|
- movups 16(%r11),%xmm1
|
||
|
- pxor %xmm0,%xmm2
|
||
|
- pxor %xmm0,%xmm3
|
||
|
+ pxor %xmm9,%xmm10
|
||
|
+.byte 102,15,56,220,241
|
||
|
+ pxor %xmm9,%xmm11
|
||
|
movdqa %xmm10,0(%rsp)
|
||
|
-.byte 102,15,56,220,209
|
||
|
- leaq 32(%r11),%rcx
|
||
|
- pxor %xmm0,%xmm4
|
||
|
+.byte 102,15,56,220,249
|
||
|
+ movups 48(%r11),%xmm1
|
||
|
+
|
||
|
+.byte 102,15,56,220,208
|
||
|
+ pxor %xmm9,%xmm12
|
||
|
movdqa %xmm11,16(%rsp)
|
||
|
-.byte 102,15,56,220,217
|
||
|
- pxor %xmm0,%xmm5
|
||
|
+.byte 102,15,56,220,216
|
||
|
+ pxor %xmm9,%xmm13
|
||
|
movdqa %xmm12,32(%rsp)
|
||
|
-.byte 102,15,56,220,225
|
||
|
- pxor %xmm0,%xmm6
|
||
|
- movdqa %xmm13,48(%rsp)
|
||
|
-.byte 102,15,56,220,233
|
||
|
- pxor %xmm0,%xmm7
|
||
|
- movups (%rcx),%xmm0
|
||
|
- decl %eax
|
||
|
+.byte 102,15,56,220,224
|
||
|
+ pxor %xmm9,%xmm14
|
||
|
+.byte 102,15,56,220,232
|
||
|
+ pxor %xmm9,%xmm8
|
||
|
movdqa %xmm14,64(%rsp)
|
||
|
-.byte 102,15,56,220,241
|
||
|
- movdqa %xmm15,80(%rsp)
|
||
|
-.byte 102,15,56,220,249
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- jmp .Lxts_enc_loop6_enter
|
||
|
-
|
||
|
-.align 16
|
||
|
+.byte 102,15,56,220,240
|
||
|
+ movdqa %xmm8,80(%rsp)
|
||
|
+.byte 102,15,56,220,248
|
||
|
+ movups 64(%r11),%xmm0
|
||
|
+ leaq 64(%r11),%rcx
|
||
|
+ pshufd $95,%xmm15,%xmm9
|
||
|
+ jmp .Lxts_enc_loop6
|
||
|
+.align 32
|
||
|
.Lxts_enc_loop6:
|
||
|
.byte 102,15,56,220,209
|
||
|
.byte 102,15,56,220,217
|
||
|
- decl %eax
|
||
|
.byte 102,15,56,220,225
|
||
|
.byte 102,15,56,220,233
|
||
|
.byte 102,15,56,220,241
|
||
|
.byte 102,15,56,220,249
|
||
|
-.Lxts_enc_loop6_enter:
|
||
|
movups 16(%rcx),%xmm1
|
||
|
+ leaq 32(%rcx),%rcx
|
||
|
+
|
||
|
.byte 102,15,56,220,208
|
||
|
.byte 102,15,56,220,216
|
||
|
- leaq 32(%rcx),%rcx
|
||
|
.byte 102,15,56,220,224
|
||
|
.byte 102,15,56,220,232
|
||
|
.byte 102,15,56,220,240
|
||
|
.byte 102,15,56,220,248
|
||
|
movups (%rcx),%xmm0
|
||
|
+ decl %eax
|
||
|
jnz .Lxts_enc_loop6
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa (%r8),%xmm8
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,220,209
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,220,217
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ movups (%r11),%xmm10
|
||
|
.byte 102,15,56,220,225
|
||
|
- pxor %xmm9,%xmm15
|
||
|
.byte 102,15,56,220,233
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,220,241
|
||
|
+ movaps %xmm10,%xmm11
|
||
|
.byte 102,15,56,220,249
|
||
|
movups 16(%rcx),%xmm1
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm10
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,220,208
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ pxor %xmm15,%xmm10
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,220,216
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
.byte 102,15,56,220,224
|
||
|
- pxor %xmm9,%xmm15
|
||
|
.byte 102,15,56,220,232
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,220,240
|
||
|
+ movaps %xmm11,%xmm12
|
||
|
.byte 102,15,56,220,248
|
||
|
movups 32(%rcx),%xmm0
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm11
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,220,209
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ pxor %xmm15,%xmm11
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,220,217
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
.byte 102,15,56,220,225
|
||
|
- pxor %xmm9,%xmm15
|
||
|
+ movdqa %xmm13,48(%rsp)
|
||
|
.byte 102,15,56,220,233
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,220,241
|
||
|
+ movaps %xmm12,%xmm13
|
||
|
.byte 102,15,56,220,249
|
||
|
+ movups 48(%rcx),%xmm1
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm12
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
+.byte 102,15,56,220,208
|
||
|
+ pxor %xmm15,%xmm12
|
||
|
+ psrad $31,%xmm14
|
||
|
+.byte 102,15,56,220,216
|
||
|
paddq %xmm15,%xmm15
|
||
|
-.byte 102,15,56,221,208
|
||
|
- pand %xmm8,%xmm9
|
||
|
-.byte 102,15,56,221,216
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
-.byte 102,15,56,221,224
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-.byte 102,15,56,221,232
|
||
|
-.byte 102,15,56,221,240
|
||
|
-.byte 102,15,56,221,248
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+.byte 102,15,56,220,224
|
||
|
+.byte 102,15,56,220,232
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+.byte 102,15,56,220,240
|
||
|
+ movaps %xmm13,%xmm14
|
||
|
+.byte 102,15,56,220,248
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm13
|
||
|
+ movdqa %xmm9,%xmm0
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ pxor %xmm15,%xmm13
|
||
|
+ psrad $31,%xmm0
|
||
|
+.byte 102,15,56,220,217
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm0
|
||
|
+.byte 102,15,56,220,225
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ pxor %xmm0,%xmm15
|
||
|
+ movups (%r11),%xmm0
|
||
|
+.byte 102,15,56,220,241
|
||
|
+.byte 102,15,56,220,249
|
||
|
+ movups 16(%r11),%xmm1
|
||
|
+
|
||
|
+ pxor %xmm15,%xmm14
|
||
|
+ psrad $31,%xmm9
|
||
|
+.byte 102,15,56,221,84,36,0
|
||
|
paddq %xmm15,%xmm15
|
||
|
- xorps 0(%rsp),%xmm2
|
||
|
pand %xmm8,%xmm9
|
||
|
- xorps 16(%rsp),%xmm3
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+.byte 102,15,56,221,92,36,16
|
||
|
+.byte 102,15,56,221,100,36,32
|
||
|
pxor %xmm9,%xmm15
|
||
|
-
|
||
|
- xorps 32(%rsp),%xmm4
|
||
|
- movups %xmm2,0(%rsi)
|
||
|
- xorps 48(%rsp),%xmm5
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- xorps 64(%rsp),%xmm6
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- xorps 80(%rsp),%xmm7
|
||
|
- movups %xmm5,48(%rsi)
|
||
|
+.byte 102,15,56,221,108,36,48
|
||
|
+.byte 102,15,56,221,116,36,64
|
||
|
+.byte 102,15,56,221,124,36,80
|
||
|
movl %r10d,%eax
|
||
|
- movups %xmm6,64(%rsi)
|
||
|
- movups %xmm7,80(%rsi)
|
||
|
+
|
||
|
leaq 96(%rsi),%rsi
|
||
|
+ movups %xmm2,-96(%rsi)
|
||
|
+ movups %xmm3,-80(%rsi)
|
||
|
+ movups %xmm4,-64(%rsi)
|
||
|
+ movups %xmm5,-48(%rsi)
|
||
|
+ movups %xmm6,-32(%rsi)
|
||
|
+ movups %xmm7,-16(%rsi)
|
||
|
subq $96,%rdx
|
||
|
jnc .Lxts_enc_grandloop
|
||
|
|
||
|
- leal 3(%rax,%rax,1),%eax
|
||
|
+ leal 7(%rax,%rax,1),%eax
|
||
|
movq %r11,%rcx
|
||
|
movl %eax,%r10d
|
||
|
|
||
|
.Lxts_enc_short:
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
addq $96,%rdx
|
||
|
jz .Lxts_enc_done
|
||
|
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
cmpq $32,%rdx
|
||
|
jb .Lxts_enc_one
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
je .Lxts_enc_two
|
||
|
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
cmpq $64,%rdx
|
||
|
jb .Lxts_enc_three
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
je .Lxts_enc_four
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- movdqa %xmm15,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
movdqu (%rdi),%xmm2
|
||
|
- pand %xmm8,%xmm9
|
||
|
movdqu 16(%rdi),%xmm3
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-
|
||
|
movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm10,%xmm2
|
||
|
movdqu 48(%rdi),%xmm5
|
||
|
@@ -1512,15 +1738,15 @@ aesni_xts_encrypt:
|
||
|
|
||
|
call _aesni_encrypt4
|
||
|
|
||
|
- xorps %xmm10,%xmm2
|
||
|
- movdqa %xmm15,%xmm10
|
||
|
- xorps %xmm11,%xmm3
|
||
|
- xorps %xmm12,%xmm4
|
||
|
- movups %xmm2,(%rsi)
|
||
|
- xorps %xmm13,%xmm5
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- movups %xmm5,48(%rsi)
|
||
|
+ pxor %xmm10,%xmm2
|
||
|
+ movdqa %xmm14,%xmm10
|
||
|
+ pxor %xmm11,%xmm3
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm2,(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm3,16(%rsi)
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ movdqu %xmm5,48(%rsi)
|
||
|
leaq 64(%rsi),%rsi
|
||
|
jmp .Lxts_enc_done
|
||
|
|
||
|
@@ -1561,7 +1787,8 @@ aesni_xts_encrypt:
|
||
|
movups %xmm2,-16(%rsi)
|
||
|
|
||
|
.Lxts_enc_ret:
|
||
|
- leaq 104(%rsp),%rsp
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ popq %rbp
|
||
|
.Lxts_enc_epilogue:
|
||
|
.byte 0xf3,0xc3
|
||
|
.size aesni_xts_encrypt,.-aesni_xts_encrypt
|
||
|
@@ -1569,7 +1796,11 @@ aesni_xts_encrypt:
|
||
|
.type aesni_xts_decrypt,@function
|
||
|
.align 16
|
||
|
aesni_xts_decrypt:
|
||
|
- leaq -104(%rsp),%rsp
|
||
|
+ leaq (%rsp),%rax
|
||
|
+ pushq %rbp
|
||
|
+ subq $112,%rsp
|
||
|
+ andq $-16,%rsp
|
||
|
+ leaq -8(%rax),%rbp
|
||
|
movups (%r9),%xmm15
|
||
|
movl 240(%r8),%eax
|
||
|
movl 240(%rcx),%r10d
|
||
|
@@ -1590,228 +1821,266 @@ aesni_xts_decrypt:
|
||
|
shlq $4,%rax
|
||
|
subq %rax,%rdx
|
||
|
|
||
|
+ movups (%rcx),%xmm0
|
||
|
movq %rcx,%r11
|
||
|
movl %r10d,%eax
|
||
|
+ shll $4,%r10d
|
||
|
movq %rdx,%r9
|
||
|
andq $-16,%rdx
|
||
|
|
||
|
+ movups 16(%rcx,%r10,1),%xmm1
|
||
|
+ movl %eax,%r10d
|
||
|
+
|
||
|
movdqa .Lxts_magic(%rip),%xmm8
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pshufd $95,%xmm15,%xmm9
|
||
|
+ pxor %xmm0,%xmm1
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm10
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm11
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm12
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm13
|
||
|
+ psrad $31,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm15,%xmm14
|
||
|
+ psrad $31,%xmm9
|
||
|
paddq %xmm15,%xmm15
|
||
|
pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
pxor %xmm9,%xmm15
|
||
|
+ movaps %xmm1,96(%rsp)
|
||
|
+
|
||
|
subq $96,%rdx
|
||
|
jc .Lxts_dec_short
|
||
|
|
||
|
shrl $1,%eax
|
||
|
- subl $1,%eax
|
||
|
+ subl $3,%eax
|
||
|
+ movups 16(%r11),%xmm1
|
||
|
movl %eax,%r10d
|
||
|
+ leaq .Lxts_magic(%rip),%r8
|
||
|
jmp .Lxts_dec_grandloop
|
||
|
|
||
|
-.align 16
|
||
|
+.align 32
|
||
|
.Lxts_dec_grandloop:
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- movdqa %xmm15,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
movdqu 0(%rdi),%xmm2
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ movdqa %xmm0,%xmm8
|
||
|
movdqu 16(%rdi),%xmm3
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-
|
||
|
- movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm10,%xmm2
|
||
|
- movdqu 48(%rdi),%xmm5
|
||
|
+ movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm11,%xmm3
|
||
|
- movdqu 64(%rdi),%xmm6
|
||
|
+.byte 102,15,56,222,209
|
||
|
+ movdqu 48(%rdi),%xmm5
|
||
|
pxor %xmm12,%xmm4
|
||
|
- movdqu 80(%rdi),%xmm7
|
||
|
- leaq 96(%rdi),%rdi
|
||
|
+.byte 102,15,56,222,217
|
||
|
+ movdqu 64(%rdi),%xmm6
|
||
|
pxor %xmm13,%xmm5
|
||
|
- movups (%r11),%xmm0
|
||
|
+.byte 102,15,56,222,225
|
||
|
+ movdqu 80(%rdi),%xmm7
|
||
|
+ pxor %xmm15,%xmm8
|
||
|
+ movdqa 96(%rsp),%xmm9
|
||
|
pxor %xmm14,%xmm6
|
||
|
- pxor %xmm15,%xmm7
|
||
|
-
|
||
|
-
|
||
|
+.byte 102,15,56,222,233
|
||
|
+ movups 32(%r11),%xmm0
|
||
|
+ leaq 96(%rdi),%rdi
|
||
|
+ pxor %xmm8,%xmm7
|
||
|
|
||
|
- movups 16(%r11),%xmm1
|
||
|
- pxor %xmm0,%xmm2
|
||
|
- pxor %xmm0,%xmm3
|
||
|
+ pxor %xmm9,%xmm10
|
||
|
+.byte 102,15,56,222,241
|
||
|
+ pxor %xmm9,%xmm11
|
||
|
movdqa %xmm10,0(%rsp)
|
||
|
-.byte 102,15,56,222,209
|
||
|
- leaq 32(%r11),%rcx
|
||
|
- pxor %xmm0,%xmm4
|
||
|
+.byte 102,15,56,222,249
|
||
|
+ movups 48(%r11),%xmm1
|
||
|
+
|
||
|
+.byte 102,15,56,222,208
|
||
|
+ pxor %xmm9,%xmm12
|
||
|
movdqa %xmm11,16(%rsp)
|
||
|
-.byte 102,15,56,222,217
|
||
|
- pxor %xmm0,%xmm5
|
||
|
+.byte 102,15,56,222,216
|
||
|
+ pxor %xmm9,%xmm13
|
||
|
movdqa %xmm12,32(%rsp)
|
||
|
-.byte 102,15,56,222,225
|
||
|
- pxor %xmm0,%xmm6
|
||
|
- movdqa %xmm13,48(%rsp)
|
||
|
-.byte 102,15,56,222,233
|
||
|
- pxor %xmm0,%xmm7
|
||
|
- movups (%rcx),%xmm0
|
||
|
- decl %eax
|
||
|
+.byte 102,15,56,222,224
|
||
|
+ pxor %xmm9,%xmm14
|
||
|
+.byte 102,15,56,222,232
|
||
|
+ pxor %xmm9,%xmm8
|
||
|
movdqa %xmm14,64(%rsp)
|
||
|
-.byte 102,15,56,222,241
|
||
|
- movdqa %xmm15,80(%rsp)
|
||
|
-.byte 102,15,56,222,249
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- jmp .Lxts_dec_loop6_enter
|
||
|
-
|
||
|
-.align 16
|
||
|
+.byte 102,15,56,222,240
|
||
|
+ movdqa %xmm8,80(%rsp)
|
||
|
+.byte 102,15,56,222,248
|
||
|
+ movups 64(%r11),%xmm0
|
||
|
+ leaq 64(%r11),%rcx
|
||
|
+ pshufd $95,%xmm15,%xmm9
|
||
|
+ jmp .Lxts_dec_loop6
|
||
|
+.align 32
|
||
|
.Lxts_dec_loop6:
|
||
|
.byte 102,15,56,222,209
|
||
|
.byte 102,15,56,222,217
|
||
|
- decl %eax
|
||
|
.byte 102,15,56,222,225
|
||
|
.byte 102,15,56,222,233
|
||
|
.byte 102,15,56,222,241
|
||
|
.byte 102,15,56,222,249
|
||
|
-.Lxts_dec_loop6_enter:
|
||
|
movups 16(%rcx),%xmm1
|
||
|
+ leaq 32(%rcx),%rcx
|
||
|
+
|
||
|
.byte 102,15,56,222,208
|
||
|
.byte 102,15,56,222,216
|
||
|
- leaq 32(%rcx),%rcx
|
||
|
.byte 102,15,56,222,224
|
||
|
.byte 102,15,56,222,232
|
||
|
.byte 102,15,56,222,240
|
||
|
.byte 102,15,56,222,248
|
||
|
movups (%rcx),%xmm0
|
||
|
+ decl %eax
|
||
|
jnz .Lxts_dec_loop6
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa (%r8),%xmm8
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,222,209
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,222,217
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ movups (%r11),%xmm10
|
||
|
.byte 102,15,56,222,225
|
||
|
- pxor %xmm9,%xmm15
|
||
|
.byte 102,15,56,222,233
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,222,241
|
||
|
+ movaps %xmm10,%xmm11
|
||
|
.byte 102,15,56,222,249
|
||
|
movups 16(%rcx),%xmm1
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm10
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,222,208
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ pxor %xmm15,%xmm10
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,222,216
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
.byte 102,15,56,222,224
|
||
|
- pxor %xmm9,%xmm15
|
||
|
.byte 102,15,56,222,232
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,222,240
|
||
|
+ movaps %xmm11,%xmm12
|
||
|
.byte 102,15,56,222,248
|
||
|
movups 32(%rcx),%xmm0
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm11
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,222,209
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ pxor %xmm15,%xmm11
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,222,217
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
.byte 102,15,56,222,225
|
||
|
- pxor %xmm9,%xmm15
|
||
|
+ movdqa %xmm13,48(%rsp)
|
||
|
.byte 102,15,56,222,233
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,222,241
|
||
|
+ movaps %xmm12,%xmm13
|
||
|
.byte 102,15,56,222,249
|
||
|
+ movups 48(%rcx),%xmm1
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm12
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
+.byte 102,15,56,222,208
|
||
|
+ pxor %xmm15,%xmm12
|
||
|
+ psrad $31,%xmm14
|
||
|
+.byte 102,15,56,222,216
|
||
|
paddq %xmm15,%xmm15
|
||
|
-.byte 102,15,56,223,208
|
||
|
- pand %xmm8,%xmm9
|
||
|
-.byte 102,15,56,223,216
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
-.byte 102,15,56,223,224
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-.byte 102,15,56,223,232
|
||
|
-.byte 102,15,56,223,240
|
||
|
-.byte 102,15,56,223,248
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+.byte 102,15,56,222,224
|
||
|
+.byte 102,15,56,222,232
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+.byte 102,15,56,222,240
|
||
|
+ movaps %xmm13,%xmm14
|
||
|
+.byte 102,15,56,222,248
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm13
|
||
|
+ movdqa %xmm9,%xmm0
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
+.byte 102,15,56,222,209
|
||
|
+ pxor %xmm15,%xmm13
|
||
|
+ psrad $31,%xmm0
|
||
|
+.byte 102,15,56,222,217
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm0
|
||
|
+.byte 102,15,56,222,225
|
||
|
+.byte 102,15,56,222,233
|
||
|
+ pxor %xmm0,%xmm15
|
||
|
+ movups (%r11),%xmm0
|
||
|
+.byte 102,15,56,222,241
|
||
|
+.byte 102,15,56,222,249
|
||
|
+ movups 16(%r11),%xmm1
|
||
|
+
|
||
|
+ pxor %xmm15,%xmm14
|
||
|
+ psrad $31,%xmm9
|
||
|
+.byte 102,15,56,223,84,36,0
|
||
|
paddq %xmm15,%xmm15
|
||
|
- xorps 0(%rsp),%xmm2
|
||
|
pand %xmm8,%xmm9
|
||
|
- xorps 16(%rsp),%xmm3
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+.byte 102,15,56,223,92,36,16
|
||
|
+.byte 102,15,56,223,100,36,32
|
||
|
pxor %xmm9,%xmm15
|
||
|
-
|
||
|
- xorps 32(%rsp),%xmm4
|
||
|
- movups %xmm2,0(%rsi)
|
||
|
- xorps 48(%rsp),%xmm5
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- xorps 64(%rsp),%xmm6
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- xorps 80(%rsp),%xmm7
|
||
|
- movups %xmm5,48(%rsi)
|
||
|
+.byte 102,15,56,223,108,36,48
|
||
|
+.byte 102,15,56,223,116,36,64
|
||
|
+.byte 102,15,56,223,124,36,80
|
||
|
movl %r10d,%eax
|
||
|
- movups %xmm6,64(%rsi)
|
||
|
- movups %xmm7,80(%rsi)
|
||
|
+
|
||
|
leaq 96(%rsi),%rsi
|
||
|
+ movups %xmm2,-96(%rsi)
|
||
|
+ movups %xmm3,-80(%rsi)
|
||
|
+ movups %xmm4,-64(%rsi)
|
||
|
+ movups %xmm5,-48(%rsi)
|
||
|
+ movups %xmm6,-32(%rsi)
|
||
|
+ movups %xmm7,-16(%rsi)
|
||
|
subq $96,%rdx
|
||
|
jnc .Lxts_dec_grandloop
|
||
|
|
||
|
- leal 3(%rax,%rax,1),%eax
|
||
|
+ leal 7(%rax,%rax,1),%eax
|
||
|
movq %r11,%rcx
|
||
|
movl %eax,%r10d
|
||
|
|
||
|
.Lxts_dec_short:
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
addq $96,%rdx
|
||
|
jz .Lxts_dec_done
|
||
|
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
cmpq $32,%rdx
|
||
|
jb .Lxts_dec_one
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
je .Lxts_dec_two
|
||
|
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
cmpq $64,%rdx
|
||
|
jb .Lxts_dec_three
|
||
|
je .Lxts_dec_four
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- movdqa %xmm15,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
movdqu (%rdi),%xmm2
|
||
|
- pand %xmm8,%xmm9
|
||
|
movdqu 16(%rdi),%xmm3
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-
|
||
|
movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm10,%xmm2
|
||
|
movdqu 48(%rdi),%xmm5
|
||
|
@@ -1904,7 +2173,7 @@ aesni_xts_decrypt:
|
||
|
xorps %xmm10,%xmm2
|
||
|
movdqa %xmm13,%xmm10
|
||
|
xorps %xmm11,%xmm3
|
||
|
- movdqa %xmm15,%xmm11
|
||
|
+ movdqa %xmm14,%xmm11
|
||
|
xorps %xmm12,%xmm4
|
||
|
movups %xmm2,(%rsi)
|
||
|
movups %xmm3,16(%rsi)
|
||
|
@@ -1914,14 +2183,8 @@ aesni_xts_decrypt:
|
||
|
|
||
|
.align 16
|
||
|
.Lxts_dec_four:
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- movdqa %xmm15,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
movups (%rdi),%xmm2
|
||
|
- pand %xmm8,%xmm9
|
||
|
movups 16(%rdi),%xmm3
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-
|
||
|
movups 32(%rdi),%xmm4
|
||
|
xorps %xmm10,%xmm2
|
||
|
movups 48(%rdi),%xmm5
|
||
|
@@ -1932,16 +2195,16 @@ aesni_xts_decrypt:
|
||
|
|
||
|
call _aesni_decrypt4
|
||
|
|
||
|
- xorps %xmm10,%xmm2
|
||
|
+ pxor %xmm10,%xmm2
|
||
|
movdqa %xmm14,%xmm10
|
||
|
- xorps %xmm11,%xmm3
|
||
|
+ pxor %xmm11,%xmm3
|
||
|
movdqa %xmm15,%xmm11
|
||
|
- xorps %xmm12,%xmm4
|
||
|
- movups %xmm2,(%rsi)
|
||
|
- xorps %xmm13,%xmm5
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- movups %xmm5,48(%rsi)
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm2,(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm3,16(%rsi)
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ movdqu %xmm5,48(%rsi)
|
||
|
leaq 64(%rsi),%rsi
|
||
|
jmp .Lxts_dec_done
|
||
|
|
||
|
@@ -2001,7 +2264,8 @@ aesni_xts_decrypt:
|
||
|
movups %xmm2,(%rsi)
|
||
|
|
||
|
.Lxts_dec_ret:
|
||
|
- leaq 104(%rsp),%rsp
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ popq %rbp
|
||
|
.Lxts_dec_epilogue:
|
||
|
.byte 0xf3,0xc3
|
||
|
.size aesni_xts_decrypt,.-aesni_xts_decrypt
|
||
|
@@ -2068,149 +2332,324 @@ aesni_cbc_encrypt:
|
||
|
|
||
|
.align	16
.Lcbc_decrypt:
-	movups	(%r8),%xmm9
+	leaq	(%rsp),%rax
+	pushq	%rbp
+	subq	$16,%rsp
+	andq	$-16,%rsp
+	leaq	-8(%rax),%rbp
+	movups	(%r8),%xmm10
	movl	%r10d,%eax
-	cmpq	$112,%rdx
+	cmpq	$80,%rdx
	jbe	.Lcbc_dec_tail
-	shrl	$1,%r10d
+
+	movups	(%rcx),%xmm0
+	movdqu	0(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqa	%xmm2,%xmm11
+	movdqu	32(%rdi),%xmm4
+	movdqa	%xmm3,%xmm12
+	movdqu	48(%rdi),%xmm5
+	movdqa	%xmm4,%xmm13
+	movdqu	64(%rdi),%xmm6
+	movdqa	%xmm5,%xmm14
+	movdqu	80(%rdi),%xmm7
+	movdqa	%xmm6,%xmm15
+	cmpq	$112,%rdx
+	jbe	.Lcbc_dec_six_or_seven
+
	subq	$112,%rdx
-	movl	%r10d,%eax
-	movaps	%xmm9,-24(%rsp)
+	leaq	112(%rcx),%rcx
	jmp	.Lcbc_dec_loop8_enter
.align	16
.Lcbc_dec_loop8:
-	movaps	%xmm0,-24(%rsp)
	movups	%xmm9,(%rsi)
	leaq	16(%rsi),%rsi
.Lcbc_dec_loop8_enter:
-	movups	(%rcx),%xmm0
-	movups	(%rdi),%xmm2
-	movups	16(%rdi),%xmm3
-	movups	16(%rcx),%xmm1
+	movdqu	96(%rdi),%xmm8
+	pxor	%xmm0,%xmm2
+	movdqu	112(%rdi),%xmm9
+	pxor	%xmm0,%xmm3
+	movups	16-112(%rcx),%xmm1
+	pxor	%xmm0,%xmm4
+	xorq	%r11,%r11
+	cmpq	$112,%rdx
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+	pxor	%xmm0,%xmm7
+	pxor	%xmm0,%xmm8

-	leaq	32(%rcx),%rcx
-	movdqu	32(%rdi),%xmm4
-	xorps	%xmm0,%xmm2
-	movdqu	48(%rdi),%xmm5
-	xorps	%xmm0,%xmm3
-	movdqu	64(%rdi),%xmm6
.byte	102,15,56,222,209
-	pxor	%xmm0,%xmm4
-	movdqu	80(%rdi),%xmm7
+	pxor	%xmm0,%xmm9
+	movups	32-112(%rcx),%xmm0
.byte	102,15,56,222,217
-	pxor	%xmm0,%xmm5
-	movdqu	96(%rdi),%xmm8
.byte	102,15,56,222,225
-	pxor	%xmm0,%xmm6
-	movdqu	112(%rdi),%xmm9
.byte	102,15,56,222,233
-	pxor	%xmm0,%xmm7
-	decl	%eax
.byte	102,15,56,222,241
-	pxor	%xmm0,%xmm8
.byte	102,15,56,222,249
-	pxor	%xmm0,%xmm9
-	movups	(%rcx),%xmm0
+	setnc	%r11b
.byte	102,68,15,56,222,193
+	shlq	$7,%r11
.byte	102,68,15,56,222,201
-	movups	16(%rcx),%xmm1
-
-	call	.Ldec_loop8_enter
+	addq	%rdi,%r11
+	movups	48-112(%rcx),%xmm1
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	64-112(%rcx),%xmm0
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	80-112(%rcx),%xmm1
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	96-112(%rcx),%xmm0
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	112-112(%rcx),%xmm1
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	128-112(%rcx),%xmm0
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	144-112(%rcx),%xmm1
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	160-112(%rcx),%xmm0
+	cmpl	$11,%eax
+	jb	.Lcbc_dec_done
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	176-112(%rcx),%xmm1
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	192-112(%rcx),%xmm0
+	je	.Lcbc_dec_done
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	208-112(%rcx),%xmm1
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	224-112(%rcx),%xmm0
+.Lcbc_dec_done:
+.byte	102,15,56,222,209
+	pxor	%xmm0,%xmm10
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm11
+.byte	102,15,56,222,225
+	pxor	%xmm0,%xmm12
+.byte	102,15,56,222,233
+	pxor	%xmm0,%xmm13
+.byte	102,15,56,222,241
+	pxor	%xmm0,%xmm14
+.byte	102,15,56,222,249
+	pxor	%xmm0,%xmm15
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movdqu	80(%rdi),%xmm1
+
+.byte	102,65,15,56,223,210
+	movdqu	96(%rdi),%xmm10
+	pxor	%xmm0,%xmm1
+.byte	102,65,15,56,223,219
+	pxor	%xmm0,%xmm10
+	movdqu	112(%rdi),%xmm0
+	leaq	128(%rdi),%rdi
+.byte	102,65,15,56,223,228
+	movdqu	0(%r11),%xmm11
+.byte	102,65,15,56,223,237
+	movdqu	16(%r11),%xmm12
+.byte	102,65,15,56,223,246
+	movdqu	32(%r11),%xmm13
+.byte	102,65,15,56,223,255
+	movdqu	48(%r11),%xmm14
+.byte	102,68,15,56,223,193
+	movdqu	64(%r11),%xmm15
+.byte	102,69,15,56,223,202
+	movdqa	%xmm0,%xmm10
+	movdqu	80(%r11),%xmm1
+	movups	-112(%rcx),%xmm0

-	movups	(%rdi),%xmm1
-	movups	16(%rdi),%xmm0
-	xorps	-24(%rsp),%xmm2
-	xorps	%xmm1,%xmm3
-	movups	32(%rdi),%xmm1
-	xorps	%xmm0,%xmm4
-	movups	48(%rdi),%xmm0
-	xorps	%xmm1,%xmm5
-	movups	64(%rdi),%xmm1
-	xorps	%xmm0,%xmm6
-	movups	80(%rdi),%xmm0
-	xorps	%xmm1,%xmm7
-	movups	96(%rdi),%xmm1
-	xorps	%xmm0,%xmm8
-	movups	112(%rdi),%xmm0
-	xorps	%xmm1,%xmm9
	movups	%xmm2,(%rsi)
+	movdqa	%xmm11,%xmm2
	movups	%xmm3,16(%rsi)
+	movdqa	%xmm12,%xmm3
	movups	%xmm4,32(%rsi)
+	movdqa	%xmm13,%xmm4
	movups	%xmm5,48(%rsi)
-	movl	%r10d,%eax
+	movdqa	%xmm14,%xmm5
	movups	%xmm6,64(%rsi)
-	movq	%r11,%rcx
+	movdqa	%xmm15,%xmm6
	movups	%xmm7,80(%rsi)
-	leaq	128(%rdi),%rdi
+	movdqa	%xmm1,%xmm7
	movups	%xmm8,96(%rsi)
	leaq	112(%rsi),%rsi
+
	subq	$128,%rdx
	ja	.Lcbc_dec_loop8

	movaps	%xmm9,%xmm2
-	movaps	%xmm0,%xmm9
+	leaq	-112(%rcx),%rcx
	addq	$112,%rdx
	jle	.Lcbc_dec_tail_collected
-	movups	%xmm2,(%rsi)
-	leal	1(%r10,%r10,1),%eax
+	movups	%xmm9,(%rsi)
	leaq	16(%rsi),%rsi
+	cmpq	$80,%rdx
+	jbe	.Lcbc_dec_tail
+
+	movaps	%xmm11,%xmm2
+.Lcbc_dec_six_or_seven:
+	cmpq	$96,%rdx
+	ja	.Lcbc_dec_seven
+
+	movaps	%xmm7,%xmm8
+	call	_aesni_decrypt6
+	pxor	%xmm10,%xmm2
+	movaps	%xmm8,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm14,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm15,%xmm7
+	movdqu	%xmm6,64(%rsi)
+	leaq	80(%rsi),%rsi
+	movdqa	%xmm7,%xmm2
+	jmp	.Lcbc_dec_tail_collected
+
+.align	16
+.Lcbc_dec_seven:
+	movups	96(%rdi),%xmm8
+	xorps	%xmm9,%xmm9
+	call	_aesni_decrypt8
+	movups	80(%rdi),%xmm9
+	pxor	%xmm10,%xmm2
+	movups	96(%rdi),%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm14,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm15,%xmm7
+	movdqu	%xmm6,64(%rsi)
+	pxor	%xmm9,%xmm8
+	movdqu	%xmm7,80(%rsi)
+	leaq	96(%rsi),%rsi
+	movdqa	%xmm8,%xmm2
+	jmp	.Lcbc_dec_tail_collected
+
.Lcbc_dec_tail:
	movups	(%rdi),%xmm2
-	movaps	%xmm2,%xmm8
-	cmpq	$16,%rdx
+	subq	$16,%rdx
	jbe	.Lcbc_dec_one

	movups	16(%rdi),%xmm3
-	movaps	%xmm3,%xmm7
-	cmpq	$32,%rdx
+	movaps	%xmm2,%xmm11
+	subq	$16,%rdx
	jbe	.Lcbc_dec_two

	movups	32(%rdi),%xmm4
-	movaps	%xmm4,%xmm6
-	cmpq	$48,%rdx
+	movaps	%xmm3,%xmm12
+	subq	$16,%rdx
	jbe	.Lcbc_dec_three

	movups	48(%rdi),%xmm5
-	cmpq	$64,%rdx
+	movaps	%xmm4,%xmm13
+	subq	$16,%rdx
	jbe	.Lcbc_dec_four

	movups	64(%rdi),%xmm6
-	cmpq	$80,%rdx
-	jbe	.Lcbc_dec_five
-
-	movups	80(%rdi),%xmm7
-	cmpq	$96,%rdx
-	jbe	.Lcbc_dec_six
-
-	movups	96(%rdi),%xmm8
-	movaps	%xmm9,-24(%rsp)
-	call	_aesni_decrypt8
-	movups	(%rdi),%xmm1
-	movups	16(%rdi),%xmm0
-	xorps	-24(%rsp),%xmm2
-	xorps	%xmm1,%xmm3
-	movups	32(%rdi),%xmm1
-	xorps	%xmm0,%xmm4
-	movups	48(%rdi),%xmm0
-	xorps	%xmm1,%xmm5
-	movups	64(%rdi),%xmm1
-	xorps	%xmm0,%xmm6
-	movups	80(%rdi),%xmm0
-	xorps	%xmm1,%xmm7
-	movups	96(%rdi),%xmm9
-	xorps	%xmm0,%xmm8
-	movups	%xmm2,(%rsi)
-	movups	%xmm3,16(%rsi)
-	movups	%xmm4,32(%rsi)
-	movups	%xmm5,48(%rsi)
-	movups	%xmm6,64(%rsi)
-	movups	%xmm7,80(%rsi)
-	leaq	96(%rsi),%rsi
-	movaps	%xmm8,%xmm2
-	subq	$112,%rdx
+	movaps	%xmm5,%xmm14
+	movaps	%xmm6,%xmm15
+	xorps	%xmm7,%xmm7
+	call	_aesni_decrypt6
+	pxor	%xmm10,%xmm2
+	movaps	%xmm15,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm14,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	leaq	64(%rsi),%rsi
+	movdqa	%xmm6,%xmm2
+	subq	$16,%rdx
	jmp	.Lcbc_dec_tail_collected
+
.align	16
.Lcbc_dec_one:
+	movaps	%xmm2,%xmm11
	movups	(%rcx),%xmm0
	movups	16(%rcx),%xmm1
	leaq	32(%rcx),%rcx
@@ -2222,111 +2661,69 @@ aesni_cbc_encrypt:
	leaq	16(%rcx),%rcx
	jnz	.Loop_dec1_16
.byte	102,15,56,223,209
-	xorps	%xmm9,%xmm2
-	movaps	%xmm8,%xmm9
-	subq	$16,%rdx
+	xorps	%xmm10,%xmm2
+	movaps	%xmm11,%xmm10
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_two:
+	movaps	%xmm3,%xmm12
	xorps	%xmm4,%xmm4
	call	_aesni_decrypt3
-	xorps	%xmm9,%xmm2
-	xorps	%xmm8,%xmm3
-	movups	%xmm2,(%rsi)
-	movaps	%xmm7,%xmm9
-	movaps	%xmm3,%xmm2
+	pxor	%xmm10,%xmm2
+	movaps	%xmm12,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	movdqa	%xmm3,%xmm2
	leaq	16(%rsi),%rsi
-	subq	$32,%rdx
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_three:
+	movaps	%xmm4,%xmm13
	call	_aesni_decrypt3
-	xorps	%xmm9,%xmm2
-	xorps	%xmm8,%xmm3
-	movups	%xmm2,(%rsi)
-	xorps	%xmm7,%xmm4
-	movups	%xmm3,16(%rsi)
-	movaps	%xmm6,%xmm9
-	movaps	%xmm4,%xmm2
+	pxor	%xmm10,%xmm2
+	movaps	%xmm13,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	movdqa	%xmm4,%xmm2
	leaq	32(%rsi),%rsi
-	subq	$48,%rdx
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_four:
+	movaps	%xmm5,%xmm14
	call	_aesni_decrypt4
-	xorps	%xmm9,%xmm2
-	movups	48(%rdi),%xmm9
-	xorps	%xmm8,%xmm3
-	movups	%xmm2,(%rsi)
-	xorps	%xmm7,%xmm4
-	movups	%xmm3,16(%rsi)
-	xorps	%xmm6,%xmm5
-	movups	%xmm4,32(%rsi)
-	movaps	%xmm5,%xmm2
+	pxor	%xmm10,%xmm2
+	movaps	%xmm14,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	movdqa	%xmm5,%xmm2
	leaq	48(%rsi),%rsi
-	subq	$64,%rdx
-	jmp	.Lcbc_dec_tail_collected
-.align	16
-.Lcbc_dec_five:
-	xorps	%xmm7,%xmm7
-	call	_aesni_decrypt6
-	movups	16(%rdi),%xmm1
-	movups	32(%rdi),%xmm0
-	xorps	%xmm9,%xmm2
-	xorps	%xmm8,%xmm3
-	xorps	%xmm1,%xmm4
-	movups	48(%rdi),%xmm1
-	xorps	%xmm0,%xmm5
-	movups	64(%rdi),%xmm9
-	xorps	%xmm1,%xmm6
-	movups	%xmm2,(%rsi)
-	movups	%xmm3,16(%rsi)
-	movups	%xmm4,32(%rsi)
-	movups	%xmm5,48(%rsi)
-	leaq	64(%rsi),%rsi
-	movaps	%xmm6,%xmm2
-	subq	$80,%rdx
-	jmp	.Lcbc_dec_tail_collected
-.align	16
-.Lcbc_dec_six:
-	call	_aesni_decrypt6
-	movups	16(%rdi),%xmm1
-	movups	32(%rdi),%xmm0
-	xorps	%xmm9,%xmm2
-	xorps	%xmm8,%xmm3
-	xorps	%xmm1,%xmm4
-	movups	48(%rdi),%xmm1
-	xorps	%xmm0,%xmm5
-	movups	64(%rdi),%xmm0
-	xorps	%xmm1,%xmm6
-	movups	80(%rdi),%xmm9
-	xorps	%xmm0,%xmm7
-	movups	%xmm2,(%rsi)
-	movups	%xmm3,16(%rsi)
-	movups	%xmm4,32(%rsi)
-	movups	%xmm5,48(%rsi)
-	movups	%xmm6,64(%rsi)
-	leaq	80(%rsi),%rsi
-	movaps	%xmm7,%xmm2
-	subq	$96,%rdx
	jmp	.Lcbc_dec_tail_collected
+
.align	16
.Lcbc_dec_tail_collected:
+	movups	%xmm10,(%r8)
	andq	$15,%rdx
-	movups	%xmm9,(%r8)
	jnz	.Lcbc_dec_tail_partial
	movups	%xmm2,(%rsi)
	jmp	.Lcbc_dec_ret
.align	16
.Lcbc_dec_tail_partial:
-	movaps	%xmm2,-24(%rsp)
+	movaps	%xmm2,(%rsp)
	movq	$16,%rcx
	movq	%rsi,%rdi
	subq	%rdx,%rcx
-	leaq	-24(%rsp),%rsi
+	leaq	(%rsp),%rsi
.long	0x9066A4F3

.Lcbc_dec_ret:
+	leaq	(%rbp),%rsp
+	popq	%rbp
.Lcbc_ret:
.byte	0xf3,0xc3
.size	aesni_cbc_encrypt,.-aesni_cbc_encrypt
@@ -2569,6 +2966,8 @@ __aesni_set_encrypt_key:
.long	1,0,0,0
.Lxts_magic:
.long	0x87,0,1,0
+.Lincrement1:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1

.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	64
diff --git a/lib/accelerated/x86/elf/padlock-x86-64.s b/lib/accelerated/x86/elf/padlock-x86-64.s
index 4709ac2..2ac113d 100644
--- a/lib/accelerated/x86/elf/padlock-x86-64.s
+++ b/lib/accelerated/x86/elf/padlock-x86-64.s
@@ -595,6 +595,468 @@ padlock_cbc_encrypt:
	popq	%rbp
.byte	0xf3,0xc3
.size	padlock_cbc_encrypt,.-padlock_cbc_encrypt
+.globl padlock_cfb_encrypt
|
||
|
+.type padlock_cfb_encrypt,@function
|
||
|
+.align 16
|
||
|
+padlock_cfb_encrypt:
|
||
|
+ pushq %rbp
|
||
|
+ pushq %rbx
|
||
|
+
|
||
|
+ xorl %eax,%eax
|
||
|
+ testq $15,%rdx
|
||
|
+ jnz .Lcfb_abort
|
||
|
+ testq $15,%rcx
|
||
|
+ jnz .Lcfb_abort
|
||
|
+ leaq .Lpadlock_saved_context(%rip),%rax
|
||
|
+ pushf
|
||
|
+ cld
|
||
|
+ call _padlock_verify_ctx
|
||
|
+ leaq 16(%rdx),%rdx
|
||
|
+ xorl %eax,%eax
|
||
|
+ xorl %ebx,%ebx
|
||
|
+ testl $32,(%rdx)
|
||
|
+ jnz .Lcfb_aligned
|
||
|
+ testq $15,%rdi
|
||
|
+ setz %al
|
||
|
+ testq $15,%rsi
|
||
|
+ setz %bl
|
||
|
+ testl %ebx,%eax
|
||
|
+ jnz .Lcfb_aligned
|
||
|
+ negq %rax
|
||
|
+ movq $512,%rbx
|
||
|
+ notq %rax
|
||
|
+ leaq (%rsp),%rbp
|
||
|
+ cmpq %rbx,%rcx
|
||
|
+ cmovcq %rcx,%rbx
|
||
|
+ andq %rbx,%rax
|
||
|
+ movq %rcx,%rbx
|
||
|
+ negq %rax
|
||
|
+ andq $512-1,%rbx
|
||
|
+ leaq (%rax,%rbp,1),%rsp
|
||
|
+ movq $512,%rax
|
||
|
+ cmovzq %rax,%rbx
|
||
|
+ jmp .Lcfb_loop
|
||
|
+.align 16
|
||
|
+.Lcfb_loop:
|
||
|
+ cmpq %rcx,%rbx
|
||
|
+ cmovaq %rcx,%rbx
|
||
|
+ movq %rdi,%r8
|
||
|
+ movq %rsi,%r9
|
||
|
+ movq %rcx,%r10
|
||
|
+ movq %rbx,%rcx
|
||
|
+ movq %rbx,%r11
|
||
|
+ testq $15,%rdi
|
||
|
+ cmovnzq %rsp,%rdi
|
||
|
+ testq $15,%rsi
|
||
|
+ jz .Lcfb_inp_aligned
|
||
|
+ shrq $3,%rcx
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ subq %rbx,%rdi
|
||
|
+ movq %rbx,%rcx
|
||
|
+ movq %rdi,%rsi
|
||
|
+.Lcfb_inp_aligned:
|
||
|
+ leaq -16(%rdx),%rax
|
||
|
+ leaq 16(%rdx),%rbx
|
||
|
+ shrq $4,%rcx
|
||
|
+.byte 0xf3,0x0f,0xa7,224
|
||
|
+ movdqa (%rax),%xmm0
|
||
|
+ movdqa %xmm0,-16(%rdx)
|
||
|
+ movq %r8,%rdi
|
||
|
+ movq %r11,%rbx
|
||
|
+ testq $15,%rdi
|
||
|
+ jz .Lcfb_out_aligned
|
||
|
+ movq %rbx,%rcx
|
||
|
+ leaq (%rsp),%rsi
|
||
|
+ shrq $3,%rcx
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ subq %rbx,%rdi
|
||
|
+.Lcfb_out_aligned:
|
||
|
+ movq %r9,%rsi
|
||
|
+ movq %r10,%rcx
|
||
|
+ addq %rbx,%rdi
|
||
|
+ addq %rbx,%rsi
|
||
|
+ subq %rbx,%rcx
|
||
|
+ movq $512,%rbx
|
||
|
+ jnz .Lcfb_loop
|
||
|
+ cmpq %rbp,%rsp
|
||
|
+ je .Lcfb_done
|
||
|
+
|
||
|
+ pxor %xmm0,%xmm0
|
||
|
+ leaq (%rsp),%rax
|
||
|
+.Lcfb_bzero:
|
||
|
+ movaps %xmm0,(%rax)
|
||
|
+ leaq 16(%rax),%rax
|
||
|
+ cmpq %rax,%rbp
|
||
|
+ ja .Lcfb_bzero
|
||
|
+
|
||
|
+.Lcfb_done:
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ jmp .Lcfb_exit
|
||
|
+
|
||
|
+.align 16
|
||
|
+.Lcfb_aligned:
|
||
|
+ leaq -16(%rdx),%rax
|
||
|
+ leaq 16(%rdx),%rbx
|
||
|
+ shrq $4,%rcx
|
||
|
+.byte 0xf3,0x0f,0xa7,224
|
||
|
+ movdqa (%rax),%xmm0
|
||
|
+ movdqa %xmm0,-16(%rdx)
|
||
|
+.Lcfb_exit:
|
||
|
+ movl $1,%eax
|
||
|
+ leaq 8(%rsp),%rsp
|
||
|
+.Lcfb_abort:
|
||
|
+ popq %rbx
|
||
|
+ popq %rbp
|
||
|
+ .byte 0xf3,0xc3
|
||
|
+.size padlock_cfb_encrypt,.-padlock_cfb_encrypt
|
||
|
+.globl padlock_ofb_encrypt
|
||
|
+.type padlock_ofb_encrypt,@function
|
||
|
+.align 16
|
||
|
+padlock_ofb_encrypt:
|
||
|
+ pushq %rbp
|
||
|
+ pushq %rbx
|
||
|
+
|
||
|
+ xorl %eax,%eax
|
||
|
+ testq $15,%rdx
|
||
|
+ jnz .Lofb_abort
|
||
|
+ testq $15,%rcx
|
||
|
+ jnz .Lofb_abort
|
||
|
+ leaq .Lpadlock_saved_context(%rip),%rax
|
||
|
+ pushf
|
||
|
+ cld
|
||
|
+ call _padlock_verify_ctx
|
||
|
+ leaq 16(%rdx),%rdx
|
||
|
+ xorl %eax,%eax
|
||
|
+ xorl %ebx,%ebx
|
||
|
+ testl $32,(%rdx)
|
||
|
+ jnz .Lofb_aligned
|
||
|
+ testq $15,%rdi
|
||
|
+ setz %al
|
||
|
+ testq $15,%rsi
|
||
|
+ setz %bl
|
||
|
+ testl %ebx,%eax
|
||
|
+ jnz .Lofb_aligned
|
||
|
+ negq %rax
|
||
|
+ movq $512,%rbx
|
||
|
+ notq %rax
|
||
|
+ leaq (%rsp),%rbp
|
||
|
+ cmpq %rbx,%rcx
|
||
|
+ cmovcq %rcx,%rbx
|
||
|
+ andq %rbx,%rax
|
||
|
+ movq %rcx,%rbx
|
||
|
+ negq %rax
|
||
|
+ andq $512-1,%rbx
|
||
|
+ leaq (%rax,%rbp,1),%rsp
|
||
|
+ movq $512,%rax
|
||
|
+ cmovzq %rax,%rbx
|
||
|
+ jmp .Lofb_loop
|
||
|
+.align 16
|
||
|
+.Lofb_loop:
|
||
|
+ cmpq %rcx,%rbx
|
||
|
+ cmovaq %rcx,%rbx
|
||
|
+ movq %rdi,%r8
|
||
|
+ movq %rsi,%r9
|
||
|
+ movq %rcx,%r10
|
||
|
+ movq %rbx,%rcx
|
||
|
+ movq %rbx,%r11
|
||
|
+ testq $15,%rdi
|
||
|
+ cmovnzq %rsp,%rdi
|
||
|
+ testq $15,%rsi
|
||
|
+ jz .Lofb_inp_aligned
|
||
|
+ shrq $3,%rcx
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ subq %rbx,%rdi
|
||
|
+ movq %rbx,%rcx
|
||
|
+ movq %rdi,%rsi
|
||
|
+.Lofb_inp_aligned:
|
||
|
+ leaq -16(%rdx),%rax
|
||
|
+ leaq 16(%rdx),%rbx
|
||
|
+ shrq $4,%rcx
|
||
|
+.byte 0xf3,0x0f,0xa7,232
|
||
|
+ movdqa (%rax),%xmm0
|
||
|
+ movdqa %xmm0,-16(%rdx)
|
||
|
+ movq %r8,%rdi
|
||
|
+ movq %r11,%rbx
|
||
|
+ testq $15,%rdi
|
||
|
+ jz .Lofb_out_aligned
|
||
|
+ movq %rbx,%rcx
|
||
|
+ leaq (%rsp),%rsi
|
||
|
+ shrq $3,%rcx
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ subq %rbx,%rdi
|
||
|
+.Lofb_out_aligned:
|
||
|
+ movq %r9,%rsi
|
||
|
+ movq %r10,%rcx
|
||
|
+ addq %rbx,%rdi
|
||
|
+ addq %rbx,%rsi
|
||
|
+ subq %rbx,%rcx
|
||
|
+ movq $512,%rbx
|
||
|
+ jnz .Lofb_loop
|
||
|
+ cmpq %rbp,%rsp
|
||
|
+ je .Lofb_done
|
||
|
+
|
||
|
+ pxor %xmm0,%xmm0
|
||
|
+ leaq (%rsp),%rax
|
||
|
+.Lofb_bzero:
|
||
|
+ movaps %xmm0,(%rax)
|
||
|
+ leaq 16(%rax),%rax
|
||
|
+ cmpq %rax,%rbp
|
||
|
+ ja .Lofb_bzero
|
||
|
+
|
||
|
+.Lofb_done:
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ jmp .Lofb_exit
|
||
|
+
|
||
|
+.align 16
|
||
|
+.Lofb_aligned:
|
||
|
+ leaq -16(%rdx),%rax
|
||
|
+ leaq 16(%rdx),%rbx
|
||
|
+ shrq $4,%rcx
|
||
|
+.byte 0xf3,0x0f,0xa7,232
|
||
|
+ movdqa (%rax),%xmm0
|
||
|
+ movdqa %xmm0,-16(%rdx)
|
||
|
+.Lofb_exit:
|
||
|
+ movl $1,%eax
|
||
|
+ leaq 8(%rsp),%rsp
|
||
|
+.Lofb_abort:
|
||
|
+ popq %rbx
|
||
|
+ popq %rbp
|
||
|
+ .byte 0xf3,0xc3
|
||
|
+.size padlock_ofb_encrypt,.-padlock_ofb_encrypt
|
||
|
+.globl padlock_ctr32_encrypt
|
||
|
+.type padlock_ctr32_encrypt,@function
|
||
|
+.align 16
|
||
|
+padlock_ctr32_encrypt:
|
||
|
+ pushq %rbp
|
||
|
+ pushq %rbx
|
||
|
+
|
||
|
+ xorl %eax,%eax
|
||
|
+ testq $15,%rdx
|
||
|
+ jnz .Lctr32_abort
|
||
|
+ testq $15,%rcx
|
||
|
+ jnz .Lctr32_abort
|
||
|
+ leaq .Lpadlock_saved_context(%rip),%rax
|
||
|
+ pushf
|
||
|
+ cld
|
||
|
+ call _padlock_verify_ctx
|
||
|
+ leaq 16(%rdx),%rdx
|
||
|
+ xorl %eax,%eax
|
||
|
+ xorl %ebx,%ebx
|
||
|
+ testl $32,(%rdx)
|
||
|
+ jnz .Lctr32_aligned
|
||
|
+ testq $15,%rdi
|
||
|
+ setz %al
|
||
|
+ testq $15,%rsi
|
||
|
+ setz %bl
|
||
|
+ testl %ebx,%eax
|
||
|
+ jnz .Lctr32_aligned
|
||
|
+ negq %rax
|
||
|
+ movq $512,%rbx
|
||
|
+ notq %rax
|
||
|
+ leaq (%rsp),%rbp
|
||
|
+ cmpq %rbx,%rcx
|
||
|
+ cmovcq %rcx,%rbx
|
||
|
+ andq %rbx,%rax
|
||
|
+ movq %rcx,%rbx
|
||
|
+ negq %rax
|
||
|
+ andq $512-1,%rbx
|
||
|
+ leaq (%rax,%rbp,1),%rsp
|
||
|
+ movq $512,%rax
|
||
|
+ cmovzq %rax,%rbx
|
||
|
+.Lctr32_reenter:
|
||
|
+ movl -4(%rdx),%eax
|
||
|
+ bswapl %eax
|
||
|
+ negl %eax
|
||
|
+ andl $31,%eax
|
||
|
+ movq $512,%rbx
|
||
|
+ shll $4,%eax
|
||
|
+ cmovzq %rbx,%rax
|
||
|
+ cmpq %rax,%rcx
|
||
|
+ cmovaq %rax,%rbx
|
||
|
+ cmovbeq %rcx,%rbx
|
||
|
+ cmpq %rbx,%rcx
|
||
|
+ ja .Lctr32_loop
|
||
|
+ movq %rsi,%rax
|
||
|
+ cmpq %rsp,%rbp
|
||
|
+ cmoveq %rdi,%rax
|
||
|
+ addq %rcx,%rax
|
||
|
+ negq %rax
|
||
|
+ andq $4095,%rax
|
||
|
+ cmpq $32,%rax
|
||
|
+ movq $-32,%rax
|
||
|
+ cmovaeq %rbx,%rax
|
||
|
+ andq %rax,%rbx
|
||
|
+ jz .Lctr32_unaligned_tail
|
||
|
+ jmp .Lctr32_loop
|
||
|
+.align 16
|
||
|
+.Lctr32_loop:
|
||
|
+ cmpq %rcx,%rbx
|
||
|
+ cmovaq %rcx,%rbx
|
||
|
+ movq %rdi,%r8
|
||
|
+ movq %rsi,%r9
|
||
|
+ movq %rcx,%r10
|
||
|
+ movq %rbx,%rcx
|
||
|
+ movq %rbx,%r11
|
||
|
+ testq $15,%rdi
|
||
|
+ cmovnzq %rsp,%rdi
|
||
|
+ testq $15,%rsi
|
||
|
+ jz .Lctr32_inp_aligned
|
||
|
+ shrq $3,%rcx
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ subq %rbx,%rdi
|
||
|
+ movq %rbx,%rcx
|
||
|
+ movq %rdi,%rsi
|
||
|
+.Lctr32_inp_aligned:
|
||
|
+ leaq -16(%rdx),%rax
|
||
|
+ leaq 16(%rdx),%rbx
|
||
|
+ shrq $4,%rcx
|
||
|
+.byte 0xf3,0x0f,0xa7,216
|
||
|
+ movl -4(%rdx),%eax
|
||
|
+ testl $4294901760,%eax
|
||
|
+ jnz .Lctr32_no_carry
|
||
|
+ bswapl %eax
|
||
|
+ addl $65536,%eax
|
||
|
+ bswapl %eax
|
||
|
+ movl %eax,-4(%rdx)
|
||
|
+.Lctr32_no_carry:
|
||
|
+ movq %r8,%rdi
|
||
|
+ movq %r11,%rbx
|
||
|
+ testq $15,%rdi
|
||
|
+ jz .Lctr32_out_aligned
|
||
|
+ movq %rbx,%rcx
|
||
|
+ leaq (%rsp),%rsi
|
||
|
+ shrq $3,%rcx
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ subq %rbx,%rdi
|
||
|
+.Lctr32_out_aligned:
|
||
|
+ movq %r9,%rsi
|
||
|
+ movq %r10,%rcx
|
||
|
+ addq %rbx,%rdi
|
||
|
+ addq %rbx,%rsi
|
||
|
+ subq %rbx,%rcx
|
||
|
+ movq $512,%rbx
|
||
|
+ jz .Lctr32_break
|
||
|
+ cmpq %rbx,%rcx
|
||
|
+ jae .Lctr32_loop
|
||
|
+ movq %rcx,%rbx
|
||
|
+ movq %rsi,%rax
|
||
|
+ cmpq %rsp,%rbp
|
||
|
+ cmoveq %rdi,%rax
|
||
|
+ addq %rcx,%rax
|
||
|
+ negq %rax
|
||
|
+ andq $4095,%rax
|
||
|
+ cmpq $32,%rax
|
||
|
+ movq $-32,%rax
|
||
|
+ cmovaeq %rbx,%rax
|
||
|
+ andq %rax,%rbx
|
||
|
+ jnz .Lctr32_loop
|
||
|
+.Lctr32_unaligned_tail:
|
||
|
+ xorl %eax,%eax
|
||
|
+ cmpq %rsp,%rbp
|
||
|
+ cmoveq %rcx,%rax
|
||
|
+ movq %rdi,%r8
|
||
|
+ movq %rcx,%rbx
|
||
|
+ subq %rax,%rsp
|
||
|
+ shrq $3,%rcx
|
||
|
+ leaq (%rsp),%rdi
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ movq %rsp,%rsi
|
||
|
+ movq %r8,%rdi
|
||
|
+ movq %rbx,%rcx
|
||
|
+ jmp .Lctr32_loop
|
||
|
+.align 16
|
||
|
+.Lctr32_break:
|
||
|
+ cmpq %rbp,%rsp
|
||
|
+ je .Lctr32_done
|
||
|
+
|
||
|
+ pxor %xmm0,%xmm0
|
||
|
+ leaq (%rsp),%rax
|
||
|
+.Lctr32_bzero:
|
||
|
+ movaps %xmm0,(%rax)
|
||
|
+ leaq 16(%rax),%rax
|
||
|
+ cmpq %rax,%rbp
|
||
|
+ ja .Lctr32_bzero
|
||
|
+
|
||
|
+.Lctr32_done:
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ jmp .Lctr32_exit
|
||
|
+
|
||
|
+.align 16
|
||
|
+.Lctr32_aligned:
|
||
|
+ movl -4(%rdx),%eax
|
||
|
+ bswapl %eax
|
||
|
+ negl %eax
|
||
|
+ andl $65535,%eax
|
||
|
+ movq $1048576,%rbx
|
||
|
+ shll $4,%eax
|
||
|
+ cmovzq %rbx,%rax
|
||
|
+ cmpq %rax,%rcx
|
||
|
+ cmovaq %rax,%rbx
|
||
|
+ cmovbeq %rcx,%rbx
|
||
|
+ jbe .Lctr32_aligned_skip
|
||
|
+
|
||
|
+.Lctr32_aligned_loop:
|
||
|
+ movq %rcx,%r10
|
||
|
+ movq %rbx,%rcx
|
||
|
+ movq %rbx,%r11
|
||
|
+
|
||
|
+ leaq -16(%rdx),%rax
|
||
|
+ leaq 16(%rdx),%rbx
|
||
|
+ shrq $4,%rcx
|
||
|
+.byte 0xf3,0x0f,0xa7,216
|
||
|
+
|
||
|
+ movl -4(%rdx),%eax
|
||
|
+ bswapl %eax
|
||
|
+ addl $65536,%eax
|
||
|
+ bswapl %eax
|
||
|
+ movl %eax,-4(%rdx)
|
||
|
+
|
||
|
+ movq %r10,%rcx
|
||
|
+ subq %r11,%rcx
|
||
|
+ movq $1048576,%rbx
|
||
|
+ jz .Lctr32_exit
|
||
|
+ cmpq %rbx,%rcx
|
||
|
+ jae .Lctr32_aligned_loop
|
||
|
+
|
||
|
+.Lctr32_aligned_skip:
|
||
|
+ leaq (%rsi,%rcx,1),%rbp
|
||
|
+ negq %rbp
|
||
|
+ andq $4095,%rbp
|
||
|
+ xorl %eax,%eax
|
||
|
+ cmpq $32,%rbp
|
||
|
+ movq $32-1,%rbp
|
||
|
+ cmovaeq %rax,%rbp
|
||
|
+ andq %rcx,%rbp
|
||
|
+ subq %rbp,%rcx
|
||
|
+ jz .Lctr32_aligned_tail
|
||
|
+ leaq -16(%rdx),%rax
|
||
|
+ leaq 16(%rdx),%rbx
|
||
|
+ shrq $4,%rcx
|
||
|
+.byte 0xf3,0x0f,0xa7,216
|
||
|
+ testq %rbp,%rbp
|
||
|
+ jz .Lctr32_exit
|
||
|
+
|
||
|
+.Lctr32_aligned_tail:
|
||
|
+ movq %rdi,%r8
|
||
|
+ movq %rbp,%rbx
|
||
|
+ movq %rbp,%rcx
|
||
|
+ leaq (%rsp),%rbp
|
||
|
+ subq %rcx,%rsp
|
||
|
+ shrq $3,%rcx
|
||
|
+ leaq (%rsp),%rdi
|
||
|
+.byte 0xf3,0x48,0xa5
|
||
|
+ leaq (%r8),%rdi
|
||
|
+ leaq (%rsp),%rsi
|
||
|
+ movq %rbx,%rcx
|
||
|
+ jmp .Lctr32_loop
|
||
|
+.Lctr32_exit:
|
||
|
+ movl $1,%eax
|
||
|
+ leaq 8(%rsp),%rsp
|
||
|
+.Lctr32_abort:
|
||
|
+ popq %rbx
|
||
|
+ popq %rbp
|
||
|
+ .byte 0xf3,0xc3
|
||
|
+.size padlock_ctr32_encrypt,.-padlock_ctr32_encrypt
|
||
|
.byte 86,73,65,32,80,97,100,108,111,99,107,32,120,56,54,95,54,52,32,109,111,100,117,108,101,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
|
||
|
.align 16
|
||
|
.data
|
||
|
diff --git a/lib/accelerated/x86/elf/padlock-x86.s b/lib/accelerated/x86/elf/padlock-x86.s
|
||
|
index ea982ec..2199255 100644
|
||
|
--- a/lib/accelerated/x86/elf/padlock-x86.s
|
||
|
+++ b/lib/accelerated/x86/elf/padlock-x86.s
|
||
|
@@ -187,16 +187,14 @@ padlock_ecb_encrypt:
|
||
|
leal 16(%edx),%edx
|
||
|
xorl %eax,%eax
|
||
|
xorl %ebx,%ebx
|
||
|
- cmpl $128,%ecx
|
||
|
- jbe .L006ecb_short
|
||
|
testl $32,(%edx)
|
||
|
- jnz .L007ecb_aligned
|
||
|
+ jnz .L006ecb_aligned
|
||
|
testl $15,%edi
|
||
|
setz %al
|
||
|
testl $15,%esi
|
||
|
setz %bl
|
||
|
testl %ebx,%eax
|
||
|
- jnz .L007ecb_aligned
|
||
|
+ jnz .L006ecb_aligned
|
||
|
negl %eax
|
||
|
movl $512,%ebx
|
||
|
notl %eax
|
||
|
@@ -208,10 +206,28 @@ padlock_ecb_encrypt:
|
||
|
negl %eax
|
||
|
andl $511,%ebx
|
||
|
leal (%eax,%ebp,1),%esp
|
||
|
+ movl $512,%eax
|
||
|
+ cmovzl %eax,%ebx
|
||
|
+ movl %ebp,%eax
|
||
|
+ andl $-16,%ebp
|
||
|
andl $-16,%esp
|
||
|
- jmp .L008ecb_loop
|
||
|
+ movl %eax,16(%ebp)
|
||
|
+ cmpl %ebx,%ecx
|
||
|
+ ja .L007ecb_loop
|
||
|
+ movl %esi,%eax
|
||
|
+ cmpl %esp,%ebp
|
||
|
+ cmovel %edi,%eax
|
||
|
+ addl %ecx,%eax
|
||
|
+ negl %eax
|
||
|
+ andl $4095,%eax
|
||
|
+ cmpl $128,%eax
|
||
|
+ movl $-128,%eax
|
||
|
+ cmovael %ebx,%eax
|
||
|
+ andl %eax,%ebx
|
||
|
+ jz .L008ecb_unaligned_tail
|
||
|
+ jmp .L007ecb_loop
|
||
|
.align 16
|
||
|
-.L008ecb_loop:
|
||
|
+.L007ecb_loop:
|
||
|
movl %edi,(%ebp)
|
||
|
movl %esi,4(%ebp)
|
||
|
movl %ecx,8(%ebp)
|
||
|
@@ -236,8 +252,8 @@ padlock_ecb_encrypt:
|
||
|
testl $15,%edi
|
||
|
jz .L010ecb_out_aligned
|
||
|
movl %ebx,%ecx
|
||
|
- shrl $2,%ecx
|
||
|
leal (%esp),%esi
|
||
|
+ shrl $2,%ecx
|
||
|
.byte 243,165
|
||
|
subl %ebx,%edi
|
||
|
.L010ecb_out_aligned:
|
||
|
@@ -247,43 +263,75 @@ padlock_ecb_encrypt:
|
||
|
addl %ebx,%esi
|
||
|
subl %ebx,%ecx
|
||
|
movl $512,%ebx
|
||
|
- jnz .L008ecb_loop
|
||
|
+ jz .L011ecb_break
|
||
|
+ cmpl %ebx,%ecx
|
||
|
+ jae .L007ecb_loop
|
||
|
+.L008ecb_unaligned_tail:
|
||
|
+ xorl %eax,%eax
|
||
|
cmpl %ebp,%esp
|
||
|
- je .L011ecb_done
|
||
|
+ cmovel %ecx,%eax
|
||
|
+ subl %eax,%esp
|
||
|
+ movl %edi,%eax
|
||
|
+ movl %ecx,%ebx
|
||
|
+ shrl $2,%ecx
|
||
|
+ leal (%esp),%edi
|
||
|
+.byte 243,165
|
||
|
+ movl %esp,%esi
|
||
|
+ movl %eax,%edi
|
||
|
+ movl %ebx,%ecx
|
||
|
+ jmp .L007ecb_loop
|
||
|
+.align 16
|
||
|
+.L011ecb_break:
|
||
|
+ cmpl %ebp,%esp
|
||
|
+ je .L012ecb_done
|
||
|
pxor %xmm0,%xmm0
|
||
|
leal (%esp),%eax
|
||
|
-.L012ecb_bzero:
|
||
|
+.L013ecb_bzero:
|
||
|
movaps %xmm0,(%eax)
|
||
|
leal 16(%eax),%eax
|
||
|
cmpl %eax,%ebp
|
||
|
- ja .L012ecb_bzero
|
||
|
-.L011ecb_done:
|
||
|
+ ja .L013ecb_bzero
|
||
|
+.L012ecb_done:
|
||
|
+ movl 16(%ebp),%ebp
|
||
|
leal 24(%ebp),%esp
|
||
|
- jmp .L013ecb_exit
|
||
|
+ jmp .L014ecb_exit
|
||
|
.align 16
|
||
|
-.L006ecb_short:
|
||
|
+.L006ecb_aligned:
|
||
|
+ leal (%esi,%ecx,1),%ebp
|
||
|
+ negl %ebp
|
||
|
+ andl $4095,%ebp
|
||
|
xorl %eax,%eax
|
||
|
- leal -24(%esp),%ebp
|
||
|
- subl %ecx,%eax
|
||
|
- leal (%eax,%ebp,1),%esp
|
||
|
- andl $-16,%esp
|
||
|
- xorl %ebx,%ebx
|
||
|
-.L014ecb_short_copy:
|
||
|
- movups (%esi,%ebx,1),%xmm0
|
||
|
- leal 16(%ebx),%ebx
|
||
|
- cmpl %ebx,%ecx
|
||
|
- movaps %xmm0,-16(%esp,%ebx,1)
|
||
|
- ja .L014ecb_short_copy
|
||
|
- movl %esp,%esi
|
||
|
- movl %ecx,%ebx
|
||
|
- jmp .L008ecb_loop
|
||
|
-.align 16
|
||
|
-.L007ecb_aligned:
|
||
|
+ cmpl $128,%ebp
|
||
|
+ movl $127,%ebp
|
||
|
+ cmovael %eax,%ebp
|
||
|
+ andl %ecx,%ebp
|
||
|
+ subl %ebp,%ecx
|
||
|
+ jz .L015ecb_aligned_tail
|
||
|
leal -16(%edx),%eax
|
||
|
leal 16(%edx),%ebx
|
||
|
shrl $4,%ecx
|
||
|
.byte 243,15,167,200
|
||
|
-.L013ecb_exit:
|
||
|
+ testl %ebp,%ebp
|
||
|
+ jz .L014ecb_exit
|
||
|
+.L015ecb_aligned_tail:
|
||
|
+ movl %ebp,%ecx
|
||
|
+ leal -24(%esp),%ebp
|
||
|
+ movl %ebp,%esp
|
||
|
+ movl %ebp,%eax
|
||
|
+ subl %ecx,%esp
|
||
|
+ andl $-16,%ebp
|
||
|
+ andl $-16,%esp
|
||
|
+ movl %eax,16(%ebp)
|
||
|
+ movl %edi,%eax
|
||
|
+ movl %ecx,%ebx
|
||
|
+ shrl $2,%ecx
|
||
|
+ leal (%esp),%edi
|
||
|
+.byte 243,165
|
||
|
+ movl %esp,%esi
|
||
|
+ movl %eax,%edi
|
||
|
+ movl %ebx,%ecx
|
||
|
+ jmp .L007ecb_loop
|
||
|
+.L014ecb_exit:
|
||
|
movl $1,%eax
|
||
|
leal 4(%esp),%esp
|
||
|
.L004ecb_abort:
|
||
|
@@ -307,19 +355,17 @@ padlock_cbc_encrypt:
|
||
|
movl 28(%esp),%edx
|
||
|
movl 32(%esp),%ecx
|
||
|
testl $15,%edx
|
||
|
- jnz .L015cbc_abort
|
||
|
+ jnz .L016cbc_abort
|
||
|
testl $15,%ecx
|
||
|
- jnz .L015cbc_abort
|
||
|
- leal .Lpadlock_saved_context-.L016cbc_pic_point,%eax
|
||
|
+ jnz .L016cbc_abort
|
||
|
+ leal .Lpadlock_saved_context-.L017cbc_pic_point,%eax
|
||
|
pushfl
|
||
|
cld
|
||
|
call _padlock_verify_ctx
|
||
|
-.L016cbc_pic_point:
|
||
|
+.L017cbc_pic_point:
|
||
|
leal 16(%edx),%edx
|
||
|
xorl %eax,%eax
|
||
|
xorl %ebx,%ebx
|
||
|
- cmpl $64,%ecx
|
||
|
- jbe .L017cbc_short
|
||
|
testl $32,(%edx)
|
||
|
jnz .L018cbc_aligned
|
||
|
testl $15,%edi
|
||
|
@@ -339,7 +385,25 @@ padlock_cbc_encrypt:
|
||
|
negl %eax
|
||
|
andl $511,%ebx
|
||
|
leal (%eax,%ebp,1),%esp
|
||
|
+ movl $512,%eax
|
||
|
+ cmovzl %eax,%ebx
|
||
|
+ movl %ebp,%eax
|
||
|
+ andl $-16,%ebp
|
||
|
andl $-16,%esp
|
||
|
+ movl %eax,16(%ebp)
|
||
|
+ cmpl %ebx,%ecx
|
||
|
+ ja .L019cbc_loop
|
||
|
+ movl %esi,%eax
|
||
|
+ cmpl %esp,%ebp
|
||
|
+ cmovel %edi,%eax
|
||
|
+ addl %ecx,%eax
|
||
|
+ negl %eax
|
||
|
+ andl $4095,%eax
|
||
|
+ cmpl $64,%eax
|
||
|
+ movl $-64,%eax
|
||
|
+ cmovael %ebx,%eax
|
||
|
+ andl %eax,%ebx
|
||
|
+ jz .L020cbc_unaligned_tail
|
||
|
jmp .L019cbc_loop
|
||
|
.align 16
|
||
|
.L019cbc_loop:
|
||
|
@@ -351,13 +415,13 @@ padlock_cbc_encrypt:
|
||
|
testl $15,%edi
|
||
|
cmovnzl %esp,%edi
|
||
|
testl $15,%esi
|
||
|
- jz .L020cbc_inp_aligned
|
||
|
+ jz .L021cbc_inp_aligned
|
||
|
shrl $2,%ecx
|
||
|
.byte 243,165
|
||
|
subl %ebx,%edi
|
||
|
movl %ebx,%ecx
|
||
|
movl %edi,%esi
|
||
|
-.L020cbc_inp_aligned:
|
||
|
+.L021cbc_inp_aligned:
|
||
|
leal -16(%edx),%eax
|
||
|
leal 16(%edx),%ebx
|
||
|
shrl $4,%ecx
|
||
|
@@ -367,67 +431,450 @@ padlock_cbc_encrypt:
|
||
|
movl (%ebp),%edi
|
||
|
movl 12(%ebp),%ebx
|
||
|
testl $15,%edi
|
||
|
- jz .L021cbc_out_aligned
|
||
|
+ jz .L022cbc_out_aligned
|
||
|
movl %ebx,%ecx
|
||
|
- shrl $2,%ecx
|
||
|
leal (%esp),%esi
|
||
|
+ shrl $2,%ecx
|
||
|
.byte 243,165
|
||
|
subl %ebx,%edi
|
||
|
-.L021cbc_out_aligned:
|
||
|
+.L022cbc_out_aligned:
|
||
|
movl 4(%ebp),%esi
|
||
|
movl 8(%ebp),%ecx
|
||
|
addl %ebx,%edi
|
||
|
addl %ebx,%esi
|
||
|
subl %ebx,%ecx
|
||
|
movl $512,%ebx
|
||
|
- jnz .L019cbc_loop
|
||
|
+ jz .L023cbc_break
|
||
|
+ cmpl %ebx,%ecx
|
||
|
+ jae .L019cbc_loop
|
||
|
+.L020cbc_unaligned_tail:
|
||
|
+ xorl %eax,%eax
|
||
|
cmpl %ebp,%esp
|
||
|
- je .L022cbc_done
|
||
|
+ cmovel %ecx,%eax
|
||
|
+ subl %eax,%esp
|
||
|
+ movl %edi,%eax
|
||
|
+ movl %ecx,%ebx
|
||
|
+ shrl $2,%ecx
|
||
|
+ leal (%esp),%edi
|
||
|
+.byte 243,165
|
||
|
+ movl %esp,%esi
|
||
|
+ movl %eax,%edi
|
||
|
+ movl %ebx,%ecx
|
||
|
+ jmp .L019cbc_loop
|
||
|
+.align 16
|
||
|
+.L023cbc_break:
|
||
|
+ cmpl %ebp,%esp
|
||
|
+ je .L024cbc_done
|
||
|
pxor %xmm0,%xmm0
|
||
|
leal (%esp),%eax
|
||
|
-.L023cbc_bzero:
|
||
|
+.L025cbc_bzero:
|
||
|
movaps %xmm0,(%eax)
|
||
|
leal 16(%eax),%eax
|
||
|
cmpl %eax,%ebp
|
||
|
- ja .L023cbc_bzero
|
||
|
-.L022cbc_done:
|
||
|
+ ja .L025cbc_bzero
|
||
|
+.L024cbc_done:
|
||
|
+ movl 16(%ebp),%ebp
|
||
|
leal 24(%ebp),%esp
|
||
|
- jmp .L024cbc_exit
|
||
|
+ jmp .L026cbc_exit
|
||
|
.align 16
|
||
|
-.L017cbc_short:
|
||
|
+.L018cbc_aligned:
|
||
|
+ leal (%esi,%ecx,1),%ebp
|
||
|
+ negl %ebp
|
||
|
+ andl $4095,%ebp
|
||
|
xorl %eax,%eax
|
||
|
+ cmpl $64,%ebp
|
||
|
+ movl $63,%ebp
|
||
|
+ cmovael %eax,%ebp
|
||
|
+ andl %ecx,%ebp
|
||
|
+ subl %ebp,%ecx
|
||
|
+ jz .L027cbc_aligned_tail
|
||
|
+ leal -16(%edx),%eax
|
||
|
+ leal 16(%edx),%ebx
|
||
|
+ shrl $4,%ecx
|
||
|
+.byte 243,15,167,208
|
||
|
+ movaps (%eax),%xmm0
|
||
|
+ movaps %xmm0,-16(%edx)
|
||
|
+ testl %ebp,%ebp
|
||
|
+ jz .L026cbc_exit
|
||
|
+.L027cbc_aligned_tail:
|
||
|
+ movl %ebp,%ecx
|
||
|
leal -24(%esp),%ebp
|
||
|
- subl %ecx,%eax
|
||
|
+ movl %ebp,%esp
|
||
|
+ movl %ebp,%eax
|
||
|
+ subl %ecx,%esp
|
||
|
+ andl $-16,%ebp
|
||
|
+ andl $-16,%esp
|
||
|
+ movl %eax,16(%ebp)
|
||
|
+ movl %edi,%eax
|
||
|
+ movl %ecx,%ebx
|
||
|
+ shrl $2,%ecx
|
||
|
+ leal (%esp),%edi
|
||
|
+.byte 243,165
|
||
|
+ movl %esp,%esi
|
||
|
+ movl %eax,%edi
|
||
|
+ movl %ebx,%ecx
|
||
|
+ jmp .L019cbc_loop
|
||
|
+.L026cbc_exit:
|
||
|
+ movl $1,%eax
|
||
|
+ leal 4(%esp),%esp
|
||
|
+.L016cbc_abort:
|
||
|
+ popl %edi
|
||
|
+ popl %esi
|
||
|
+ popl %ebx
|
||
|
+ popl %ebp
|
||
|
+ ret
|
||
|
+.size padlock_cbc_encrypt,.-.L_padlock_cbc_encrypt_begin
|
||
|
+.globl padlock_cfb_encrypt
|
||
|
+.type padlock_cfb_encrypt,@function
|
||
|
+.align 16
|
||
|
+padlock_cfb_encrypt:
|
||
|
+.L_padlock_cfb_encrypt_begin:
|
||
|
+ pushl %ebp
|
||
|
+ pushl %ebx
|
||
|
+ pushl %esi
|
||
|
+ pushl %edi
|
||
|
+ movl 20(%esp),%edi
|
||
|
+ movl 24(%esp),%esi
|
||
|
+ movl 28(%esp),%edx
|
||
|
+ movl 32(%esp),%ecx
|
||
|
+ testl $15,%edx
|
||
|
+ jnz .L028cfb_abort
|
||
|
+ testl $15,%ecx
|
||
|
+ jnz .L028cfb_abort
|
||
|
+ leal .Lpadlock_saved_context-.L029cfb_pic_point,%eax
|
||
|
+ pushfl
|
||
|
+ cld
|
||
|
+ call _padlock_verify_ctx
|
||
|
+.L029cfb_pic_point:
|
||
|
+ leal 16(%edx),%edx
|
||
|
+ xorl %eax,%eax
|
||
|
+ xorl %ebx,%ebx
|
||
|
+ testl $32,(%edx)
|
||
|
+ jnz .L030cfb_aligned
|
||
|
+ testl $15,%edi
|
||
|
+ setz %al
|
||
|
+ testl $15,%esi
|
||
|
+ setz %bl
|
||
|
+ testl %ebx,%eax
|
||
|
+ jnz .L030cfb_aligned
|
||
|
+ negl %eax
|
||
|
+ movl $512,%ebx
|
||
|
+ notl %eax
|
||
|
+ leal -24(%esp),%ebp
|
||
|
+ cmpl %ebx,%ecx
|
||
|
+ cmovcl %ecx,%ebx
|
||
|
+ andl %ebx,%eax
|
||
|
+ movl %ecx,%ebx
|
||
|
+ negl %eax
|
||
|
+ andl $511,%ebx
|
||
|
leal (%eax,%ebp,1),%esp
|
||
|
+ movl $512,%eax
|
||
|
+ cmovzl %eax,%ebx
|
||
|
+ movl %ebp,%eax
|
||
|
+ andl $-16,%ebp
|
||
|
andl $-16,%esp
|
||
|
+ movl %eax,16(%ebp)
|
||
|
+ jmp .L031cfb_loop
|
||
|
+.align 16
|
||
|
+.L031cfb_loop:
|
||
|
+ movl %edi,(%ebp)
|
||
|
+ movl %esi,4(%ebp)
|
||
|
+ movl %ecx,8(%ebp)
|
||
|
+ movl %ebx,%ecx
|
||
|
+ movl %ebx,12(%ebp)
|
||
|
+ testl $15,%edi
|
||
|
+ cmovnzl %esp,%edi
|
||
|
+ testl $15,%esi
|
||
|
+ jz .L032cfb_inp_aligned
|
||
|
+ shrl $2,%ecx
|
||
|
+.byte 243,165
|
||
|
+ subl %ebx,%edi
|
||
|
+ movl %ebx,%ecx
|
||
|
+ movl %edi,%esi
|
||
|
+.L032cfb_inp_aligned:
|
||
|
+ leal -16(%edx),%eax
|
||
|
+ leal 16(%edx),%ebx
|
||
|
+ shrl $4,%ecx
|
||
|
+.byte 243,15,167,224
|
||
|
+ movaps (%eax),%xmm0
|
||
|
+ movaps %xmm0,-16(%edx)
|
||
|
+ movl (%ebp),%edi
|
||
|
+ movl 12(%ebp),%ebx
|
||
|
+ testl $15,%edi
|
||
|
+ jz .L033cfb_out_aligned
|
||
|
+ movl %ebx,%ecx
|
||
|
+ leal (%esp),%esi
|
||
|
+ shrl $2,%ecx
|
||
|
+.byte 243,165
|
||
|
+ subl %ebx,%edi
|
||
|
+.L033cfb_out_aligned:
|
||
|
+ movl 4(%ebp),%esi
|
||
|
+ movl 8(%ebp),%ecx
|
||
|
+ addl %ebx,%edi
|
||
|
+ addl %ebx,%esi
|
||
|
+ subl %ebx,%ecx
|
||
|
+ movl $512,%ebx
|
||
|
+ jnz .L031cfb_loop
|
||
|
+ cmpl %ebp,%esp
|
||
|
+ je .L034cfb_done
|
||
|
+ pxor %xmm0,%xmm0
|
||
|
+ leal (%esp),%eax
|
||
|
+.L035cfb_bzero:
|
||
|
+ movaps %xmm0,(%eax)
|
||
|
+ leal 16(%eax),%eax
|
||
|
+ cmpl %eax,%ebp
|
||
|
+ ja .L035cfb_bzero
|
||
|
+.L034cfb_done:
|
||
|
+ movl 16(%ebp),%ebp
|
||
|
+ leal 24(%ebp),%esp
|
||
|
+ jmp .L036cfb_exit
|
||
|
+.align 16
|
||
|
+.L030cfb_aligned:
|
||
|
+ leal -16(%edx),%eax
|
||
|
+ leal 16(%edx),%ebx
|
||
|
+ shrl $4,%ecx
|
||
|
+.byte 243,15,167,224
|
||
|
+ movaps (%eax),%xmm0
|
||
|
+ movaps %xmm0,-16(%edx)
|
||
|
+.L036cfb_exit:
|
||
|
+ movl $1,%eax
|
||
|
+ leal 4(%esp),%esp
|
||
|
+.L028cfb_abort:
|
||
|
+ popl %edi
|
||
|
+ popl %esi
|
||
|
+ popl %ebx
|
||
|
+ popl %ebp
|
||
|
+ ret
|
||
|
+.size padlock_cfb_encrypt,.-.L_padlock_cfb_encrypt_begin
|
||
|
+.globl padlock_ofb_encrypt
|
||
|
+.type padlock_ofb_encrypt,@function
|
||
|
+.align 16
|
||
|
+padlock_ofb_encrypt:
|
||
|
+.L_padlock_ofb_encrypt_begin:
|
||
|
+ pushl %ebp
|
||
|
+ pushl %ebx
|
||
|
+ pushl %esi
|
||
|
+ pushl %edi
|
||
|
+ movl 20(%esp),%edi
|
||
|
+ movl 24(%esp),%esi
|
||
|
+ movl 28(%esp),%edx
|
||
|
+ movl 32(%esp),%ecx
|
||
|
+ testl $15,%edx
|
||
|
+ jnz .L037ofb_abort
|
||
|
+ testl $15,%ecx
|
||
|
+ jnz .L037ofb_abort
|
||
|
+ leal .Lpadlock_saved_context-.L038ofb_pic_point,%eax
|
||
|
+ pushfl
|
||
|
+ cld
|
||
|
+ call _padlock_verify_ctx
|
||
|
+.L038ofb_pic_point:
|
||
|
+ leal 16(%edx),%edx
|
||
|
+ xorl %eax,%eax
|
||
|
xorl %ebx,%ebx
|
||
|
-.L025cbc_short_copy:
|
||
|
- movups (%esi,%ebx,1),%xmm0
|
||
|
- leal 16(%ebx),%ebx
|
||
|
+ testl $32,(%edx)
|
||
|
+ jnz .L039ofb_aligned
|
||
|
+ testl $15,%edi
|
||
|
+ setz %al
|
||
|
+ testl $15,%esi
|
||
|
+ setz %bl
|
||
|
+ testl %ebx,%eax
|
||
|
+ jnz .L039ofb_aligned
|
||
|
+ negl %eax
|
||
|
+ movl $512,%ebx
|
||
|
+ notl %eax
|
||
|
+ leal -24(%esp),%ebp
|
||
|
cmpl %ebx,%ecx
|
||
|
- movaps %xmm0,-16(%esp,%ebx,1)
|
||
|
- ja .L025cbc_short_copy
|
||
|
- movl %esp,%esi
|
||
|
+ cmovcl %ecx,%ebx
|
||
|
+ andl %ebx,%eax
|
||
|
movl %ecx,%ebx
|
||
|
- jmp .L019cbc_loop
|
||
|
+ negl %eax
|
||
|
+ andl $511,%ebx
|
||
|
+ leal (%eax,%ebp,1),%esp
|
||
|
+ movl $512,%eax
|
||
|
+ cmovzl %eax,%ebx
|
||
|
+ movl %ebp,%eax
|
||
|
+ andl $-16,%ebp
|
||
|
+ andl $-16,%esp
|
||
|
+ movl %eax,16(%ebp)
|
||
|
+ jmp .L040ofb_loop
|
||
|
.align 16
|
||
|
-.L018cbc_aligned:
|
||
|
+.L040ofb_loop:
|
||
|
+ movl %edi,(%ebp)
|
||
|
+ movl %esi,4(%ebp)
|
||
|
+ movl %ecx,8(%ebp)
|
||
|
+ movl %ebx,%ecx
|
||
|
+ movl %ebx,12(%ebp)
|
||
|
+ testl $15,%edi
|
||
|
+ cmovnzl %esp,%edi
|
||
|
+ testl $15,%esi
|
||
|
+ jz .L041ofb_inp_aligned
|
||
|
+ shrl $2,%ecx
|
||
|
+.byte 243,165
|
||
|
+ subl %ebx,%edi
|
||
|
+ movl %ebx,%ecx
|
||
|
+ movl %edi,%esi
|
||
|
+.L041ofb_inp_aligned:
|
||
|
leal -16(%edx),%eax
|
||
|
leal 16(%edx),%ebx
|
||
|
shrl $4,%ecx
|
||
|
-.byte 243,15,167,208
|
||
|
+.byte 243,15,167,232
|
||
|
movaps (%eax),%xmm0
|
||
|
movaps %xmm0,-16(%edx)
|
||
|
-.L024cbc_exit:
|
||
|
+ movl (%ebp),%edi
|
||
|
+ movl 12(%ebp),%ebx
|
||
|
+ testl $15,%edi
|
||
|
+ jz .L042ofb_out_aligned
|
||
|
+ movl %ebx,%ecx
|
||
|
+ leal (%esp),%esi
|
||
|
+ shrl $2,%ecx
|
||
|
+.byte 243,165
|
||
|
+ subl %ebx,%edi
|
||
|
+.L042ofb_out_aligned:
|
||
|
+ movl 4(%ebp),%esi
|
||
|
+ movl 8(%ebp),%ecx
|
||
|
+ addl %ebx,%edi
|
||
|
+ addl %ebx,%esi
|
||
|
+ subl %ebx,%ecx
|
||
|
+ movl $512,%ebx
|
||
|
+ jnz .L040ofb_loop
|
||
|
+ cmpl %ebp,%esp
|
||
|
+ je .L043ofb_done
|
||
|
+ pxor %xmm0,%xmm0
|
||
|
+ leal (%esp),%eax
|
||
|
+.L044ofb_bzero:
|
||
|
+ movaps %xmm0,(%eax)
|
||
|
+ leal 16(%eax),%eax
|
||
|
+ cmpl %eax,%ebp
|
||
|
+ ja .L044ofb_bzero
|
||
|
+.L043ofb_done:
|
||
|
+ movl 16(%ebp),%ebp
|
||
|
+ leal 24(%ebp),%esp
|
||
|
+ jmp .L045ofb_exit
|
||
|
+.align 16
|
||
|
+.L039ofb_aligned:
|
||
|
+ leal -16(%edx),%eax
|
||
|
+ leal 16(%edx),%ebx
|
||
|
+ shrl $4,%ecx
|
||
|
+.byte 243,15,167,232
|
||
|
+ movaps (%eax),%xmm0
|
||
|
+ movaps %xmm0,-16(%edx)
|
||
|
+.L045ofb_exit:
|
||
|
movl $1,%eax
|
||
|
leal 4(%esp),%esp
|
||
|
-.L015cbc_abort:
|
||
|
+.L037ofb_abort:
|
||
|
popl %edi
|
||
|
popl %esi
|
||
|
popl %ebx
|
||
|
popl %ebp
|
||
|
ret
|
||
|
-.size padlock_cbc_encrypt,.-.L_padlock_cbc_encrypt_begin
|
||
|
+.size padlock_ofb_encrypt,.-.L_padlock_ofb_encrypt_begin
|
||
|
+.globl padlock_ctr32_encrypt
|
||
|
+.type padlock_ctr32_encrypt,@function
|
||
|
+.align 16
|
||
|
+padlock_ctr32_encrypt:
|
||
|
+.L_padlock_ctr32_encrypt_begin:
|
||
|
+ pushl %ebp
|
||
|
+ pushl %ebx
|
||
|
+ pushl %esi
|
||
|
+ pushl %edi
|
||
|
+ movl 20(%esp),%edi
|
||
|
+ movl 24(%esp),%esi
|
||
|
+ movl 28(%esp),%edx
|
||
|
+ movl 32(%esp),%ecx
|
||
|
+ testl $15,%edx
|
||
|
+ jnz .L046ctr32_abort
|
||
|
+ testl $15,%ecx
|
||
|
+ jnz .L046ctr32_abort
|
||
|
+ leal .Lpadlock_saved_context-.L047ctr32_pic_point,%eax
|
||
|
+ pushfl
|
||
|
+ cld
|
||
|
+ call _padlock_verify_ctx
|
||
|
+.L047ctr32_pic_point:
|
||
|
+ leal 16(%edx),%edx
|
||
|
+ xorl %eax,%eax
|
||
|
+ movq -16(%edx),%mm0
|
||
|
+ movl $512,%ebx
|
||
|
+ notl %eax
|
||
|
+ leal -24(%esp),%ebp
|
||
|
+ cmpl %ebx,%ecx
|
||
|
+ cmovcl %ecx,%ebx
|
||
|
+ andl %ebx,%eax
|
||
|
+ movl %ecx,%ebx
|
||
|
+ negl %eax
|
||
|
+ andl $511,%ebx
|
||
|
+ leal (%eax,%ebp,1),%esp
|
||
|
+ movl $512,%eax
|
||
|
+ cmovzl %eax,%ebx
|
||
|
+ movl %ebp,%eax
|
||
|
+ andl $-16,%ebp
|
||
|
+ andl $-16,%esp
|
||
|
+ movl %eax,16(%ebp)
|
||
|
+ jmp .L048ctr32_loop
|
||
|
+.align 16
|
||
|
+.L048ctr32_loop:
|
||
|
+ movl %edi,(%ebp)
|
||
|
+ movl %esi,4(%ebp)
|
||
|
+ movl %ecx,8(%ebp)
|
||
|
+ movl %ebx,%ecx
|
||
|
+ movl %ebx,12(%ebp)
|
||
|
+ movl -4(%edx),%ecx
|
||
|
+ xorl %edi,%edi
|
||
|
+ movl -8(%edx),%eax
|
||
|
+.L049ctr32_prepare:
|
||
|
+ movl %ecx,12(%esp,%edi,1)
|
||
|
+ bswap %ecx
|
||
|
+ movq %mm0,(%esp,%edi,1)
|
||
|
+ incl %ecx
|
||
|
+ movl %eax,8(%esp,%edi,1)
|
||
|
+ bswap %ecx
|
||
|
+ leal 16(%edi),%edi
|
||
|
+ cmpl %ebx,%edi
|
||
|
+ jb .L049ctr32_prepare
|
||
|
+ movl %ecx,-4(%edx)
|
||
|
+ leal (%esp),%esi
|
||
|
+ leal (%esp),%edi
|
||
|
+ movl %ebx,%ecx
|
||
|
+ leal -16(%edx),%eax
|
||
|
+ leal 16(%edx),%ebx
|
||
|
+ shrl $4,%ecx
|
||
|
+.byte 243,15,167,200
|
||
|
+ movl (%ebp),%edi
|
||
|
+ movl 12(%ebp),%ebx
|
||
|
+ movl 4(%ebp),%esi
|
||
|
+ xorl %ecx,%ecx
|
||
|
+.L050ctr32_xor:
|
||
|
+ movups (%esi,%ecx,1),%xmm1
|
||
|
+ leal 16(%ecx),%ecx
|
||
|
+ pxor -16(%esp,%ecx,1),%xmm1
|
||
|
+ movups %xmm1,-16(%edi,%ecx,1)
|
||
|
+ cmpl %ebx,%ecx
|
||
|
+ jb .L050ctr32_xor
|
||
|
+ movl 8(%ebp),%ecx
|
||
|
+ addl %ebx,%edi
|
||
|
+ addl %ebx,%esi
|
||
|
+ subl %ebx,%ecx
|
||
|
+ movl $512,%ebx
|
||
|
+ jnz .L048ctr32_loop
|
||
|
+ pxor %xmm0,%xmm0
|
||
|
+ leal (%esp),%eax
|
||
|
+.L051ctr32_bzero:
|
||
|
+ movaps %xmm0,(%eax)
|
||
|
+ leal 16(%eax),%eax
|
||
|
+ cmpl %eax,%ebp
|
||
|
+ ja .L051ctr32_bzero
|
||
|
+.L052ctr32_done:
|
||
|
+ movl 16(%ebp),%ebp
|
||
|
+ leal 24(%ebp),%esp
|
||
|
+ movl $1,%eax
|
||
|
+ leal 4(%esp),%esp
|
||
|
+ emms
|
||
|
+.L046ctr32_abort:
|
||
|
+ popl %edi
|
||
|
+ popl %esi
|
||
|
+ popl %ebx
|
||
|
+ popl %ebp
|
||
|
+ ret
|
||
|
+.size padlock_ctr32_encrypt,.-.L_padlock_ctr32_encrypt_begin
|
||
|
.globl padlock_xstore
|
||
|
.type padlock_xstore,@function
|
||
|
.align 16
|
||
|
@@ -447,10 +894,10 @@ _win32_segv_handler:
|
||
|
movl 4(%esp),%edx
|
||
|
movl 12(%esp),%ecx
|
||
|
cmpl $3221225477,(%edx)
|
||
|
- jne .L026ret
|
||
|
+ jne .L053ret
|
||
|
addl $4,184(%ecx)
|
||
|
movl $0,%eax
|
||
|
-.L026ret:
|
||
|
+.L053ret:
|
||
|
ret
|
||
|
.size _win32_segv_handler,.-_win32_segv_handler
|
||
|
.globl padlock_sha1_oneshot
|
||
|
diff --git a/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s b/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s
|
||
|
index cfac705..eac88ae 100644
|
||
|
--- a/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s
|
||
|
+++ b/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s
|
||
|
@@ -699,6 +699,7 @@ L$ghash_epilogue:
|
||
|
|
||
|
.p2align 4
|
||
|
_gcm_init_clmul:
|
||
|
+L$_init_clmul:
|
||
|
movdqu (%rsi),%xmm2
|
||
|
pshufd $78,%xmm2,%xmm2
|
||
|
|
||
|
@@ -717,15 +718,15 @@ _gcm_init_clmul:
|
||
|
pxor %xmm5,%xmm2
|
||
|
|
||
|
|
||
|
+ pshufd $78,%xmm2,%xmm6
|
||
|
movdqa %xmm2,%xmm0
|
||
|
+ pxor %xmm2,%xmm6
|
||
|
movdqa %xmm0,%xmm1
|
||
|
pshufd $78,%xmm0,%xmm3
|
||
|
- pshufd $78,%xmm2,%xmm4
|
||
|
pxor %xmm0,%xmm3
|
||
|
- pxor %xmm2,%xmm4
|
||
|
.byte 102,15,58,68,194,0
|
||
|
.byte 102,15,58,68,202,17
|
||
|
-.byte 102,15,58,68,220,0
|
||
|
+.byte 102,15,58,68,222,0
|
||
|
pxor %xmm0,%xmm3
|
||
|
pxor %xmm1,%xmm3
|
||
|
|
||
|
@@ -735,44 +736,134 @@ _gcm_init_clmul:
|
||
|
pxor %xmm3,%xmm1
|
||
|
pxor %xmm4,%xmm0
|
||
|
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
movdqa %xmm0,%xmm3
|
||
|
+ psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
psllq $1,%xmm0
|
||
|
pxor %xmm3,%xmm0
|
||
|
+ psllq $57,%xmm0
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
+ pslldq $8,%xmm0
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
+ psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ psrlq $1,%xmm0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
+ pshufd $78,%xmm2,%xmm3
|
||
|
+ pshufd $78,%xmm0,%xmm4
|
||
|
+ pxor %xmm2,%xmm3
|
||
|
+ movdqu %xmm2,0(%rdi)
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ movdqu %xmm0,16(%rdi)
|
||
|
+.byte 102,15,58,15,227,8
|
||
|
+ movdqu %xmm4,32(%rdi)
|
||
|
+ movdqa %xmm0,%xmm1
|
||
|
+ pshufd $78,%xmm0,%xmm3
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+.byte 102,15,58,68,194,0
|
||
|
+.byte 102,15,58,68,202,17
|
||
|
+.byte 102,15,58,68,222,0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ pxor %xmm1,%xmm3
|
||
|
+
|
||
|
+ movdqa %xmm3,%xmm4
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pslldq $8,%xmm4
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ psllq $1,%xmm0
|
||
|
pxor %xmm3,%xmm0
|
||
|
psllq $57,%xmm0
|
||
|
- movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
pslldq $8,%xmm0
|
||
|
- psrldq $8,%xmm4
|
||
|
- pxor %xmm3,%xmm0
|
||
|
- pxor %xmm4,%xmm1
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
|
||
|
|
||
|
movdqa %xmm0,%xmm4
|
||
|
+ psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
psrlq $5,%xmm0
|
||
|
pxor %xmm4,%xmm0
|
||
|
psrlq $1,%xmm0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
+ movdqa %xmm0,%xmm5
|
||
|
+ movdqa %xmm0,%xmm1
|
||
|
+ pshufd $78,%xmm0,%xmm3
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+.byte 102,15,58,68,194,0
|
||
|
+.byte 102,15,58,68,202,17
|
||
|
+.byte 102,15,58,68,222,0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ pxor %xmm1,%xmm3
|
||
|
+
|
||
|
+ movdqa %xmm3,%xmm4
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pslldq $8,%xmm4
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
+ psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ psllq $1,%xmm0
|
||
|
+ pxor %xmm3,%xmm0
|
||
|
+ psllq $57,%xmm0
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
+ pslldq $8,%xmm0
|
||
|
+ psrldq $8,%xmm3
|
||
|
pxor %xmm4,%xmm0
|
||
|
- pxor %xmm1,%xmm4
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+
|
||
|
+
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
pxor %xmm4,%xmm0
|
||
|
- movdqu %xmm2,(%rdi)
|
||
|
- movdqu %xmm0,16(%rdi)
|
||
|
+ psrlq $1,%xmm0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
+ pshufd $78,%xmm5,%xmm3
|
||
|
+ pshufd $78,%xmm0,%xmm4
|
||
|
+ pxor %xmm5,%xmm3
|
||
|
+ movdqu %xmm5,48(%rdi)
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ movdqu %xmm0,64(%rdi)
|
||
|
+.byte 102,15,58,15,227,8
|
||
|
+ movdqu %xmm4,80(%rdi)
|
||
|
.byte 0xf3,0xc3
|
||
|
|
||
|
.globl _gcm_gmult_clmul
|
||
|
|
||
|
.p2align 4
|
||
|
_gcm_gmult_clmul:
|
||
|
+L$_gmult_clmul:
|
||
|
movdqu (%rdi),%xmm0
|
||
|
movdqa L$bswap_mask(%rip),%xmm5
|
||
|
movdqu (%rsi),%xmm2
|
||
|
+ movdqu 32(%rsi),%xmm4
|
||
|
.byte 102,15,56,0,197
|
||
|
movdqa %xmm0,%xmm1
|
||
|
pshufd $78,%xmm0,%xmm3
|
||
|
- pshufd $78,%xmm2,%xmm4
|
||
|
pxor %xmm0,%xmm3
|
||
|
- pxor %xmm2,%xmm4
|
||
|
.byte 102,15,58,68,194,0
|
||
|
.byte 102,15,58,68,202,17
|
||
|
.byte 102,15,58,68,220,0
|
||
|
@@ -785,186 +876,358 @@ _gcm_gmult_clmul:
|
||
|
pxor %xmm3,%xmm1
|
||
|
pxor %xmm4,%xmm0
|
||
|
|
||
|
+ movdqa %xmm0,%xmm4
|
||
|
movdqa %xmm0,%xmm3
|
||
|
- psllq $1,%xmm0
|
||
|
- pxor %xmm3,%xmm0
|
||
|
psllq $5,%xmm0
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ psllq $1,%xmm0
|
||
|
pxor %xmm3,%xmm0
|
||
|
psllq $57,%xmm0
|
||
|
- movdqa %xmm0,%xmm4
|
||
|
+ movdqa %xmm0,%xmm3
|
||
|
pslldq $8,%xmm0
|
||
|
- psrldq $8,%xmm4
|
||
|
- pxor %xmm3,%xmm0
|
||
|
- pxor %xmm4,%xmm1
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
|
||
|
|
||
|
movdqa %xmm0,%xmm4
|
||
|
- psrlq $5,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
psrlq $1,%xmm0
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psrlq $5,%xmm0
|
||
|
pxor %xmm4,%xmm0
|
||
|
- pxor %xmm1,%xmm4
|
||
|
psrlq $1,%xmm0
|
||
|
- pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm1,%xmm0
|
||
|
.byte 102,15,56,0,197
|
||
|
movdqu %xmm0,(%rdi)
|
||
|
.byte 0xf3,0xc3
|
||
|
|
||
|
.globl _gcm_ghash_clmul
|
||
|
|
||
|
-.p2align 4
|
||
|
+.p2align 5
|
||
|
_gcm_ghash_clmul:
|
||
|
+L$_ghash_clmul:
|
||
|
movdqa L$bswap_mask(%rip),%xmm5
|
||
|
+ movq $11547335547999543296,%rax
|
||
|
|
||
|
movdqu (%rdi),%xmm0
|
||
|
movdqu (%rsi),%xmm2
|
||
|
+ movdqu 32(%rsi),%xmm10
|
||
|
.byte 102,15,56,0,197
|
||
|
|
||
|
subq $16,%rcx
|
||
|
jz L$odd_tail
|
||
|
|
||
|
- movdqu 16(%rsi),%xmm8
|
||
|
+ movdqu 16(%rsi),%xmm9
|
||
|
+ cmpq $48,%rcx
|
||
|
+ jb L$skip4x
|
||
|
|
||
|
+ subq $48,%rcx
|
||
|
+ movdqu 48(%rsi),%xmm14
|
||
|
+ movdqu 64(%rsi),%xmm15
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
- movdqu (%rdx),%xmm3
|
||
|
- movdqu 16(%rdx),%xmm6
|
||
|
-.byte 102,15,56,0,221
|
||
|
+ movdqu 48(%rdx),%xmm6
|
||
|
+ movdqu 32(%rdx),%xmm11
|
||
|
.byte 102,15,56,0,245
|
||
|
- pxor %xmm3,%xmm0
|
||
|
- movdqa %xmm6,%xmm7
|
||
|
- pshufd $78,%xmm6,%xmm3
|
||
|
- pshufd $78,%xmm2,%xmm4
|
||
|
- pxor %xmm6,%xmm3
|
||
|
- pxor %xmm2,%xmm4
|
||
|
+.byte 102,68,15,56,0,221
|
||
|
+ movdqa %xmm6,%xmm8
|
||
|
+ pshufd $78,%xmm6,%xmm7
|
||
|
+ pxor %xmm6,%xmm7
|
||
|
.byte 102,15,58,68,242,0
|
||
|
-.byte 102,15,58,68,250,17
|
||
|
-.byte 102,15,58,68,220,0
|
||
|
- pxor %xmm6,%xmm3
|
||
|
- pxor %xmm7,%xmm3
|
||
|
+.byte 102,68,15,58,68,194,17
|
||
|
+.byte 102,65,15,58,68,250,0
|
||
|
+
|
||
|
+ movdqa %xmm11,%xmm13
|
||
|
+ pshufd $78,%xmm11,%xmm12
|
||
|
+ pxor %xmm11,%xmm12
|
||
|
+.byte 102,69,15,58,68,217,0
|
||
|
+.byte 102,69,15,58,68,233,17
|
||
|
+ xorps %xmm11,%xmm6
|
||
|
+.byte 102,69,15,58,68,226,16
|
||
|
+ xorps %xmm13,%xmm8
|
||
|
+ movups 80(%rsi),%xmm10
|
||
|
+ xorps %xmm12,%xmm7
|
||
|
+
|
||
|
+ movdqu 16(%rdx),%xmm11
|
||
|
+ movdqu 0(%rdx),%xmm3
|
||
|
+.byte 102,68,15,56,0,221
|
||
|
+.byte 102,15,56,0,221
|
||
|
+ movdqa %xmm11,%xmm13
|
||
|
+ pshufd $78,%xmm11,%xmm12
|
||
|
+ pxor %xmm3,%xmm0
|
||
|
+ pxor %xmm11,%xmm12
|
||
|
+.byte 102,69,15,58,68,222,0
|
||
|
+ movdqa %xmm0,%xmm1
|
||
|
+ pshufd $78,%xmm0,%xmm3
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+.byte 102,69,15,58,68,238,17
|
||
|
+ xorps %xmm11,%xmm6
|
||
|
+.byte 102,69,15,58,68,226,0
|
||
|
+ xorps %xmm13,%xmm8
|
||
|
+
|
||
|
+ leaq 64(%rdx),%rdx
|
||
|
+ subq $64,%rcx
|
||
|
+ jc L$tail4x
|
||
|
+
|
||
|
+ jmp L$mod4_loop
|
||
|
+.p2align 5
|
||
|
+L$mod4_loop:
|
||
|
+.byte 102,65,15,58,68,199,0
|
||
|
+ xorps %xmm12,%xmm7
|
||
|
+ movdqu 48(%rdx),%xmm11
|
||
|
+.byte 102,68,15,56,0,221
|
||
|
+.byte 102,65,15,58,68,207,17
|
||
|
+ xorps %xmm6,%xmm0
|
||
|
+ movdqu 32(%rdx),%xmm6
|
||
|
+ movdqa %xmm11,%xmm13
|
||
|
+ pshufd $78,%xmm11,%xmm12
|
||
|
+.byte 102,65,15,58,68,218,16
|
||
|
+ xorps %xmm8,%xmm1
|
||
|
+ pxor %xmm11,%xmm12
|
||
|
+.byte 102,15,56,0,245
|
||
|
+ movups 32(%rsi),%xmm10
|
||
|
+.byte 102,68,15,58,68,218,0
|
||
|
+ xorps %xmm7,%xmm3
|
||
|
+ movdqa %xmm6,%xmm8
|
||
|
+ pshufd $78,%xmm6,%xmm7
|
||
|
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ pxor %xmm6,%xmm7
|
||
|
+ pxor %xmm1,%xmm3
|
||
|
movdqa %xmm3,%xmm4
|
||
|
- psrldq $8,%xmm3
|
||
|
+ pslldq $8,%xmm3
|
||
|
+.byte 102,68,15,58,68,234,17
|
||
|
+ psrldq $8,%xmm4
|
||
|
+ pxor %xmm3,%xmm0
|
||
|
+ movdqa L$7_mask(%rip),%xmm3
|
||
|
+ pxor %xmm4,%xmm1
|
||
|
+.byte 102,72,15,110,224
|
||
|
+
|
||
|
+ pand %xmm0,%xmm3
|
||
|
+.byte 102,15,56,0,227
|
||
|
+.byte 102,69,15,58,68,226,0
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ psllq $57,%xmm4
|
||
|
+ movdqa %xmm4,%xmm3
|
||
|
pslldq $8,%xmm4
|
||
|
- pxor %xmm3,%xmm7
|
||
|
- pxor %xmm4,%xmm6
|
||
|
+.byte 102,65,15,58,68,241,0
|
||
|
+ psrldq $8,%xmm3
|
||
|
+ pxor %xmm4,%xmm0
|
||
|
+ pxor %xmm3,%xmm1
|
||
|
+ movdqu 0(%rdx),%xmm3
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+.byte 102,69,15,58,68,193,17
+ xorps %xmm11,%xmm6
+ movdqu 16(%rdx),%xmm11
+.byte 102,68,15,56,0,221
+.byte 102,65,15,58,68,250,16
+ xorps %xmm13,%xmm8
+ movups 80(%rsi),%xmm10
+.byte 102,15,56,0,221
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+
+ movdqa %xmm11,%xmm13
+ pxor %xmm12,%xmm7
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ psrlq $1,%xmm0
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm6
+ pxor %xmm1,%xmm0
+
+.byte 102,69,15,58,68,226,0
+ xorps %xmm13,%xmm8
+
 movdqa %xmm0,%xmm1
 pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
 pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4

- leaq 32(%rdx),%rdx
- subq $32,%rcx
- jbe L$even_tail
+ leaq 64(%rdx),%rdx
+ subq $64,%rcx
+ jnc L$mod4_loop
+
+L$tail4x:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm7
+.byte 102,65,15,58,68,207,17
+ xorps %xmm6,%xmm0
+.byte 102,65,15,58,68,218,16
+ xorps %xmm8,%xmm1
+ pxor %xmm0,%xmm1
+ pxor %xmm7,%xmm3

-L$mod_loop:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
 pxor %xmm1,%xmm3
+ pxor %xmm0,%xmm1

 movdqa %xmm3,%xmm4
 psrldq $8,%xmm3
 pslldq $8,%xmm4
 pxor %xmm3,%xmm1
 pxor %xmm4,%xmm0
- movdqu (%rdx),%xmm3
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ addq $64,%rcx
+ jz L$done
+ movdqu 32(%rsi),%xmm10
+ subq $16,%rcx
+ jz L$odd_tail
+L$skip4x:
+
+
+
+
+
+ movdqu (%rdx),%xmm3
 movdqu 16(%rdx),%xmm6
.byte 102,15,56,0,221
.byte 102,15,56,0,245
+ pxor %xmm3,%xmm0
+
+ movdqa %xmm6,%xmm8
+ pshufd $78,%xmm6,%xmm3
+ pxor %xmm6,%xmm3
+.byte 102,15,58,68,242,0
+.byte 102,68,15,58,68,194,17
+.byte 102,65,15,58,68,218,0
+
+ leaq 32(%rdx),%rdx
+ subq $32,%rcx
+ jbe L$even_tail
+ jmp L$mod_loop
+
+.p2align 5
+L$mod_loop:
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,65,15,58,68,193,0
+.byte 102,65,15,58,68,201,17
+.byte 102,65,15,58,68,226,16
+
+ pxor %xmm6,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu (%rdx),%xmm8
+.byte 102,68,15,56,0,197
+ movdqu 16(%rdx),%xmm6
- movdqa %xmm6,%xmm7
- pshufd $78,%xmm6,%xmm9
- pshufd $78,%xmm2,%xmm10
- pxor %xmm6,%xmm9
- pxor %xmm2,%xmm10
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pxor %xmm8,%xmm1
+ pxor %xmm3,%xmm4
+.byte 102,15,56,0,245
+ movdqa %xmm4,%xmm3
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
 pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0

+ movdqa %xmm6,%xmm8
+
+ movdqa %xmm0,%xmm4
 movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
 psllq $5,%xmm0
- pxor %xmm3,%xmm0
.byte 102,15,58,68,242,0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
 psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
 pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ pshufd $78,%xmm8,%xmm3
+ pxor %xmm8,%xmm3
-.byte 102,15,58,68,250,17
+.byte 102,68,15,58,68,194,17
 movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
 psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
 pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
 psrlq $1,%xmm0
- pxor %xmm4,%xmm0
-
-.byte 102,69,15,58,68,202,0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm8,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm8,%xmm4
-
- pxor %xmm6,%xmm9
- pxor %xmm7,%xmm9
- movdqa %xmm9,%xmm10
- psrldq $8,%xmm9
- pslldq $8,%xmm10
- pxor %xmm9,%xmm7
- pxor %xmm10,%xmm6
+.byte 102,65,15,58,68,218,0
+ pxor %xmm1,%xmm0

 leaq 32(%rdx),%rdx
 subq $32,%rcx
 ja L$mod_loop
L$even_tail:
-.byte 102,65,15,58,68,192,0
-.byte 102,65,15,58,68,200,17
-.byte 102,15,58,68,220,0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,65,15,58,68,193,0
+.byte 102,65,15,58,68,201,17
+.byte 102,65,15,58,68,226,16
+
+ pxor %xmm6,%xmm0
+ pxor %xmm8,%xmm1
 pxor %xmm0,%xmm3
 pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
+ pxor %xmm3,%xmm4
+ movdqa %xmm4,%xmm3
 psrldq $8,%xmm3
 pslldq $8,%xmm4
 pxor %xmm3,%xmm1
 pxor %xmm4,%xmm0
- pxor %xmm6,%xmm0
- pxor %xmm7,%xmm1

+ movdqa %xmm0,%xmm4
 movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
 psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
 pxor %xmm3,%xmm0
 psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
 pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1

 movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
 psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
 pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
 psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
 testq %rcx,%rcx
 jnz L$done

@@ -974,12 +1237,10 @@ L$odd_tail:
 pxor %xmm3,%xmm0
 movdqa %xmm0,%xmm1
 pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
 pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
+.byte 102,65,15,58,68,218,0
 pxor %xmm0,%xmm3
 pxor %xmm1,%xmm3

@@ -989,38 +1250,60 @@ L$odd_tail:
 pxor %xmm3,%xmm1
 pxor %xmm4,%xmm0

+ movdqa %xmm0,%xmm4
 movdqa %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
 psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
 pxor %xmm3,%xmm0
 psllq $57,%xmm0
- movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
 pslldq $8,%xmm0
- psrldq $8,%xmm4
- pxor %xmm3,%xmm0
- pxor %xmm4,%xmm1
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1


 movdqa %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
 psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
 pxor %xmm4,%xmm0
- pxor %xmm1,%xmm4
 psrlq $1,%xmm0
- pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm0
L$done:
.byte 102,15,56,0,197
 movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
-L$SEH_end_gcm_ghash_clmul:
+
+.globl _gcm_init_avx
+
+.p2align 5
+_gcm_init_avx:
+ jmp L$_init_clmul
+
+.globl _gcm_gmult_avx
+
+.p2align 5
+_gcm_gmult_avx:
+ jmp L$_gmult_clmul
+
+.globl _gcm_ghash_avx
+
+.p2align 5
+_gcm_ghash_avx:
+ jmp L$_ghash_clmul

.p2align 6
L$bswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
L$0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+L$7_mask:
+.long 7,0,7,0
+L$7_mask_poly:
+.long 7,0,450,0
.p2align 6

L$rem_4bit:
diff --git a/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s b/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s
index a82f0a5..e2cfa17 100644
--- a/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s
+++ b/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s
@@ -927,199 +927,412 @@ L$oop_enc1_6:

.p2align 4
_aesni_ctr32_encrypt_blocks:
+ leaq (%rsp),%rax
|
||
|
+ pushq %rbp
|
||
|
+ subq $128,%rsp
|
||
|
+ andq $-16,%rsp
|
||
|
+ leaq -8(%rax),%rbp
|
||
|
+
|
||
|
cmpq $1,%rdx
|
||
|
je L$ctr32_one_shortcut
|
||
|
|
||
|
- movdqu (%r8),%xmm14
|
||
|
- movdqa L$bswap_mask(%rip),%xmm15
|
||
|
- xorl %eax,%eax
|
||
|
-.byte 102,69,15,58,22,242,3
|
||
|
-.byte 102,68,15,58,34,240,3
|
||
|
+ movdqu (%r8),%xmm2
|
||
|
+ movdqu (%rcx),%xmm0
|
||
|
+ movl 12(%r8),%r8d
|
||
|
+ pxor %xmm0,%xmm2
|
||
|
+ movl 12(%rcx),%r11d
|
||
|
+ movdqa %xmm2,0(%rsp)
|
||
|
+ bswapl %r8d
|
||
|
+ movdqa %xmm2,%xmm3
|
||
|
+ movdqa %xmm2,%xmm4
|
||
|
+ movdqa %xmm2,%xmm5
|
||
|
+ movdqa %xmm2,64(%rsp)
|
||
|
+ movdqa %xmm2,80(%rsp)
|
||
|
+ movdqa %xmm2,96(%rsp)
|
||
|
+ movdqa %xmm2,112(%rsp)
|
||
|
|
||
|
movl 240(%rcx),%eax
|
||
|
+
|
||
|
+ leaq 1(%r8),%r9
|
||
|
+ leaq 2(%r8),%r10
|
||
|
+ bswapl %r9d
|
||
|
bswapl %r10d
|
||
|
- pxor %xmm12,%xmm12
|
||
|
- pxor %xmm13,%xmm13
|
||
|
-.byte 102,69,15,58,34,226,0
|
||
|
- leaq 3(%r10),%r11
|
||
|
-.byte 102,69,15,58,34,235,0
|
||
|
- incl %r10d
|
||
|
-.byte 102,69,15,58,34,226,1
|
||
|
- incq %r11
|
||
|
-.byte 102,69,15,58,34,235,1
|
||
|
- incl %r10d
|
||
|
-.byte 102,69,15,58,34,226,2
|
||
|
- incq %r11
|
||
|
-.byte 102,69,15,58,34,235,2
|
||
|
- movdqa %xmm12,-40(%rsp)
|
||
|
-.byte 102,69,15,56,0,231
|
||
|
- movdqa %xmm13,-24(%rsp)
|
||
|
-.byte 102,69,15,56,0,239
|
||
|
-
|
||
|
- pshufd $192,%xmm12,%xmm2
|
||
|
- pshufd $128,%xmm12,%xmm3
|
||
|
- pshufd $64,%xmm12,%xmm4
|
||
|
- cmpq $6,%rdx
|
||
|
- jb L$ctr32_tail
|
||
|
- shrl $1,%eax
|
||
|
- movq %rcx,%r11
|
||
|
- movl %eax,%r10d
|
||
|
- subq $6,%rdx
|
||
|
- jmp L$ctr32_loop6
|
||
|
+ xorl %r11d,%r9d
|
||
|
+ xorl %r11d,%r10d
|
||
|
+.byte 102,65,15,58,34,217,3
|
||
|
+ leaq 3(%r8),%r9
|
||
|
+ movdqa %xmm3,16(%rsp)
|
||
|
+.byte 102,65,15,58,34,226,3
|
||
|
+ bswapl %r9d
|
||
|
+ leaq 4(%r8),%r10
|
||
|
+ movdqa %xmm4,32(%rsp)
|
||
|
+ xorl %r11d,%r9d
|
||
|
+ bswapl %r10d
|
||
|
+.byte 102,65,15,58,34,233,3
|
||
|
+ xorl %r11d,%r10d
|
||
|
+ movdqa %xmm5,48(%rsp)
|
||
|
+ leaq 5(%r8),%r9
|
||
|
+ movl %r10d,64+12(%rsp)
|
||
|
+ bswapl %r9d
|
||
|
+ leaq 6(%r8),%r10
|
||
|
+ xorl %r11d,%r9d
|
||
|
+ bswapl %r10d
|
||
|
+ movl %r9d,80+12(%rsp)
|
||
|
+ xorl %r11d,%r10d
|
||
|
+ leaq 7(%r8),%r9
|
||
|
+ movl %r10d,96+12(%rsp)
|
||
|
+ bswapl %r9d
|
||
|
+ xorl %r11d,%r9d
|
||
|
+ movl %r9d,112+12(%rsp)
|
||
|
|
||
|
-.p2align 4
|
||
|
-L$ctr32_loop6:
|
||
|
- pshufd $192,%xmm13,%xmm5
|
||
|
- por %xmm14,%xmm2
|
||
|
- movups (%r11),%xmm0
|
||
|
- pshufd $128,%xmm13,%xmm6
|
||
|
- por %xmm14,%xmm3
|
||
|
- movups 16(%r11),%xmm1
|
||
|
- pshufd $64,%xmm13,%xmm7
|
||
|
- por %xmm14,%xmm4
|
||
|
- por %xmm14,%xmm5
|
||
|
- xorps %xmm0,%xmm2
|
||
|
- por %xmm14,%xmm6
|
||
|
- por %xmm14,%xmm7
|
||
|
+ movups 16(%rcx),%xmm1
|
||
|
|
||
|
+ movdqa 64(%rsp),%xmm6
|
||
|
+ movdqa 80(%rsp),%xmm7
|
||
|
|
||
|
+ cmpq $8,%rdx
|
||
|
+ jb L$ctr32_tail
|
||
|
|
||
|
+ leaq 128(%rcx),%rcx
|
||
|
+ subq $8,%rdx
|
||
|
+ jmp L$ctr32_loop8
|
||
|
|
||
|
- pxor %xmm0,%xmm3
|
||
|
+.p2align 5
|
||
|
+L$ctr32_loop8:
|
||
|
+ addl $8,%r8d
|
||
|
+ movdqa 96(%rsp),%xmm8
|
||
|
.byte 102,15,56,220,209
|
||
|
- leaq 32(%r11),%rcx
|
||
|
- pxor %xmm0,%xmm4
|
||
|
+ movl %r8d,%r9d
|
||
|
+ movdqa 112(%rsp),%xmm9
|
||
|
.byte 102,15,56,220,217
|
||
|
- movdqa L$increment32(%rip),%xmm13
|
||
|
- pxor %xmm0,%xmm5
|
||
|
+ bswapl %r9d
|
||
|
+ movups 32-128(%rcx),%xmm0
|
||
|
.byte 102,15,56,220,225
|
||
|
- movdqa -40(%rsp),%xmm12
|
||
|
- pxor %xmm0,%xmm6
|
||
|
+ xorl %r11d,%r9d
|
||
|
.byte 102,15,56,220,233
|
||
|
- pxor %xmm0,%xmm7
|
||
|
- movups (%rcx),%xmm0
|
||
|
- decl %eax
|
||
|
+ movl %r9d,0+12(%rsp)
|
||
|
+ leaq 1(%r8),%r9
|
||
|
.byte 102,15,56,220,241
|
||
|
.byte 102,15,56,220,249
|
||
|
- jmp L$ctr32_enc_loop6_enter
|
||
|
-.p2align 4
|
||
|
-L$ctr32_enc_loop6:
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 48-128(%rcx),%xmm1
|
||
|
+.byte 102,15,56,220,208
|
||
|
+.byte 102,15,56,220,216
|
||
|
+ bswapl %r9d
|
||
|
+.byte 102,15,56,220,224
|
||
|
+ xorl %r11d,%r9d
|
||
|
+.byte 102,15,56,220,232
|
||
|
+ movl %r9d,16+12(%rsp)
|
||
|
+ leaq 2(%r8),%r9
|
||
|
+.byte 102,15,56,220,240
|
||
|
+.byte 102,15,56,220,248
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 64-128(%rcx),%xmm0
|
||
|
.byte 102,15,56,220,209
|
||
|
.byte 102,15,56,220,217
|
||
|
- decl %eax
|
||
|
+ bswapl %r9d
|
||
|
.byte 102,15,56,220,225
|
||
|
+ xorl %r11d,%r9d
|
||
|
.byte 102,15,56,220,233
|
||
|
+ movl %r9d,32+12(%rsp)
|
||
|
+ leaq 3(%r8),%r9
|
||
|
.byte 102,15,56,220,241
|
||
|
.byte 102,15,56,220,249
|
||
|
-L$ctr32_enc_loop6_enter:
|
||
|
- movups 16(%rcx),%xmm1
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 80-128(%rcx),%xmm1
|
||
|
.byte 102,15,56,220,208
|
||
|
.byte 102,15,56,220,216
|
||
|
- leaq 32(%rcx),%rcx
|
||
|
+ bswapl %r9d
|
||
|
.byte 102,15,56,220,224
|
||
|
+ xorl %r11d,%r9d
|
||
|
.byte 102,15,56,220,232
|
||
|
+ movl %r9d,48+12(%rsp)
|
||
|
+ leaq 4(%r8),%r9
|
||
|
.byte 102,15,56,220,240
|
||
|
.byte 102,15,56,220,248
|
||
|
- movups (%rcx),%xmm0
|
||
|
- jnz L$ctr32_enc_loop6
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 96-128(%rcx),%xmm0
|
||
|
+.byte 102,15,56,220,209
|
||
|
+.byte 102,15,56,220,217
|
||
|
+ bswapl %r9d
|
||
|
+.byte 102,15,56,220,225
|
||
|
+ xorl %r11d,%r9d
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ movl %r9d,64+12(%rsp)
|
||
|
+ leaq 5(%r8),%r9
|
||
|
+.byte 102,15,56,220,241
|
||
|
+.byte 102,15,56,220,249
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 112-128(%rcx),%xmm1
|
||
|
+.byte 102,15,56,220,208
|
||
|
+.byte 102,15,56,220,216
|
||
|
+ bswapl %r9d
|
||
|
+.byte 102,15,56,220,224
|
||
|
+ xorl %r11d,%r9d
|
||
|
+.byte 102,15,56,220,232
|
||
|
+ movl %r9d,80+12(%rsp)
|
||
|
+ leaq 6(%r8),%r9
|
||
|
+.byte 102,15,56,220,240
|
||
|
+.byte 102,15,56,220,248
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 128-128(%rcx),%xmm0
|
||
|
+.byte 102,15,56,220,209
|
||
|
+.byte 102,15,56,220,217
|
||
|
+ bswapl %r9d
|
||
|
+.byte 102,15,56,220,225
|
||
|
+ xorl %r11d,%r9d
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ movl %r9d,96+12(%rsp)
|
||
|
+ leaq 7(%r8),%r9
|
||
|
+.byte 102,15,56,220,241
|
||
|
+.byte 102,15,56,220,249
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 144-128(%rcx),%xmm1
|
||
|
+.byte 102,15,56,220,208
|
||
|
+.byte 102,15,56,220,216
|
||
|
+ bswapl %r9d
|
||
|
+.byte 102,15,56,220,224
|
||
|
+ xorl %r11d,%r9d
|
||
|
+.byte 102,15,56,220,232
|
||
|
+ movl %r9d,112+12(%rsp)
|
||
|
+.byte 102,15,56,220,240
|
||
|
+.byte 102,15,56,220,248
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+ movdqu 0(%rdi),%xmm10
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 160-128(%rcx),%xmm0
|
||
|
+
|
||
|
+ cmpl $11,%eax
|
||
|
+ jb L$ctr32_enc_done
|
||
|
|
||
|
.byte 102,15,56,220,209
|
||
|
- paddd %xmm13,%xmm12
|
||
|
.byte 102,15,56,220,217
|
||
|
- paddd -24(%rsp),%xmm13
|
||
|
.byte 102,15,56,220,225
|
||
|
- movdqa %xmm12,-40(%rsp)
|
||
|
.byte 102,15,56,220,233
|
||
|
- movdqa %xmm13,-24(%rsp)
|
||
|
.byte 102,15,56,220,241
|
||
|
-.byte 102,69,15,56,0,231
|
||
|
.byte 102,15,56,220,249
|
||
|
-.byte 102,69,15,56,0,239
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 176-128(%rcx),%xmm1
|
||
|
|
||
|
-.byte 102,15,56,221,208
|
||
|
- movups (%rdi),%xmm8
|
||
|
-.byte 102,15,56,221,216
|
||
|
- movups 16(%rdi),%xmm9
|
||
|
-.byte 102,15,56,221,224
|
||
|
- movups 32(%rdi),%xmm10
|
||
|
-.byte 102,15,56,221,232
|
||
|
- movups 48(%rdi),%xmm11
|
||
|
-.byte 102,15,56,221,240
|
||
|
- movups 64(%rdi),%xmm1
|
||
|
-.byte 102,15,56,221,248
|
||
|
- movups 80(%rdi),%xmm0
|
||
|
- leaq 96(%rdi),%rdi
|
||
|
+.byte 102,15,56,220,208
|
||
|
+.byte 102,15,56,220,216
|
||
|
+.byte 102,15,56,220,224
|
||
|
+.byte 102,15,56,220,232
|
||
|
+.byte 102,15,56,220,240
|
||
|
+.byte 102,15,56,220,248
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 192-128(%rcx),%xmm0
|
||
|
+ je L$ctr32_enc_done
|
||
|
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- pshufd $192,%xmm12,%xmm2
|
||
|
- xorps %xmm3,%xmm9
|
||
|
- pshufd $128,%xmm12,%xmm3
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- xorps %xmm4,%xmm10
|
||
|
- pshufd $64,%xmm12,%xmm4
|
||
|
- movups %xmm9,16(%rsi)
|
||
|
- xorps %xmm5,%xmm11
|
||
|
- movups %xmm10,32(%rsi)
|
||
|
- xorps %xmm6,%xmm1
|
||
|
- movups %xmm11,48(%rsi)
|
||
|
- xorps %xmm7,%xmm0
|
||
|
- movups %xmm1,64(%rsi)
|
||
|
- movups %xmm0,80(%rsi)
|
||
|
- leaq 96(%rsi),%rsi
|
||
|
- movl %r10d,%eax
|
||
|
- subq $6,%rdx
|
||
|
- jnc L$ctr32_loop6
|
||
|
+.byte 102,15,56,220,209
|
||
|
+.byte 102,15,56,220,217
|
||
|
+.byte 102,15,56,220,225
|
||
|
+.byte 102,15,56,220,233
|
||
|
+.byte 102,15,56,220,241
|
||
|
+.byte 102,15,56,220,249
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movups 208-128(%rcx),%xmm1
|
||
|
+
|
||
|
+.byte 102,15,56,220,208
|
||
|
+.byte 102,15,56,220,216
|
||
|
+.byte 102,15,56,220,224
|
||
|
+.byte 102,15,56,220,232
|
||
|
+.byte 102,15,56,220,240
|
||
|
+.byte 102,15,56,220,248
|
||
|
+.byte 102,68,15,56,220,192
|
||
|
+.byte 102,68,15,56,220,200
|
||
|
+ movups 224-128(%rcx),%xmm0
|
||
|
+
|
||
|
+L$ctr32_enc_done:
|
||
|
+ movdqu 16(%rdi),%xmm11
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ movdqu 32(%rdi),%xmm12
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
+ movdqu 48(%rdi),%xmm13
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
+ movdqu 64(%rdi),%xmm14
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
+ movdqu 80(%rdi),%xmm15
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ pxor %xmm0,%xmm15
|
||
|
+.byte 102,15,56,220,217
|
||
|
+.byte 102,15,56,220,225
|
||
|
+.byte 102,15,56,220,233
|
||
|
+.byte 102,15,56,220,241
|
||
|
+.byte 102,15,56,220,249
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+.byte 102,68,15,56,220,201
|
||
|
+ movdqu 96(%rdi),%xmm1
|
||
|
+
|
||
|
+.byte 102,65,15,56,221,210
|
||
|
+ pxor %xmm0,%xmm1
|
||
|
+ movdqu 112(%rdi),%xmm10
|
||
|
+ leaq 128(%rdi),%rdi
|
||
|
+.byte 102,65,15,56,221,219
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ movdqa 0(%rsp),%xmm11
|
||
|
+.byte 102,65,15,56,221,228
|
||
|
+ movdqa 16(%rsp),%xmm12
|
||
|
+.byte 102,65,15,56,221,237
|
||
|
+ movdqa 32(%rsp),%xmm13
|
||
|
+.byte 102,65,15,56,221,246
|
||
|
+ movdqa 48(%rsp),%xmm14
|
||
|
+.byte 102,65,15,56,221,255
|
||
|
+ movdqa 64(%rsp),%xmm15
|
||
|
+.byte 102,68,15,56,221,193
|
||
|
+ movdqa 80(%rsp),%xmm0
|
||
|
+.byte 102,69,15,56,221,202
|
||
|
+ movups 16-128(%rcx),%xmm1
|
||
|
+
|
||
|
+ movups %xmm2,(%rsi)
|
||
|
+ movdqa %xmm11,%xmm2
|
||
|
+ movups %xmm3,16(%rsi)
|
||
|
+ movdqa %xmm12,%xmm3
|
||
|
+ movups %xmm4,32(%rsi)
|
||
|
+ movdqa %xmm13,%xmm4
|
||
|
+ movups %xmm5,48(%rsi)
|
||
|
+ movdqa %xmm14,%xmm5
|
||
|
+ movups %xmm6,64(%rsi)
|
||
|
+ movdqa %xmm15,%xmm6
|
||
|
+ movups %xmm7,80(%rsi)
|
||
|
+ movdqa %xmm0,%xmm7
|
||
|
+ movups %xmm8,96(%rsi)
|
||
|
+ movups %xmm9,112(%rsi)
|
||
|
+ leaq 128(%rsi),%rsi
|
||
|
+
|
||
|
+ subq $8,%rdx
|
||
|
+ jnc L$ctr32_loop8
|
||
|
|
||
|
- addq $6,%rdx
|
||
|
+ addq $8,%rdx
|
||
|
jz L$ctr32_done
|
||
|
- movq %r11,%rcx
|
||
|
- leal 1(%rax,%rax,1),%eax
|
||
|
+ leaq -128(%rcx),%rcx
|
||
|
|
||
|
L$ctr32_tail:
|
||
|
- por %xmm14,%xmm2
|
||
|
- movups (%rdi),%xmm8
|
||
|
- cmpq $2,%rdx
|
||
|
- jb L$ctr32_one
|
||
|
+ leaq 16(%rcx),%rcx
|
||
|
+ cmpq $4,%rdx
|
||
|
+ jb L$ctr32_loop3
|
||
|
+ je L$ctr32_loop4
|
||
|
|
||
|
- por %xmm14,%xmm3
|
||
|
- movups 16(%rdi),%xmm9
|
||
|
- je L$ctr32_two
|
||
|
+ movdqa 96(%rsp),%xmm8
|
||
|
+ pxor %xmm9,%xmm9
|
||
|
|
||
|
- pshufd $192,%xmm13,%xmm5
|
||
|
- por %xmm14,%xmm4
|
||
|
- movups 32(%rdi),%xmm10
|
||
|
- cmpq $4,%rdx
|
||
|
- jb L$ctr32_three
|
||
|
+ movups 16(%rcx),%xmm0
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ leaq 16(%rcx),%rcx
|
||
|
+.byte 102,15,56,220,217
|
||
|
+ shrl $1,%eax
|
||
|
+.byte 102,15,56,220,225
|
||
|
+ decl %eax
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ movups (%rdi),%xmm10
|
||
|
+.byte 102,15,56,220,241
|
||
|
+ movups 16(%rdi),%xmm11
|
||
|
+.byte 102,15,56,220,249
|
||
|
+ movups 32(%rdi),%xmm12
|
||
|
+.byte 102,68,15,56,220,193
|
||
|
+ movups 16(%rcx),%xmm1
|
||
|
|
||
|
- pshufd $128,%xmm13,%xmm6
|
||
|
- por %xmm14,%xmm5
|
||
|
- movups 48(%rdi),%xmm11
|
||
|
- je L$ctr32_four
|
||
|
+ call L$enc_loop8_enter
|
||
|
|
||
|
- por %xmm14,%xmm6
|
||
|
- xorps %xmm7,%xmm7
|
||
|
+ movdqu 48(%rdi),%xmm13
|
||
|
+ pxor %xmm10,%xmm2
|
||
|
+ movdqu 64(%rdi),%xmm10
|
||
|
+ pxor %xmm11,%xmm3
|
||
|
+ movdqu %xmm2,(%rsi)
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm3,16(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ pxor %xmm10,%xmm6
|
||
|
+ movdqu %xmm5,48(%rsi)
|
||
|
+ movdqu %xmm6,64(%rsi)
|
||
|
+ cmpq $6,%rdx
|
||
|
+ jb L$ctr32_done
|
||
|
|
||
|
- call _aesni_encrypt6
|
||
|
+ movups 80(%rdi),%xmm11
|
||
|
+ xorps %xmm11,%xmm7
|
||
|
+ movups %xmm7,80(%rsi)
|
||
|
+ je L$ctr32_done
|
||
|
|
||
|
- movups 64(%rdi),%xmm1
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- xorps %xmm3,%xmm9
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- xorps %xmm4,%xmm10
|
||
|
- movups %xmm9,16(%rsi)
|
||
|
- xorps %xmm5,%xmm11
|
||
|
- movups %xmm10,32(%rsi)
|
||
|
- xorps %xmm6,%xmm1
|
||
|
- movups %xmm11,48(%rsi)
|
||
|
- movups %xmm1,64(%rsi)
|
||
|
+ movups 96(%rdi),%xmm12
|
||
|
+ xorps %xmm12,%xmm8
|
||
|
+ movups %xmm8,96(%rsi)
|
||
|
+ jmp L$ctr32_done
|
||
|
+
|
||
|
+.p2align 5
|
||
|
+L$ctr32_loop4:
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ leaq 16(%rcx),%rcx
|
||
|
+.byte 102,15,56,220,217
|
||
|
+.byte 102,15,56,220,225
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ movups (%rcx),%xmm1
|
||
|
+ decl %eax
|
||
|
+ jnz L$ctr32_loop4
|
||
|
+.byte 102,15,56,221,209
|
||
|
+ movups (%rdi),%xmm10
|
||
|
+.byte 102,15,56,221,217
|
||
|
+ movups 16(%rdi),%xmm11
|
||
|
+.byte 102,15,56,221,225
|
||
|
+ movups 32(%rdi),%xmm12
|
||
|
+.byte 102,15,56,221,233
|
||
|
+ movups 48(%rdi),%xmm13
|
||
|
+
|
||
|
+ xorps %xmm10,%xmm2
|
||
|
+ movups %xmm2,(%rsi)
|
||
|
+ xorps %xmm11,%xmm3
|
||
|
+ movups %xmm3,16(%rsi)
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm5,48(%rsi)
|
||
|
+ jmp L$ctr32_done
|
||
|
+
|
||
|
+.p2align 5
|
||
|
+L$ctr32_loop3:
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ leaq 16(%rcx),%rcx
|
||
|
+.byte 102,15,56,220,217
|
||
|
+.byte 102,15,56,220,225
|
||
|
+ movups (%rcx),%xmm1
|
||
|
+ decl %eax
|
||
|
+ jnz L$ctr32_loop3
|
||
|
+.byte 102,15,56,221,209
|
||
|
+.byte 102,15,56,221,217
|
||
|
+.byte 102,15,56,221,225
|
||
|
+
|
||
|
+ movups (%rdi),%xmm10
|
||
|
+ xorps %xmm10,%xmm2
|
||
|
+ movups %xmm2,(%rsi)
|
||
|
+ cmpq $2,%rdx
|
||
|
+ jb L$ctr32_done
|
||
|
+
|
||
|
+ movups 16(%rdi),%xmm11
|
||
|
+ xorps %xmm11,%xmm3
|
||
|
+ movups %xmm3,16(%rsi)
|
||
|
+ je L$ctr32_done
|
||
|
+
|
||
|
+ movups 32(%rdi),%xmm12
|
||
|
+ xorps %xmm12,%xmm4
|
||
|
+ movups %xmm4,32(%rsi)
|
||
|
jmp L$ctr32_done
|
||
|
|
||
|
.p2align 4
|
||
|
L$ctr32_one_shortcut:
|
||
|
movups (%r8),%xmm2
|
||
|
- movups (%rdi),%xmm8
|
||
|
+ movups (%rdi),%xmm10
|
||
|
movl 240(%rcx),%eax
|
||
|
-L$ctr32_one:
|
||
|
movups (%rcx),%xmm0
|
||
|
movups 16(%rcx),%xmm1
|
||
|
leaq 32(%rcx),%rcx
|
||
|
@@ -1131,51 +1344,26 @@ L$oop_enc1_7:
|
||
|
leaq 16(%rcx),%rcx
|
||
|
jnz L$oop_enc1_7
|
||
|
.byte 102,15,56,221,209
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- jmp L$ctr32_done
|
||
|
-
|
||
|
-.p2align 4
|
||
|
-L$ctr32_two:
|
||
|
- xorps %xmm4,%xmm4
|
||
|
- call _aesni_encrypt3
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- xorps %xmm3,%xmm9
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- movups %xmm9,16(%rsi)
|
||
|
- jmp L$ctr32_done
|
||
|
-
|
||
|
-.p2align 4
|
||
|
-L$ctr32_three:
|
||
|
- call _aesni_encrypt3
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- xorps %xmm3,%xmm9
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- xorps %xmm4,%xmm10
|
||
|
- movups %xmm9,16(%rsi)
|
||
|
- movups %xmm10,32(%rsi)
|
||
|
+ xorps %xmm10,%xmm2
|
||
|
+ movups %xmm2,(%rsi)
|
||
|
jmp L$ctr32_done
|
||
|
|
||
|
.p2align 4
|
||
|
-L$ctr32_four:
|
||
|
- call _aesni_encrypt4
|
||
|
- xorps %xmm2,%xmm8
|
||
|
- xorps %xmm3,%xmm9
|
||
|
- movups %xmm8,(%rsi)
|
||
|
- xorps %xmm4,%xmm10
|
||
|
- movups %xmm9,16(%rsi)
|
||
|
- xorps %xmm5,%xmm11
|
||
|
- movups %xmm10,32(%rsi)
|
||
|
- movups %xmm11,48(%rsi)
|
||
|
-
|
||
|
L$ctr32_done:
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ popq %rbp
|
||
|
+L$ctr32_epilogue:
|
||
|
.byte 0xf3,0xc3
|
||
|
|
||
|
.globl _aesni_xts_encrypt
|
||
|
|
||
|
.p2align 4
|
||
|
_aesni_xts_encrypt:
|
||
|
- leaq -104(%rsp),%rsp
|
||
|
+ leaq (%rsp),%rax
|
||
|
+ pushq %rbp
|
||
|
+ subq $112,%rsp
|
||
|
+ andq $-16,%rsp
|
||
|
+ leaq -8(%rax),%rbp
|
||
|
movups (%r9),%xmm15
|
||
|
movl 240(%r8),%eax
|
||
|
movl 240(%rcx),%r10d
|
||
|
@@ -1190,228 +1378,266 @@ L$oop_enc1_8:
|
||
|
leaq 16(%r8),%r8
|
||
|
jnz L$oop_enc1_8
|
||
|
.byte 102,68,15,56,221,249
|
||
|
+ movups (%rcx),%xmm0
|
||
|
movq %rcx,%r11
|
||
|
movl %r10d,%eax
|
||
|
+ shll $4,%r10d
|
||
|
movq %rdx,%r9
|
||
|
andq $-16,%rdx
|
||
|
|
||
|
+ movups 16(%rcx,%r10,1),%xmm1
|
||
|
+ movl %eax,%r10d
|
||
|
+
|
||
|
movdqa L$xts_magic(%rip),%xmm8
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pshufd $95,%xmm15,%xmm9
|
||
|
+ pxor %xmm0,%xmm1
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm10
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm11
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm12
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm13
|
||
|
+ psrad $31,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm15,%xmm14
|
||
|
+ psrad $31,%xmm9
|
||
|
paddq %xmm15,%xmm15
|
||
|
pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
pxor %xmm9,%xmm15
|
||
|
+ movaps %xmm1,96(%rsp)
|
||
|
+
|
||
|
subq $96,%rdx
|
||
|
jc L$xts_enc_short
|
||
|
|
||
|
shrl $1,%eax
|
||
|
- subl $1,%eax
|
||
|
+ subl $3,%eax
|
||
|
+ movups 16(%r11),%xmm1
|
||
|
movl %eax,%r10d
|
||
|
+ leaq L$xts_magic(%rip),%r8
|
||
|
jmp L$xts_enc_grandloop
|
||
|
|
||
|
-.p2align 4
|
||
|
+.p2align 5
|
||
|
L$xts_enc_grandloop:
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- movdqa %xmm15,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
movdqu 0(%rdi),%xmm2
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ movdqa %xmm0,%xmm8
|
||
|
movdqu 16(%rdi),%xmm3
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-
|
||
|
- movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm10,%xmm2
|
||
|
- movdqu 48(%rdi),%xmm5
|
||
|
+ movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm11,%xmm3
|
||
|
- movdqu 64(%rdi),%xmm6
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ movdqu 48(%rdi),%xmm5
|
||
|
pxor %xmm12,%xmm4
|
||
|
- movdqu 80(%rdi),%xmm7
|
||
|
- leaq 96(%rdi),%rdi
|
||
|
+.byte 102,15,56,220,217
|
||
|
+ movdqu 64(%rdi),%xmm6
|
||
|
pxor %xmm13,%xmm5
|
||
|
- movups (%r11),%xmm0
|
||
|
+.byte 102,15,56,220,225
|
||
|
+ movdqu 80(%rdi),%xmm7
|
||
|
+ pxor %xmm15,%xmm8
|
||
|
+ movdqa 96(%rsp),%xmm9
|
||
|
pxor %xmm14,%xmm6
|
||
|
- pxor %xmm15,%xmm7
|
||
|
-
|
||
|
-
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ movups 32(%r11),%xmm0
|
||
|
+ leaq 96(%rdi),%rdi
|
||
|
+ pxor %xmm8,%xmm7
|
||
|
|
||
|
- movups 16(%r11),%xmm1
|
||
|
- pxor %xmm0,%xmm2
|
||
|
- pxor %xmm0,%xmm3
|
||
|
+ pxor %xmm9,%xmm10
|
||
|
+.byte 102,15,56,220,241
|
||
|
+ pxor %xmm9,%xmm11
|
||
|
movdqa %xmm10,0(%rsp)
|
||
|
-.byte 102,15,56,220,209
|
||
|
- leaq 32(%r11),%rcx
|
||
|
- pxor %xmm0,%xmm4
|
||
|
+.byte 102,15,56,220,249
|
||
|
+ movups 48(%r11),%xmm1
|
||
|
+
|
||
|
+.byte 102,15,56,220,208
|
||
|
+ pxor %xmm9,%xmm12
|
||
|
movdqa %xmm11,16(%rsp)
|
||
|
-.byte 102,15,56,220,217
|
||
|
- pxor %xmm0,%xmm5
|
||
|
+.byte 102,15,56,220,216
|
||
|
+ pxor %xmm9,%xmm13
|
||
|
movdqa %xmm12,32(%rsp)
|
||
|
-.byte 102,15,56,220,225
|
||
|
- pxor %xmm0,%xmm6
|
||
|
- movdqa %xmm13,48(%rsp)
|
||
|
-.byte 102,15,56,220,233
|
||
|
- pxor %xmm0,%xmm7
|
||
|
- movups (%rcx),%xmm0
|
||
|
- decl %eax
|
||
|
+.byte 102,15,56,220,224
|
||
|
+ pxor %xmm9,%xmm14
|
||
|
+.byte 102,15,56,220,232
|
||
|
+ pxor %xmm9,%xmm8
|
||
|
movdqa %xmm14,64(%rsp)
|
||
|
-.byte 102,15,56,220,241
|
||
|
- movdqa %xmm15,80(%rsp)
|
||
|
-.byte 102,15,56,220,249
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- jmp L$xts_enc_loop6_enter
|
||
|
-
|
||
|
-.p2align 4
|
||
|
+.byte 102,15,56,220,240
|
||
|
+ movdqa %xmm8,80(%rsp)
|
||
|
+.byte 102,15,56,220,248
|
||
|
+ movups 64(%r11),%xmm0
|
||
|
+ leaq 64(%r11),%rcx
|
||
|
+ pshufd $95,%xmm15,%xmm9
|
||
|
+ jmp L$xts_enc_loop6
|
||
|
+.p2align 5
|
||
|
L$xts_enc_loop6:
|
||
|
.byte 102,15,56,220,209
|
||
|
.byte 102,15,56,220,217
|
||
|
- decl %eax
|
||
|
.byte 102,15,56,220,225
|
||
|
.byte 102,15,56,220,233
|
||
|
.byte 102,15,56,220,241
|
||
|
.byte 102,15,56,220,249
|
||
|
-L$xts_enc_loop6_enter:
|
||
|
movups 16(%rcx),%xmm1
|
||
|
+ leaq 32(%rcx),%rcx
|
||
|
+
|
||
|
.byte 102,15,56,220,208
|
||
|
.byte 102,15,56,220,216
|
||
|
- leaq 32(%rcx),%rcx
|
||
|
.byte 102,15,56,220,224
|
||
|
.byte 102,15,56,220,232
|
||
|
.byte 102,15,56,220,240
|
||
|
.byte 102,15,56,220,248
|
||
|
movups (%rcx),%xmm0
|
||
|
+ decl %eax
|
||
|
jnz L$xts_enc_loop6
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa (%r8),%xmm8
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,220,209
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,220,217
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ movups (%r11),%xmm10
|
||
|
.byte 102,15,56,220,225
|
||
|
- pxor %xmm9,%xmm15
|
||
|
.byte 102,15,56,220,233
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,220,241
|
||
|
+ movaps %xmm10,%xmm11
|
||
|
.byte 102,15,56,220,249
|
||
|
movups 16(%rcx),%xmm1
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm10
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,220,208
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ pxor %xmm15,%xmm10
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,220,216
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
.byte 102,15,56,220,224
|
||
|
- pxor %xmm9,%xmm15
|
||
|
.byte 102,15,56,220,232
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,220,240
|
||
|
+ movaps %xmm11,%xmm12
|
||
|
.byte 102,15,56,220,248
|
||
|
movups 32(%rcx),%xmm0
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm11
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,220,209
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ pxor %xmm15,%xmm11
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,220,217
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
.byte 102,15,56,220,225
|
||
|
- pxor %xmm9,%xmm15
|
||
|
+ movdqa %xmm13,48(%rsp)
|
||
|
.byte 102,15,56,220,233
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,220,241
|
||
|
+ movaps %xmm12,%xmm13
|
||
|
.byte 102,15,56,220,249
|
||
|
+ movups 48(%rcx),%xmm1
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm12
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
+.byte 102,15,56,220,208
|
||
|
+ pxor %xmm15,%xmm12
|
||
|
+ psrad $31,%xmm14
|
||
|
+.byte 102,15,56,220,216
|
||
|
paddq %xmm15,%xmm15
|
||
|
-.byte 102,15,56,221,208
|
||
|
- pand %xmm8,%xmm9
|
||
|
-.byte 102,15,56,221,216
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
-.byte 102,15,56,221,224
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-.byte 102,15,56,221,232
|
||
|
-.byte 102,15,56,221,240
|
||
|
-.byte 102,15,56,221,248
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+.byte 102,15,56,220,224
|
||
|
+.byte 102,15,56,220,232
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+.byte 102,15,56,220,240
|
||
|
+ movaps %xmm13,%xmm14
|
||
|
+.byte 102,15,56,220,248
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm13
|
||
|
+ movdqa %xmm9,%xmm0
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
+.byte 102,15,56,220,209
|
||
|
+ pxor %xmm15,%xmm13
|
||
|
+ psrad $31,%xmm0
|
||
|
+.byte 102,15,56,220,217
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm0
|
||
|
+.byte 102,15,56,220,225
|
||
|
+.byte 102,15,56,220,233
|
||
|
+ pxor %xmm0,%xmm15
|
||
|
+ movups (%r11),%xmm0
|
||
|
+.byte 102,15,56,220,241
|
||
|
+.byte 102,15,56,220,249
|
||
|
+ movups 16(%r11),%xmm1
|
||
|
+
|
||
|
+ pxor %xmm15,%xmm14
|
||
|
+ psrad $31,%xmm9
|
||
|
+.byte 102,15,56,221,84,36,0
|
||
|
paddq %xmm15,%xmm15
|
||
|
- xorps 0(%rsp),%xmm2
|
||
|
pand %xmm8,%xmm9
|
||
|
- xorps 16(%rsp),%xmm3
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+.byte 102,15,56,221,92,36,16
|
||
|
+.byte 102,15,56,221,100,36,32
|
||
|
pxor %xmm9,%xmm15
|
||
|
-
|
||
|
- xorps 32(%rsp),%xmm4
|
||
|
- movups %xmm2,0(%rsi)
|
||
|
- xorps 48(%rsp),%xmm5
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- xorps 64(%rsp),%xmm6
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- xorps 80(%rsp),%xmm7
|
||
|
- movups %xmm5,48(%rsi)
|
||
|
+.byte 102,15,56,221,108,36,48
|
||
|
+.byte 102,15,56,221,116,36,64
|
||
|
+.byte 102,15,56,221,124,36,80
|
||
|
movl %r10d,%eax
|
||
|
- movups %xmm6,64(%rsi)
|
||
|
- movups %xmm7,80(%rsi)
|
||
|
+
|
||
|
leaq 96(%rsi),%rsi
|
||
|
+ movups %xmm2,-96(%rsi)
|
||
|
+ movups %xmm3,-80(%rsi)
|
||
|
+ movups %xmm4,-64(%rsi)
|
||
|
+ movups %xmm5,-48(%rsi)
|
||
|
+ movups %xmm6,-32(%rsi)
|
||
|
+ movups %xmm7,-16(%rsi)
|
||
|
subq $96,%rdx
|
||
|
jnc L$xts_enc_grandloop
|
||
|
|
||
|
- leal 3(%rax,%rax,1),%eax
|
||
|
+ leal 7(%rax,%rax,1),%eax
|
||
|
movq %r11,%rcx
|
||
|
movl %eax,%r10d
|
||
|
|
||
|
L$xts_enc_short:
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
addq $96,%rdx
|
||
|
jz L$xts_enc_done
|
||
|
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
cmpq $32,%rdx
|
||
|
jb L$xts_enc_one
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
je L$xts_enc_two
|
||
|
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
cmpq $64,%rdx
|
||
|
jb L$xts_enc_three
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
je L$xts_enc_four
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- movdqa %xmm15,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
movdqu (%rdi),%xmm2
|
||
|
- pand %xmm8,%xmm9
|
||
|
movdqu 16(%rdi),%xmm3
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-
|
||
|
movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm10,%xmm2
|
||
|
movdqu 48(%rdi),%xmm5
|
||
|
@@ -1514,15 +1740,15 @@ L$xts_enc_four:
|
||
|
|
||
|
call _aesni_encrypt4
|
||
|
|
||
|
- xorps %xmm10,%xmm2
|
||
|
- movdqa %xmm15,%xmm10
|
||
|
- xorps %xmm11,%xmm3
|
||
|
- xorps %xmm12,%xmm4
|
||
|
- movups %xmm2,(%rsi)
|
||
|
- xorps %xmm13,%xmm5
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- movups %xmm5,48(%rsi)
|
||
|
+ pxor %xmm10,%xmm2
|
||
|
+ movdqa %xmm14,%xmm10
|
||
|
+ pxor %xmm11,%xmm3
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm2,(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm3,16(%rsi)
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ movdqu %xmm5,48(%rsi)
|
||
|
leaq 64(%rsi),%rsi
|
||
|
jmp L$xts_enc_done
|
||
|
|
||
|
@@ -1563,7 +1789,8 @@ L$oop_enc1_10:
|
||
|
movups %xmm2,-16(%rsi)
|
||
|
|
||
|
L$xts_enc_ret:
|
||
|
- leaq 104(%rsp),%rsp
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ popq %rbp
|
||
|
L$xts_enc_epilogue:
|
||
|
.byte 0xf3,0xc3
|
||
|
|
||
|
@@ -1571,7 +1798,11 @@ L$xts_enc_epilogue:
|
||
|
|
||
|
.p2align 4
|
||
|
_aesni_xts_decrypt:
|
||
|
- leaq -104(%rsp),%rsp
|
||
|
+ leaq (%rsp),%rax
|
||
|
+ pushq %rbp
|
||
|
+ subq $112,%rsp
|
||
|
+ andq $-16,%rsp
|
||
|
+ leaq -8(%rax),%rbp
|
||
|
movups (%r9),%xmm15
|
||
|
movl 240(%r8),%eax
|
||
|
movl 240(%rcx),%r10d
|
||
|
@@ -1592,228 +1823,266 @@ L$oop_enc1_11:
|
||
|
shlq $4,%rax
|
||
|
subq %rax,%rdx
|
||
|
|
||
|
+ movups (%rcx),%xmm0
|
||
|
movq %rcx,%r11
|
||
|
movl %r10d,%eax
|
||
|
+ shll $4,%r10d
|
||
|
movq %rdx,%r9
|
||
|
andq $-16,%rdx
|
||
|
|
||
|
+ movups 16(%rcx,%r10,1),%xmm1
|
||
|
+ movl %eax,%r10d
|
||
|
+
|
||
|
movdqa L$xts_magic(%rip),%xmm8
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pshufd $95,%xmm15,%xmm9
|
||
|
+ pxor %xmm0,%xmm1
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm10
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm11
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm12
|
||
|
+ psrad $31,%xmm14
|
||
|
paddq %xmm15,%xmm15
|
||
|
- pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- pxor %xmm9,%xmm15
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
movdqa %xmm15,%xmm13
|
||
|
+ psrad $31,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+ movdqa %xmm15,%xmm14
|
||
|
+ psrad $31,%xmm9
|
||
|
paddq %xmm15,%xmm15
|
||
|
pand %xmm8,%xmm9
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
pxor %xmm9,%xmm15
|
||
|
+ movaps %xmm1,96(%rsp)
|
||
|
+
|
||
|
subq $96,%rdx
|
||
|
jc L$xts_dec_short
|
||
|
|
||
|
shrl $1,%eax
|
||
|
- subl $1,%eax
|
||
|
+ subl $3,%eax
|
||
|
+ movups 16(%r11),%xmm1
|
||
|
movl %eax,%r10d
|
||
|
+ leaq L$xts_magic(%rip),%r8
|
||
|
jmp L$xts_dec_grandloop
|
||
|
|
||
|
-.p2align 4
|
||
|
+.p2align 5
|
||
|
L$xts_dec_grandloop:
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- movdqa %xmm15,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
movdqu 0(%rdi),%xmm2
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ movdqa %xmm0,%xmm8
|
||
|
movdqu 16(%rdi),%xmm3
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-
|
||
|
- movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm10,%xmm2
|
||
|
- movdqu 48(%rdi),%xmm5
|
||
|
+ movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm11,%xmm3
|
||
|
- movdqu 64(%rdi),%xmm6
|
||
|
+.byte 102,15,56,222,209
|
||
|
+ movdqu 48(%rdi),%xmm5
|
||
|
pxor %xmm12,%xmm4
|
||
|
- movdqu 80(%rdi),%xmm7
|
||
|
- leaq 96(%rdi),%rdi
|
||
|
+.byte 102,15,56,222,217
|
||
|
+ movdqu 64(%rdi),%xmm6
|
||
|
pxor %xmm13,%xmm5
|
||
|
- movups (%r11),%xmm0
|
||
|
+.byte 102,15,56,222,225
|
||
|
+ movdqu 80(%rdi),%xmm7
|
||
|
+ pxor %xmm15,%xmm8
|
||
|
+ movdqa 96(%rsp),%xmm9
|
||
|
pxor %xmm14,%xmm6
|
||
|
- pxor %xmm15,%xmm7
|
||
|
-
|
||
|
-
|
||
|
+.byte 102,15,56,222,233
|
||
|
+ movups 32(%r11),%xmm0
|
||
|
+ leaq 96(%rdi),%rdi
|
||
|
+ pxor %xmm8,%xmm7
|
||
|
|
||
|
- movups 16(%r11),%xmm1
|
||
|
- pxor %xmm0,%xmm2
|
||
|
- pxor %xmm0,%xmm3
|
||
|
+ pxor %xmm9,%xmm10
|
||
|
+.byte 102,15,56,222,241
|
||
|
+ pxor %xmm9,%xmm11
|
||
|
movdqa %xmm10,0(%rsp)
|
||
|
-.byte 102,15,56,222,209
|
||
|
- leaq 32(%r11),%rcx
|
||
|
- pxor %xmm0,%xmm4
|
||
|
+.byte 102,15,56,222,249
|
||
|
+ movups 48(%r11),%xmm1
|
||
|
+
|
||
|
+.byte 102,15,56,222,208
|
||
|
+ pxor %xmm9,%xmm12
|
||
|
movdqa %xmm11,16(%rsp)
|
||
|
-.byte 102,15,56,222,217
|
||
|
- pxor %xmm0,%xmm5
|
||
|
+.byte 102,15,56,222,216
|
||
|
+ pxor %xmm9,%xmm13
|
||
|
movdqa %xmm12,32(%rsp)
|
||
|
-.byte 102,15,56,222,225
|
||
|
- pxor %xmm0,%xmm6
|
||
|
- movdqa %xmm13,48(%rsp)
|
||
|
-.byte 102,15,56,222,233
|
||
|
- pxor %xmm0,%xmm7
|
||
|
- movups (%rcx),%xmm0
|
||
|
- decl %eax
|
||
|
+.byte 102,15,56,222,224
|
||
|
+ pxor %xmm9,%xmm14
|
||
|
+.byte 102,15,56,222,232
|
||
|
+ pxor %xmm9,%xmm8
|
||
|
movdqa %xmm14,64(%rsp)
|
||
|
-.byte 102,15,56,222,241
|
||
|
- movdqa %xmm15,80(%rsp)
|
||
|
-.byte 102,15,56,222,249
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
- jmp L$xts_dec_loop6_enter
|
||
|
-
|
||
|
-.p2align 4
|
||
|
+.byte 102,15,56,222,240
|
||
|
+ movdqa %xmm8,80(%rsp)
|
||
|
+.byte 102,15,56,222,248
|
||
|
+ movups 64(%r11),%xmm0
|
||
|
+ leaq 64(%r11),%rcx
|
||
|
+ pshufd $95,%xmm15,%xmm9
|
||
|
+ jmp L$xts_dec_loop6
|
||
|
+.p2align 5
|
||
|
L$xts_dec_loop6:
|
||
|
.byte 102,15,56,222,209
|
||
|
.byte 102,15,56,222,217
|
||
|
- decl %eax
|
||
|
.byte 102,15,56,222,225
|
||
|
.byte 102,15,56,222,233
|
||
|
.byte 102,15,56,222,241
|
||
|
.byte 102,15,56,222,249
|
||
|
-L$xts_dec_loop6_enter:
|
||
|
movups 16(%rcx),%xmm1
|
||
|
+ leaq 32(%rcx),%rcx
|
||
|
+
|
||
|
.byte 102,15,56,222,208
|
||
|
.byte 102,15,56,222,216
|
||
|
- leaq 32(%rcx),%rcx
|
||
|
.byte 102,15,56,222,224
|
||
|
.byte 102,15,56,222,232
|
||
|
.byte 102,15,56,222,240
|
||
|
.byte 102,15,56,222,248
|
||
|
movups (%rcx),%xmm0
|
||
|
+ decl %eax
|
||
|
jnz L$xts_dec_loop6
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa (%r8),%xmm8
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,222,209
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,222,217
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+ movups (%r11),%xmm10
|
||
|
.byte 102,15,56,222,225
|
||
|
- pxor %xmm9,%xmm15
|
||
|
.byte 102,15,56,222,233
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,222,241
|
||
|
+ movaps %xmm10,%xmm11
|
||
|
.byte 102,15,56,222,249
|
||
|
movups 16(%rcx),%xmm1
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm10
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,222,208
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ pxor %xmm15,%xmm10
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,222,216
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
.byte 102,15,56,222,224
|
||
|
- pxor %xmm9,%xmm15
|
||
|
.byte 102,15,56,222,232
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,222,240
|
||
|
+ movaps %xmm11,%xmm12
|
||
|
.byte 102,15,56,222,248
|
||
|
movups 32(%rcx),%xmm0
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm11
|
||
|
- paddq %xmm15,%xmm15
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
.byte 102,15,56,222,209
|
||
|
- pand %xmm8,%xmm9
|
||
|
+ pxor %xmm15,%xmm11
|
||
|
+ psrad $31,%xmm14
|
||
|
.byte 102,15,56,222,217
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm14
|
||
|
.byte 102,15,56,222,225
|
||
|
- pxor %xmm9,%xmm15
|
||
|
+ movdqa %xmm13,48(%rsp)
|
||
|
.byte 102,15,56,222,233
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
.byte 102,15,56,222,241
|
||
|
+ movaps %xmm12,%xmm13
|
||
|
.byte 102,15,56,222,249
|
||
|
+ movups 48(%rcx),%xmm1
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm12
|
||
|
+ movdqa %xmm9,%xmm14
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
+.byte 102,15,56,222,208
|
||
|
+ pxor %xmm15,%xmm12
|
||
|
+ psrad $31,%xmm14
|
||
|
+.byte 102,15,56,222,216
|
||
|
paddq %xmm15,%xmm15
|
||
|
-.byte 102,15,56,223,208
|
||
|
- pand %xmm8,%xmm9
|
||
|
-.byte 102,15,56,223,216
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
-.byte 102,15,56,223,224
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-.byte 102,15,56,223,232
|
||
|
-.byte 102,15,56,223,240
|
||
|
-.byte 102,15,56,223,248
|
||
|
+ pand %xmm8,%xmm14
|
||
|
+.byte 102,15,56,222,224
|
||
|
+.byte 102,15,56,222,232
|
||
|
+ pxor %xmm14,%xmm15
|
||
|
+.byte 102,15,56,222,240
|
||
|
+ movaps %xmm13,%xmm14
|
||
|
+.byte 102,15,56,222,248
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- pxor %xmm14,%xmm14
|
||
|
- movdqa %xmm15,%xmm13
|
||
|
+ movdqa %xmm9,%xmm0
|
||
|
+ paddd %xmm9,%xmm9
|
||
|
+.byte 102,15,56,222,209
|
||
|
+ pxor %xmm15,%xmm13
|
||
|
+ psrad $31,%xmm0
|
||
|
+.byte 102,15,56,222,217
|
||
|
+ paddq %xmm15,%xmm15
|
||
|
+ pand %xmm8,%xmm0
|
||
|
+.byte 102,15,56,222,225
|
||
|
+.byte 102,15,56,222,233
|
||
|
+ pxor %xmm0,%xmm15
|
||
|
+ movups (%r11),%xmm0
|
||
|
+.byte 102,15,56,222,241
|
||
|
+.byte 102,15,56,222,249
|
||
|
+ movups 16(%r11),%xmm1
|
||
|
+
|
||
|
+ pxor %xmm15,%xmm14
|
||
|
+ psrad $31,%xmm9
|
||
|
+.byte 102,15,56,223,84,36,0
|
||
|
paddq %xmm15,%xmm15
|
||
|
- xorps 0(%rsp),%xmm2
|
||
|
pand %xmm8,%xmm9
|
||
|
- xorps 16(%rsp),%xmm3
|
||
|
- pcmpgtd %xmm15,%xmm14
|
||
|
+.byte 102,15,56,223,92,36,16
|
||
|
+.byte 102,15,56,223,100,36,32
|
||
|
pxor %xmm9,%xmm15
|
||
|
-
|
||
|
- xorps 32(%rsp),%xmm4
|
||
|
- movups %xmm2,0(%rsi)
|
||
|
- xorps 48(%rsp),%xmm5
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- xorps 64(%rsp),%xmm6
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- xorps 80(%rsp),%xmm7
|
||
|
- movups %xmm5,48(%rsi)
|
||
|
+.byte 102,15,56,223,108,36,48
|
||
|
+.byte 102,15,56,223,116,36,64
|
||
|
+.byte 102,15,56,223,124,36,80
|
||
|
movl %r10d,%eax
|
||
|
- movups %xmm6,64(%rsi)
|
||
|
- movups %xmm7,80(%rsi)
|
||
|
+
|
||
|
leaq 96(%rsi),%rsi
|
||
|
+ movups %xmm2,-96(%rsi)
|
||
|
+ movups %xmm3,-80(%rsi)
|
||
|
+ movups %xmm4,-64(%rsi)
|
||
|
+ movups %xmm5,-48(%rsi)
|
||
|
+ movups %xmm6,-32(%rsi)
|
||
|
+ movups %xmm7,-16(%rsi)
|
||
|
subq $96,%rdx
|
||
|
jnc L$xts_dec_grandloop
|
||
|
|
||
|
- leal 3(%rax,%rax,1),%eax
|
||
|
+ leal 7(%rax,%rax,1),%eax
|
||
|
movq %r11,%rcx
|
||
|
movl %eax,%r10d
|
||
|
|
||
|
L$xts_dec_short:
|
||
|
+ pxor %xmm0,%xmm10
|
||
|
+ pxor %xmm0,%xmm11
|
||
|
addq $96,%rdx
|
||
|
jz L$xts_dec_done
|
||
|
|
||
|
+ pxor %xmm0,%xmm12
|
||
|
cmpq $32,%rdx
|
||
|
jb L$xts_dec_one
|
||
|
+ pxor %xmm0,%xmm13
|
||
|
je L$xts_dec_two
|
||
|
|
||
|
+ pxor %xmm0,%xmm14
|
||
|
cmpq $64,%rdx
|
||
|
jb L$xts_dec_three
|
||
|
je L$xts_dec_four
|
||
|
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- movdqa %xmm15,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
movdqu (%rdi),%xmm2
|
||
|
- pand %xmm8,%xmm9
|
||
|
movdqu 16(%rdi),%xmm3
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-
|
||
|
movdqu 32(%rdi),%xmm4
|
||
|
pxor %xmm10,%xmm2
|
||
|
movdqu 48(%rdi),%xmm5
|
||
|
@@ -1906,7 +2175,7 @@ L$xts_dec_three:
|
||
|
xorps %xmm10,%xmm2
|
||
|
movdqa %xmm13,%xmm10
|
||
|
xorps %xmm11,%xmm3
|
||
|
- movdqa %xmm15,%xmm11
|
||
|
+ movdqa %xmm14,%xmm11
|
||
|
xorps %xmm12,%xmm4
|
||
|
movups %xmm2,(%rsi)
|
||
|
movups %xmm3,16(%rsi)
|
||
|
@@ -1916,14 +2185,8 @@ L$xts_dec_three:
|
||
|
|
||
|
.p2align 4
|
||
|
L$xts_dec_four:
|
||
|
- pshufd $19,%xmm14,%xmm9
|
||
|
- movdqa %xmm15,%xmm14
|
||
|
- paddq %xmm15,%xmm15
|
||
|
movups (%rdi),%xmm2
|
||
|
- pand %xmm8,%xmm9
|
||
|
movups 16(%rdi),%xmm3
|
||
|
- pxor %xmm9,%xmm15
|
||
|
-
|
||
|
movups 32(%rdi),%xmm4
|
||
|
xorps %xmm10,%xmm2
|
||
|
movups 48(%rdi),%xmm5
|
||
|
@@ -1934,16 +2197,16 @@ L$xts_dec_four:
|
||
|
|
||
|
call _aesni_decrypt4
|
||
|
|
||
|
- xorps %xmm10,%xmm2
|
||
|
+ pxor %xmm10,%xmm2
|
||
|
movdqa %xmm14,%xmm10
|
||
|
- xorps %xmm11,%xmm3
|
||
|
+ pxor %xmm11,%xmm3
|
||
|
movdqa %xmm15,%xmm11
|
||
|
- xorps %xmm12,%xmm4
|
||
|
- movups %xmm2,(%rsi)
|
||
|
- xorps %xmm13,%xmm5
|
||
|
- movups %xmm3,16(%rsi)
|
||
|
- movups %xmm4,32(%rsi)
|
||
|
- movups %xmm5,48(%rsi)
|
||
|
+ pxor %xmm12,%xmm4
|
||
|
+ movdqu %xmm2,(%rsi)
|
||
|
+ pxor %xmm13,%xmm5
|
||
|
+ movdqu %xmm3,16(%rsi)
|
||
|
+ movdqu %xmm4,32(%rsi)
|
||
|
+ movdqu %xmm5,48(%rsi)
|
||
|
leaq 64(%rsi),%rsi
|
||
|
jmp L$xts_dec_done
|
||
|
|
||
|
@@ -2003,7 +2266,8 @@ L$oop_dec1_14:
|
||
|
movups %xmm2,(%rsi)
|
||
|
|
||
|
L$xts_dec_ret:
|
||
|
- leaq 104(%rsp),%rsp
|
||
|
+ leaq (%rbp),%rsp
|
||
|
+ popq %rbp
|
||
|
L$xts_dec_epilogue:
|
||
|
.byte 0xf3,0xc3
|
||
|
|
||
|
@@ -2070,149 +2334,324 @@ L$cbc_enc_tail:
|
||
|
|
||
|
.p2align 4
|
||
|
L$cbc_decrypt:
|
||
|
- movups (%r8),%xmm9
|
||
|
+ leaq (%rsp),%rax
|
||
|
+ pushq %rbp
|
||
|
+ subq $16,%rsp
|
||
|
+ andq $-16,%rsp
|
||
|
+ leaq -8(%rax),%rbp
|
||
|
+ movups (%r8),%xmm10
|
||
|
movl %r10d,%eax
|
||
|
- cmpq $112,%rdx
|
||
|
+ cmpq $80,%rdx
|
||
|
jbe L$cbc_dec_tail
|
||
|
- shrl $1,%r10d
|
||
|
+
|
||
|
+ movups (%rcx),%xmm0
|
||
|
+ movdqu 0(%rdi),%xmm2
|
||
|
+ movdqu 16(%rdi),%xmm3
|
||
|
+ movdqa %xmm2,%xmm11
|
||
|
+ movdqu 32(%rdi),%xmm4
|
||
|
+ movdqa %xmm3,%xmm12
|
||
|
+ movdqu 48(%rdi),%xmm5
|
||
|
+ movdqa %xmm4,%xmm13
|
||
|
+ movdqu 64(%rdi),%xmm6
|
||
|
+ movdqa %xmm5,%xmm14
|
||
|
+ movdqu 80(%rdi),%xmm7
|
||
|
+ movdqa %xmm6,%xmm15
|
||
|
+ cmpq $112,%rdx
|
||
|
+ jbe L$cbc_dec_six_or_seven
|
||
|
+
|
||
|
subq $112,%rdx
|
||
|
- movl %r10d,%eax
|
||
|
- movaps %xmm9,-24(%rsp)
|
||
|
+ leaq 112(%rcx),%rcx
|
||
|
jmp L$cbc_dec_loop8_enter
|
||
|
.p2align 4
|
||
|
L$cbc_dec_loop8:
|
||
|
- movaps %xmm0,-24(%rsp)
|
||
|
movups %xmm9,(%rsi)
|
||
|
leaq 16(%rsi),%rsi
|
||
|
L$cbc_dec_loop8_enter:
|
||
|
- movups (%rcx),%xmm0
|
||
|
- movups (%rdi),%xmm2
|
||
|
- movups 16(%rdi),%xmm3
|
||
|
- movups 16(%rcx),%xmm1
|
||
|
+ movdqu 96(%rdi),%xmm8
|
||
|
+ pxor %xmm0,%xmm2
|
||
|
+ movdqu 112(%rdi),%xmm9
|
||
|
+ pxor %xmm0,%xmm3
|
||
|
+ movups 16-112(%rcx),%xmm1
|
||
|
+ pxor %xmm0,%xmm4
|
||
|
+ xorq %r11,%r11
|
||
|
+ cmpq $112,%rdx
|
||
|
+ pxor %xmm0,%xmm5
|
||
|
+ pxor %xmm0,%xmm6
|
||
|
+ pxor %xmm0,%xmm7
|
||
|
+ pxor %xmm0,%xmm8
|
||
|
|
||
|
- leaq 32(%rcx),%rcx
|
||
|
- movdqu 32(%rdi),%xmm4
|
||
|
- xorps %xmm0,%xmm2
|
||
|
- movdqu 48(%rdi),%xmm5
|
||
|
- xorps %xmm0,%xmm3
|
||
|
- movdqu 64(%rdi),%xmm6
|
||
|
.byte 102,15,56,222,209
|
||
|
- pxor %xmm0,%xmm4
|
||
|
- movdqu 80(%rdi),%xmm7
|
||
|
+ pxor %xmm0,%xmm9
|
||
|
+ movups 32-112(%rcx),%xmm0
|
||
|
.byte 102,15,56,222,217
|
||
|
- pxor %xmm0,%xmm5
|
||
|
- movdqu 96(%rdi),%xmm8
|
||
|
.byte 102,15,56,222,225
|
||
|
- pxor %xmm0,%xmm6
|
||
|
- movdqu 112(%rdi),%xmm9
|
||
|
.byte 102,15,56,222,233
|
||
|
- pxor %xmm0,%xmm7
|
||
|
- decl %eax
|
||
|
.byte 102,15,56,222,241
|
||
|
- pxor %xmm0,%xmm8
|
||
|
.byte 102,15,56,222,249
|
||
|
- pxor %xmm0,%xmm9
|
||
|
- movups (%rcx),%xmm0
|
||
|
+ setnc %r11b
|
||
|
.byte 102,68,15,56,222,193
|
||
|
+ shlq $7,%r11
|
||
|
.byte 102,68,15,56,222,201
|
||
|
- movups 16(%rcx),%xmm1
|
||
|
-
|
||
|
- call L$dec_loop8_enter
|
||
|
+ addq %rdi,%r11
|
||
|
+ movups 48-112(%rcx),%xmm1
|
||
|
+.byte 102,15,56,222,208
|
||
|
+.byte 102,15,56,222,216
|
||
|
+.byte 102,15,56,222,224
|
||
|
+.byte 102,15,56,222,232
|
||
|
+.byte 102,15,56,222,240
|
||
|
+.byte 102,15,56,222,248
|
||
|
+.byte 102,68,15,56,222,192
|
||
|
+.byte 102,68,15,56,222,200
|
||
|
+ movups 64-112(%rcx),%xmm0
|
||
|
+.byte 102,15,56,222,209
|
||
|
+.byte 102,15,56,222,217
|
||
|
+.byte 102,15,56,222,225
|
||
|
+.byte 102,15,56,222,233
|
||
|
+.byte 102,15,56,222,241
|
||
|
+.byte 102,15,56,222,249
|
||
|
+.byte 102,68,15,56,222,193
|
||
|
+.byte 102,68,15,56,222,201
|
||
|
+ movups 80-112(%rcx),%xmm1
|
||
|
+.byte 102,15,56,222,208
|
||
|
+.byte 102,15,56,222,216
|
||
|
+.byte 102,15,56,222,224
|
||
|
+.byte 102,15,56,222,232
|
||
|
+.byte 102,15,56,222,240
|
||
|
+.byte 102,15,56,222,248
|
||
|
+.byte 102,68,15,56,222,192
|
||
|
+.byte 102,68,15,56,222,200
|
||
|
+ movups 96-112(%rcx),%xmm0
|
||
|
+.byte 102,15,56,222,209
|
||
|
+.byte 102,15,56,222,217
|
||
|
+.byte 102,15,56,222,225
|
||
|
+.byte 102,15,56,222,233
|
||
|
+.byte 102,15,56,222,241
|
||
|
+.byte 102,15,56,222,249
|
||
|
+.byte 102,68,15,56,222,193
|
||
|
+.byte 102,68,15,56,222,201
|
||
|
+ movups 112-112(%rcx),%xmm1
|
||
|
+.byte 102,15,56,222,208
|
||
|
+.byte 102,15,56,222,216
|
||
|
+.byte 102,15,56,222,224
|
||
|
+.byte 102,15,56,222,232
|
||
|
+.byte 102,15,56,222,240
|
||
|
+.byte 102,15,56,222,248
|
||
|
+.byte 102,68,15,56,222,192
|
||
|
+.byte 102,68,15,56,222,200
|
||
|
+ movups 128-112(%rcx),%xmm0
|
||
|
+.byte 102,15,56,222,209
|
||
|
+.byte 102,15,56,222,217
|
||
|
+.byte 102,15,56,222,225
|
||
|
+.byte 102,15,56,222,233
|
||
|
+.byte 102,15,56,222,241
|
||
|
+.byte 102,15,56,222,249
|
||
|
+.byte 102,68,15,56,222,193
|
||
|
+.byte 102,68,15,56,222,201
|
||
|
+ movups 144-112(%rcx),%xmm1
|
||
|
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 160-112(%rcx),%xmm0
+ cmpl $11,%eax
+ jb L$cbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 176-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 192-112(%rcx),%xmm0
+ je L$cbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 208-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 224-112(%rcx),%xmm0
+L$cbc_dec_done:
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm10
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm11
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm12
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm13
+.byte 102,15,56,222,241
+ pxor %xmm0,%xmm14
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm15
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movdqu 80(%rdi),%xmm1
+
+.byte 102,65,15,56,223,210
+ movdqu 96(%rdi),%xmm10
+ pxor %xmm0,%xmm1
+.byte 102,65,15,56,223,219
+ pxor %xmm0,%xmm10
+ movdqu 112(%rdi),%xmm0
+ leaq 128(%rdi),%rdi
+.byte 102,65,15,56,223,228
+ movdqu 0(%r11),%xmm11
+.byte 102,65,15,56,223,237
+ movdqu 16(%r11),%xmm12
+.byte 102,65,15,56,223,246
+ movdqu 32(%r11),%xmm13
+.byte 102,65,15,56,223,255
+ movdqu 48(%r11),%xmm14
+.byte 102,68,15,56,223,193
+ movdqu 64(%r11),%xmm15
+.byte 102,69,15,56,223,202
+ movdqa %xmm0,%xmm10
+ movdqu 80(%r11),%xmm1
+ movups -112(%rcx),%xmm0

- movups (%rdi),%xmm1
- movups 16(%rdi),%xmm0
- xorps -24(%rsp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%rdi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%rdi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%rdi),%xmm1
- xorps %xmm0,%xmm6
- movups 80(%rdi),%xmm0
- xorps %xmm1,%xmm7
- movups 96(%rdi),%xmm1
- xorps %xmm0,%xmm8
- movups 112(%rdi),%xmm0
- xorps %xmm1,%xmm9
 movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
 movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
 movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
 movups %xmm5,48(%rsi)
- movl %r10d,%eax
+ movdqa %xmm14,%xmm5
 movups %xmm6,64(%rsi)
- movq %r11,%rcx
+ movdqa %xmm15,%xmm6
 movups %xmm7,80(%rsi)
- leaq 128(%rdi),%rdi
+ movdqa %xmm1,%xmm7
 movups %xmm8,96(%rsi)
 leaq 112(%rsi),%rsi
+
 subq $128,%rdx
 ja L$cbc_dec_loop8

 movaps %xmm9,%xmm2
- movaps %xmm0,%xmm9
+ leaq -112(%rcx),%rcx
 addq $112,%rdx
 jle L$cbc_dec_tail_collected
- movups %xmm2,(%rsi)
- leal 1(%r10,%r10,1),%eax
+ movups %xmm9,(%rsi)
 leaq 16(%rsi),%rsi
+ cmpq $80,%rdx
+ jbe L$cbc_dec_tail
+
+ movaps %xmm11,%xmm2
+L$cbc_dec_six_or_seven:
+ cmpq $96,%rdx
+ ja L$cbc_dec_seven
+
+ movaps %xmm7,%xmm8
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ movdqa %xmm7,%xmm2
+ jmp L$cbc_dec_tail_collected
+
+.p2align 4
+L$cbc_dec_seven:
+ movups 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
+ call _aesni_decrypt8
+ movups 80(%rdi),%xmm9
+ pxor %xmm10,%xmm2
+ movups 96(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ pxor %xmm9,%xmm8
+ movdqu %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movdqa %xmm8,%xmm2
+ jmp L$cbc_dec_tail_collected
+
 L$cbc_dec_tail:
 movups (%rdi),%xmm2
- movaps %xmm2,%xmm8
- cmpq $16,%rdx
+ subq $16,%rdx
 jbe L$cbc_dec_one

 movups 16(%rdi),%xmm3
- movaps %xmm3,%xmm7
- cmpq $32,%rdx
+ movaps %xmm2,%xmm11
+ subq $16,%rdx
 jbe L$cbc_dec_two

 movups 32(%rdi),%xmm4
- movaps %xmm4,%xmm6
- cmpq $48,%rdx
+ movaps %xmm3,%xmm12
+ subq $16,%rdx
 jbe L$cbc_dec_three

 movups 48(%rdi),%xmm5
- cmpq $64,%rdx
+ movaps %xmm4,%xmm13
+ subq $16,%rdx
 jbe L$cbc_dec_four

 movups 64(%rdi),%xmm6
- cmpq $80,%rdx
- jbe L$cbc_dec_five
-
- movups 80(%rdi),%xmm7
- cmpq $96,%rdx
- jbe L$cbc_dec_six
-
- movups 96(%rdi),%xmm8
- movaps %xmm9,-24(%rsp)
- call _aesni_decrypt8
- movups (%rdi),%xmm1
- movups 16(%rdi),%xmm0
- xorps -24(%rsp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%rdi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%rdi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%rdi),%xmm1
- xorps %xmm0,%xmm6
- movups 80(%rdi),%xmm0
- xorps %xmm1,%xmm7
- movups 96(%rdi),%xmm9
- xorps %xmm0,%xmm8
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
- leaq 96(%rsi),%rsi
- movaps %xmm8,%xmm2
- subq $112,%rdx
+ movaps %xmm5,%xmm14
+ movaps %xmm6,%xmm15
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm15,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ movdqa %xmm6,%xmm2
+ subq $16,%rdx
 jmp L$cbc_dec_tail_collected
+
 .p2align 4
 L$cbc_dec_one:
+ movaps %xmm2,%xmm11
 movups (%rcx),%xmm0
 movups 16(%rcx),%xmm1
 leaq 32(%rcx),%rcx
@@ -2224,111 +2663,69 @@ L$oop_dec1_16:
 leaq 16(%rcx),%rcx
 jnz L$oop_dec1_16
 .byte 102,15,56,223,209
- xorps %xmm9,%xmm2
- movaps %xmm8,%xmm9
- subq $16,%rdx
+ xorps %xmm10,%xmm2
+ movaps %xmm11,%xmm10
 jmp L$cbc_dec_tail_collected
 .p2align 4
 L$cbc_dec_two:
+ movaps %xmm3,%xmm12
 xorps %xmm4,%xmm4
 call _aesni_decrypt3
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- movaps %xmm7,%xmm9
- movaps %xmm3,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm12,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ movdqa %xmm3,%xmm2
 leaq 16(%rsi),%rsi
- subq $32,%rdx
 jmp L$cbc_dec_tail_collected
 .p2align 4
 L$cbc_dec_three:
+ movaps %xmm4,%xmm13
 call _aesni_decrypt3
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- xorps %xmm7,%xmm4
- movups %xmm3,16(%rsi)
- movaps %xmm6,%xmm9
- movaps %xmm4,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm13,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ movdqa %xmm4,%xmm2
 leaq 32(%rsi),%rsi
- subq $48,%rdx
 jmp L$cbc_dec_tail_collected
 .p2align 4
 L$cbc_dec_four:
+ movaps %xmm5,%xmm14
 call _aesni_decrypt4
- xorps %xmm9,%xmm2
- movups 48(%rdi),%xmm9
- xorps %xmm8,%xmm3
- movups %xmm2,(%rsi)
- xorps %xmm7,%xmm4
- movups %xmm3,16(%rsi)
- xorps %xmm6,%xmm5
- movups %xmm4,32(%rsi)
- movaps %xmm5,%xmm2
+ pxor %xmm10,%xmm2
+ movaps %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ movdqa %xmm5,%xmm2
 leaq 48(%rsi),%rsi
- subq $64,%rdx
- jmp L$cbc_dec_tail_collected
-.p2align 4
-L$cbc_dec_five:
- xorps %xmm7,%xmm7
- call _aesni_decrypt6
- movups 16(%rdi),%xmm1
- movups 32(%rdi),%xmm0
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- xorps %xmm1,%xmm4
- movups 48(%rdi),%xmm1
- xorps %xmm0,%xmm5
- movups 64(%rdi),%xmm9
- xorps %xmm1,%xmm6
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- leaq 64(%rsi),%rsi
- movaps %xmm6,%xmm2
- subq $80,%rdx
- jmp L$cbc_dec_tail_collected
-.p2align 4
-L$cbc_dec_six:
- call _aesni_decrypt6
- movups 16(%rdi),%xmm1
- movups 32(%rdi),%xmm0
- xorps %xmm9,%xmm2
- xorps %xmm8,%xmm3
- xorps %xmm1,%xmm4
- movups 48(%rdi),%xmm1
- xorps %xmm0,%xmm5
- movups 64(%rdi),%xmm0
- xorps %xmm1,%xmm6
- movups 80(%rdi),%xmm9
- xorps %xmm0,%xmm7
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- leaq 80(%rsi),%rsi
- movaps %xmm7,%xmm2
- subq $96,%rdx
 jmp L$cbc_dec_tail_collected
+
 .p2align 4
 L$cbc_dec_tail_collected:
+ movups %xmm10,(%r8)
 andq $15,%rdx
- movups %xmm9,(%r8)
 jnz L$cbc_dec_tail_partial
 movups %xmm2,(%rsi)
 jmp L$cbc_dec_ret
 .p2align 4
 L$cbc_dec_tail_partial:
- movaps %xmm2,-24(%rsp)
+ movaps %xmm2,(%rsp)
 movq $16,%rcx
 movq %rsi,%rdi
 subq %rdx,%rcx
- leaq -24(%rsp),%rsi
+ leaq (%rsp),%rsi
 .long 0x9066A4F3

 L$cbc_dec_ret:
+ leaq (%rbp),%rsp
+ popq %rbp
 L$cbc_ret:
 .byte 0xf3,0xc3

@@ -2571,6 +2968,8 @@ L$increment64:
 .long 1,0,0,0
 L$xts_magic:
 .long 0x87,0,1,0
+L$increment1:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1

 .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .p2align 6
diff --git a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s
index b9ec30c..1327e82 100644
--- a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s
+++ b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s
@@ -597,6 +597,468 @@ L$cbc_abort:
 popq %rbp
 .byte 0xf3,0xc3

+.globl _padlock_cfb_encrypt
+
+.p2align 4
+_padlock_cfb_encrypt:
+ pushq %rbp
+ pushq %rbx
+
+ xorl %eax,%eax
+ testq $15,%rdx
+ jnz L$cfb_abort
+ testq $15,%rcx
+ jnz L$cfb_abort
+ leaq L$padlock_saved_context(%rip),%rax
+ pushf
+ cld
+ call _padlock_verify_ctx
+ leaq 16(%rdx),%rdx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%rdx)
+ jnz L$cfb_aligned
+ testq $15,%rdi
+ setz %al
+ testq $15,%rsi
+ setz %bl
+ testl %ebx,%eax
+ jnz L$cfb_aligned
+ negq %rax
+ movq $512,%rbx
+ notq %rax
+ leaq (%rsp),%rbp
+ cmpq %rbx,%rcx
+ cmovcq %rcx,%rbx
+ andq %rbx,%rax
+ movq %rcx,%rbx
+ negq %rax
+ andq $512-1,%rbx
+ leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ jmp L$cfb_loop
+.p2align 4
+L$cfb_loop:
+ cmpq %rcx,%rbx
+ cmovaq %rcx,%rbx
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+ testq $15,%rdi
+ cmovnzq %rsp,%rdi
+ testq $15,%rsi
+ jz L$cfb_inp_aligned
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+ movq %rbx,%rcx
+ movq %rdi,%rsi
+L$cfb_inp_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,224
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+ movq %r8,%rdi
+ movq %r11,%rbx
+ testq $15,%rdi
+ jz L$cfb_out_aligned
+ movq %rbx,%rcx
+ leaq (%rsp),%rsi
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+L$cfb_out_aligned:
+ movq %r9,%rsi
+ movq %r10,%rcx
+ addq %rbx,%rdi
+ addq %rbx,%rsi
+ subq %rbx,%rcx
+ movq $512,%rbx
+ jnz L$cfb_loop
+ cmpq %rbp,%rsp
+ je L$cfb_done
+
+ pxor %xmm0,%xmm0
+ leaq (%rsp),%rax
+L$cfb_bzero:
+ movaps %xmm0,(%rax)
+ leaq 16(%rax),%rax
+ cmpq %rax,%rbp
+ ja L$cfb_bzero
+
+L$cfb_done:
+ leaq (%rbp),%rsp
+ jmp L$cfb_exit
+
+.p2align 4
+L$cfb_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,224
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+L$cfb_exit:
+ movl $1,%eax
+ leaq 8(%rsp),%rsp
+L$cfb_abort:
+ popq %rbx
+ popq %rbp
+ .byte 0xf3,0xc3
+
+.globl _padlock_ofb_encrypt
+
+.p2align 4
+_padlock_ofb_encrypt:
+ pushq %rbp
+ pushq %rbx
+
+ xorl %eax,%eax
+ testq $15,%rdx
+ jnz L$ofb_abort
+ testq $15,%rcx
+ jnz L$ofb_abort
+ leaq L$padlock_saved_context(%rip),%rax
+ pushf
+ cld
+ call _padlock_verify_ctx
+ leaq 16(%rdx),%rdx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%rdx)
+ jnz L$ofb_aligned
+ testq $15,%rdi
+ setz %al
+ testq $15,%rsi
+ setz %bl
+ testl %ebx,%eax
+ jnz L$ofb_aligned
+ negq %rax
+ movq $512,%rbx
+ notq %rax
+ leaq (%rsp),%rbp
+ cmpq %rbx,%rcx
+ cmovcq %rcx,%rbx
+ andq %rbx,%rax
+ movq %rcx,%rbx
+ negq %rax
+ andq $512-1,%rbx
+ leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+ jmp L$ofb_loop
+.p2align 4
+L$ofb_loop:
+ cmpq %rcx,%rbx
+ cmovaq %rcx,%rbx
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+ testq $15,%rdi
+ cmovnzq %rsp,%rdi
+ testq $15,%rsi
+ jz L$ofb_inp_aligned
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+ movq %rbx,%rcx
+ movq %rdi,%rsi
+L$ofb_inp_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,232
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+ movq %r8,%rdi
+ movq %r11,%rbx
+ testq $15,%rdi
+ jz L$ofb_out_aligned
+ movq %rbx,%rcx
+ leaq (%rsp),%rsi
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+L$ofb_out_aligned:
+ movq %r9,%rsi
+ movq %r10,%rcx
+ addq %rbx,%rdi
+ addq %rbx,%rsi
+ subq %rbx,%rcx
+ movq $512,%rbx
+ jnz L$ofb_loop
+ cmpq %rbp,%rsp
+ je L$ofb_done
+
+ pxor %xmm0,%xmm0
+ leaq (%rsp),%rax
+L$ofb_bzero:
+ movaps %xmm0,(%rax)
+ leaq 16(%rax),%rax
+ cmpq %rax,%rbp
+ ja L$ofb_bzero
+
+L$ofb_done:
+ leaq (%rbp),%rsp
+ jmp L$ofb_exit
+
+.p2align 4
+L$ofb_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,232
+ movdqa (%rax),%xmm0
+ movdqa %xmm0,-16(%rdx)
+L$ofb_exit:
+ movl $1,%eax
+ leaq 8(%rsp),%rsp
+L$ofb_abort:
+ popq %rbx
+ popq %rbp
+ .byte 0xf3,0xc3
+
+.globl _padlock_ctr32_encrypt
+
+.p2align 4
+_padlock_ctr32_encrypt:
+ pushq %rbp
+ pushq %rbx
+
+ xorl %eax,%eax
+ testq $15,%rdx
+ jnz L$ctr32_abort
+ testq $15,%rcx
+ jnz L$ctr32_abort
+ leaq L$padlock_saved_context(%rip),%rax
+ pushf
+ cld
+ call _padlock_verify_ctx
+ leaq 16(%rdx),%rdx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%rdx)
+ jnz L$ctr32_aligned
+ testq $15,%rdi
+ setz %al
+ testq $15,%rsi
+ setz %bl
+ testl %ebx,%eax
+ jnz L$ctr32_aligned
+ negq %rax
+ movq $512,%rbx
+ notq %rax
+ leaq (%rsp),%rbp
+ cmpq %rbx,%rcx
+ cmovcq %rcx,%rbx
+ andq %rbx,%rax
+ movq %rcx,%rbx
+ negq %rax
+ andq $512-1,%rbx
+ leaq (%rax,%rbp,1),%rsp
+ movq $512,%rax
+ cmovzq %rax,%rbx
+L$ctr32_reenter:
+ movl -4(%rdx),%eax
+ bswapl %eax
+ negl %eax
+ andl $31,%eax
+ movq $512,%rbx
+ shll $4,%eax
+ cmovzq %rbx,%rax
+ cmpq %rax,%rcx
+ cmovaq %rax,%rbx
+ cmovbeq %rcx,%rbx
+ cmpq %rbx,%rcx
+ ja L$ctr32_loop
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jz L$ctr32_unaligned_tail
+ jmp L$ctr32_loop
+.p2align 4
+L$ctr32_loop:
+ cmpq %rcx,%rbx
+ cmovaq %rcx,%rbx
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+ testq $15,%rdi
+ cmovnzq %rsp,%rdi
+ testq $15,%rsi
+ jz L$ctr32_inp_aligned
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+ movq %rbx,%rcx
+ movq %rdi,%rsi
+L$ctr32_inp_aligned:
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+ movl -4(%rdx),%eax
+ testl $4294901760,%eax
+ jnz L$ctr32_no_carry
+ bswapl %eax
+ addl $65536,%eax
+ bswapl %eax
+ movl %eax,-4(%rdx)
+L$ctr32_no_carry:
+ movq %r8,%rdi
+ movq %r11,%rbx
+ testq $15,%rdi
+ jz L$ctr32_out_aligned
+ movq %rbx,%rcx
+ leaq (%rsp),%rsi
+ shrq $3,%rcx
+.byte 0xf3,0x48,0xa5
+ subq %rbx,%rdi
+L$ctr32_out_aligned:
+ movq %r9,%rsi
+ movq %r10,%rcx
+ addq %rbx,%rdi
+ addq %rbx,%rsi
+ subq %rbx,%rcx
+ movq $512,%rbx
+ jz L$ctr32_break
+ cmpq %rbx,%rcx
+ jae L$ctr32_loop
+ movq %rcx,%rbx
+ movq %rsi,%rax
+ cmpq %rsp,%rbp
+ cmoveq %rdi,%rax
+ addq %rcx,%rax
+ negq %rax
+ andq $4095,%rax
+ cmpq $32,%rax
+ movq $-32,%rax
+ cmovaeq %rbx,%rax
+ andq %rax,%rbx
+ jnz L$ctr32_loop
+L$ctr32_unaligned_tail:
+ xorl %eax,%eax
+ cmpq %rsp,%rbp
+ cmoveq %rcx,%rax
+ movq %rdi,%r8
+ movq %rcx,%rbx
+ subq %rax,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ movq %rsp,%rsi
+ movq %r8,%rdi
+ movq %rbx,%rcx
+ jmp L$ctr32_loop
+.p2align 4
+L$ctr32_break:
+ cmpq %rbp,%rsp
+ je L$ctr32_done
+
+ pxor %xmm0,%xmm0
+ leaq (%rsp),%rax
+L$ctr32_bzero:
+ movaps %xmm0,(%rax)
+ leaq 16(%rax),%rax
+ cmpq %rax,%rbp
+ ja L$ctr32_bzero
+
+L$ctr32_done:
+ leaq (%rbp),%rsp
+ jmp L$ctr32_exit
+
+.p2align 4
+L$ctr32_aligned:
+ movl -4(%rdx),%eax
+ bswapl %eax
+ negl %eax
+ andl $65535,%eax
+ movq $1048576,%rbx
+ shll $4,%eax
+ cmovzq %rbx,%rax
+ cmpq %rax,%rcx
+ cmovaq %rax,%rbx
+ cmovbeq %rcx,%rbx
+ jbe L$ctr32_aligned_skip
+
+L$ctr32_aligned_loop:
+ movq %rcx,%r10
+ movq %rbx,%rcx
+ movq %rbx,%r11
+
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+
+ movl -4(%rdx),%eax
+ bswapl %eax
+ addl $65536,%eax
+ bswapl %eax
+ movl %eax,-4(%rdx)
+
+ movq %r10,%rcx
+ subq %r11,%rcx
+ movq $1048576,%rbx
+ jz L$ctr32_exit
+ cmpq %rbx,%rcx
+ jae L$ctr32_aligned_loop
+
+L$ctr32_aligned_skip:
+ leaq (%rsi,%rcx,1),%rbp
+ negq %rbp
+ andq $4095,%rbp
+ xorl %eax,%eax
+ cmpq $32,%rbp
+ movq $32-1,%rbp
+ cmovaeq %rax,%rbp
+ andq %rcx,%rbp
+ subq %rbp,%rcx
+ jz L$ctr32_aligned_tail
+ leaq -16(%rdx),%rax
+ leaq 16(%rdx),%rbx
+ shrq $4,%rcx
+.byte 0xf3,0x0f,0xa7,216
+ testq %rbp,%rbp
+ jz L$ctr32_exit
+
+L$ctr32_aligned_tail:
+ movq %rdi,%r8
+ movq %rbp,%rbx
+ movq %rbp,%rcx
+ leaq (%rsp),%rbp
+ subq %rcx,%rsp
+ shrq $3,%rcx
+ leaq (%rsp),%rdi
+.byte 0xf3,0x48,0xa5
+ leaq (%r8),%rdi
+ leaq (%rsp),%rsi
+ movq %rbx,%rcx
+ jmp L$ctr32_loop
+L$ctr32_exit:
+ movl $1,%eax
+ leaq 8(%rsp),%rsp
+L$ctr32_abort:
+ popq %rbx
+ popq %rbp
+ .byte 0xf3,0xc3
+
 .byte 86,73,65,32,80,97,100,108,111,99,107,32,120,56,54,95,54,52,32,109,111,100,117,108,101,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .p2align 4
 .data
diff --git a/lib/accelerated/x86/macosx/padlock-x86-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-macosx.s
index 7a38b7c..1a2fa92 100644
--- a/lib/accelerated/x86/macosx/padlock-x86-macosx.s
+++ b/lib/accelerated/x86/macosx/padlock-x86-macosx.s
@@ -510,6 +510,351 @@ L016cbc_abort:
 popl %ebx
 popl %ebp
 ret
+.globl _padlock_cfb_encrypt
+.align 4
+_padlock_cfb_encrypt:
+L_padlock_cfb_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ testl $15,%edx
+ jnz L028cfb_abort
+ testl $15,%ecx
+ jnz L028cfb_abort
+ leal Lpadlock_saved_context-L029cfb_pic_point,%eax
+ pushfl
+ cld
+ call __padlock_verify_ctx
+L029cfb_pic_point:
+ leal 16(%edx),%edx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%edx)
+ jnz L030cfb_aligned
+ testl $15,%edi
+ setz %al
+ testl $15,%esi
+ setz %bl
+ testl %ebx,%eax
+ jnz L030cfb_aligned
+ negl %eax
+ movl $512,%ebx
+ notl %eax
+ leal -24(%esp),%ebp
+ cmpl %ebx,%ecx
+ cmovcl %ecx,%ebx
+ andl %ebx,%eax
+ movl %ecx,%ebx
+ negl %eax
+ andl $511,%ebx
+ leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ jmp L031cfb_loop
+.align 4,0x90
+L031cfb_loop:
+ movl %edi,(%ebp)
+ movl %esi,4(%ebp)
+ movl %ecx,8(%ebp)
+ movl %ebx,%ecx
+ movl %ebx,12(%ebp)
+ testl $15,%edi
+ cmovnzl %esp,%edi
+ testl $15,%esi
+ jz L032cfb_inp_aligned
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+L032cfb_inp_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,224
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+ movl (%ebp),%edi
+ movl 12(%ebp),%ebx
+ testl $15,%edi
+ jz L033cfb_out_aligned
+ movl %ebx,%ecx
+ leal (%esp),%esi
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+L033cfb_out_aligned:
+ movl 4(%ebp),%esi
+ movl 8(%ebp),%ecx
+ addl %ebx,%edi
+ addl %ebx,%esi
+ subl %ebx,%ecx
+ movl $512,%ebx
+ jnz L031cfb_loop
+ cmpl %ebp,%esp
+ je L034cfb_done
+ pxor %xmm0,%xmm0
+ leal (%esp),%eax
+L035cfb_bzero:
+ movaps %xmm0,(%eax)
+ leal 16(%eax),%eax
+ cmpl %eax,%ebp
+ ja L035cfb_bzero
+L034cfb_done:
+ movl 16(%ebp),%ebp
+ leal 24(%ebp),%esp
+ jmp L036cfb_exit
+.align 4,0x90
+L030cfb_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,224
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+L036cfb_exit:
+ movl $1,%eax
+ leal 4(%esp),%esp
+L028cfb_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _padlock_ofb_encrypt
+.align 4
+_padlock_ofb_encrypt:
+L_padlock_ofb_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ testl $15,%edx
+ jnz L037ofb_abort
+ testl $15,%ecx
+ jnz L037ofb_abort
+ leal Lpadlock_saved_context-L038ofb_pic_point,%eax
+ pushfl
+ cld
+ call __padlock_verify_ctx
+L038ofb_pic_point:
+ leal 16(%edx),%edx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ testl $32,(%edx)
+ jnz L039ofb_aligned
+ testl $15,%edi
+ setz %al
+ testl $15,%esi
+ setz %bl
+ testl %ebx,%eax
+ jnz L039ofb_aligned
+ negl %eax
+ movl $512,%ebx
+ notl %eax
+ leal -24(%esp),%ebp
+ cmpl %ebx,%ecx
+ cmovcl %ecx,%ebx
+ andl %ebx,%eax
+ movl %ecx,%ebx
+ negl %eax
+ andl $511,%ebx
+ leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ jmp L040ofb_loop
+.align 4,0x90
+L040ofb_loop:
+ movl %edi,(%ebp)
+ movl %esi,4(%ebp)
+ movl %ecx,8(%ebp)
+ movl %ebx,%ecx
+ movl %ebx,12(%ebp)
+ testl $15,%edi
+ cmovnzl %esp,%edi
+ testl $15,%esi
+ jz L041ofb_inp_aligned
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+L041ofb_inp_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,232
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+ movl (%ebp),%edi
+ movl 12(%ebp),%ebx
+ testl $15,%edi
+ jz L042ofb_out_aligned
+ movl %ebx,%ecx
+ leal (%esp),%esi
+ shrl $2,%ecx
+.byte 243,165
+ subl %ebx,%edi
+L042ofb_out_aligned:
+ movl 4(%ebp),%esi
+ movl 8(%ebp),%ecx
+ addl %ebx,%edi
+ addl %ebx,%esi
+ subl %ebx,%ecx
+ movl $512,%ebx
+ jnz L040ofb_loop
+ cmpl %ebp,%esp
+ je L043ofb_done
+ pxor %xmm0,%xmm0
+ leal (%esp),%eax
+L044ofb_bzero:
+ movaps %xmm0,(%eax)
+ leal 16(%eax),%eax
+ cmpl %eax,%ebp
+ ja L044ofb_bzero
+L043ofb_done:
+ movl 16(%ebp),%ebp
+ leal 24(%ebp),%esp
+ jmp L045ofb_exit
+.align 4,0x90
+L039ofb_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,232
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+L045ofb_exit:
+ movl $1,%eax
+ leal 4(%esp),%esp
+L037ofb_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _padlock_ctr32_encrypt
+.align 4
+_padlock_ctr32_encrypt:
+L_padlock_ctr32_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ testl $15,%edx
+ jnz L046ctr32_abort
+ testl $15,%ecx
+ jnz L046ctr32_abort
+ leal Lpadlock_saved_context-L047ctr32_pic_point,%eax
+ pushfl
+ cld
+ call __padlock_verify_ctx
+L047ctr32_pic_point:
+ leal 16(%edx),%edx
+ xorl %eax,%eax
+ movq -16(%edx),%mm0
+ movl $512,%ebx
+ notl %eax
+ leal -24(%esp),%ebp
+ cmpl %ebx,%ecx
+ cmovcl %ecx,%ebx
+ andl %ebx,%eax
+ movl %ecx,%ebx
+ negl %eax
+ andl $511,%ebx
+ leal (%eax,%ebp,1),%esp
+ movl $512,%eax
+ cmovzl %eax,%ebx
+ movl %ebp,%eax
+ andl $-16,%ebp
+ andl $-16,%esp
+ movl %eax,16(%ebp)
+ jmp L048ctr32_loop
+.align 4,0x90
+L048ctr32_loop:
+ movl %edi,(%ebp)
+ movl %esi,4(%ebp)
+ movl %ecx,8(%ebp)
+ movl %ebx,%ecx
+ movl %ebx,12(%ebp)
+ movl -4(%edx),%ecx
+ xorl %edi,%edi
+ movl -8(%edx),%eax
+L049ctr32_prepare:
+ movl %ecx,12(%esp,%edi,1)
+ bswap %ecx
+ movq %mm0,(%esp,%edi,1)
+ incl %ecx
+ movl %eax,8(%esp,%edi,1)
+ bswap %ecx
+ leal 16(%edi),%edi
+ cmpl %ebx,%edi
+ jb L049ctr32_prepare
+ movl %ecx,-4(%edx)
+ leal (%esp),%esi
+ leal (%esp),%edi
+ movl %ebx,%ecx
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+.byte 243,15,167,200
+ movl (%ebp),%edi
+ movl 12(%ebp),%ebx
+ movl 4(%ebp),%esi
+ xorl %ecx,%ecx
+L050ctr32_xor:
+ movups (%esi,%ecx,1),%xmm1
+ leal 16(%ecx),%ecx
+ pxor -16(%esp,%ecx,1),%xmm1
+ movups %xmm1,-16(%edi,%ecx,1)
+ cmpl %ebx,%ecx
+ jb L050ctr32_xor
+ movl 8(%ebp),%ecx
+ addl %ebx,%edi
+ addl %ebx,%esi
+ subl %ebx,%ecx
+ movl $512,%ebx
+ jnz L048ctr32_loop
+ pxor %xmm0,%xmm0
+ leal (%esp),%eax
+L051ctr32_bzero:
+ movaps %xmm0,(%eax)
+ leal 16(%eax),%eax
+ cmpl %eax,%ebp
+ ja L051ctr32_bzero
+L052ctr32_done:
+ movl 16(%ebp),%ebp
+ leal 24(%ebp),%esp
+ movl $1,%eax
+ leal 4(%esp),%esp
+ emms
+L046ctr32_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
 .globl _padlock_xstore
 .align 4
 _padlock_xstore:
@@ -526,10 +871,10 @@ __win32_segv_handler:
 movl 4(%esp),%edx
 movl 12(%esp),%ecx
 cmpl $3221225477,(%edx)
- jne L028ret
+ jne L053ret
 addl $4,184(%ecx)
 movl $0,%eax
-L028ret:
+L053ret:
 ret
 .globl _padlock_sha1_oneshot
 .align 4
--
1.8.4.2
