mirror of
https://github.com/oxen-io/session-android.git
synced 2025-01-11 18:03:39 +00:00
d83a3d71bc
Merge in RedPhone // FREEBIE
7835 lines
207 KiB
Diff
7835 lines
207 KiB
Diff
diff --git a/Configure b/Configure
|
||
index de78469..26743bb 100755
|
||
--- a/Configure
|
||
+++ b/Configure
|
||
@@ -136,7 +136,8 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-a
|
||
my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
|
||
my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::";
|
||
my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:";
|
||
-my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::void";
|
||
+my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o bsaes-armv7.o aesv8-armx.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o ghashv8-armx.o::void";
|
||
+my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o:::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o:";
|
||
my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32";
|
||
my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64";
|
||
my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::";
|
||
@@ -350,6 +351,7 @@ my %table=(
|
||
# It's believed that majority of ARM toolchains predefine appropriate -march.
|
||
# If you compiler does not, do complement config command line with one!
|
||
"linux-armv4", "gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||
+"linux-aarch64","gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${aarch64_asm}:linux64:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||
#### IA-32 targets...
|
||
"linux-ia32-icc", "icc:-DL_ENDIAN -DTERMIO -O2 -no_cpprt::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-KPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||
"linux-elf", "gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||
@@ -1503,7 +1505,7 @@ if ($rmd160_obj =~ /\.o$/)
|
||
}
|
||
if ($aes_obj =~ /\.o$/)
|
||
{
|
||
- $cflags.=" -DAES_ASM";
|
||
+ $cflags.=" -DAES_ASM" if ($aes_obj =~ m/\baes\-/);;
|
||
# aes-ctr.o is not a real file, only indication that assembler
|
||
# module implements AES_ctr32_encrypt...
|
||
$cflags.=" -DAES_CTR_ASM" if ($aes_obj =~ s/\s*aes\-ctr\.o//);
|
||
@@ -1525,7 +1527,7 @@ else {
|
||
$wp_obj="wp_block.o";
|
||
}
|
||
$cmll_obj=$cmll_enc unless ($cmll_obj =~ /.o$/);
|
||
-if ($modes_obj =~ /ghash/)
|
||
+if ($modes_obj =~ /ghash\-/)
|
||
{
|
||
$cflags.=" -DGHASH_ASM";
|
||
}
|
||
diff --git a/config b/config
|
||
index 41fa2a6..dff7df7 100755
|
||
--- a/config
|
||
+++ b/config
|
||
@@ -644,6 +644,7 @@ case "$GUESSOS" in
|
||
armv[1-3]*-*-linux2) OUT="linux-generic32" ;;
|
||
armv[7-9]*-*-linux2) OUT="linux-armv4"; options="$options -march=armv7-a" ;;
|
||
arm*-*-linux2) OUT="linux-armv4" ;;
|
||
+ aarch64-*-linux2) OUT="linux-aarch64" ;;
|
||
sh*b-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;;
|
||
sh*-*-linux2) OUT="linux-generic32"; options="$options -DL_ENDIAN" ;;
|
||
m68k*-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;;
|
||
diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile
|
||
index 45ede0a..9181a1a 100644
|
||
--- a/crypto/aes/Makefile
|
||
+++ b/crypto/aes/Makefile
|
||
@@ -78,9 +78,15 @@ aes-parisc.s: asm/aes-parisc.pl
|
||
aes-mips.S: asm/aes-mips.pl
|
||
$(PERL) asm/aes-mips.pl $(PERLASM_SCHEME) $@
|
||
|
||
+aesv8-armx.S: asm/aesv8-armx.pl
|
||
+ $(PERL) asm/aesv8-armx.pl $(PERLASM_SCHEME) $@
|
||
+aesv8-armx.o: aesv8-armx.S
|
||
+
|
||
# GNU make "catch all"
|
||
aes-%.S: asm/aes-%.pl; $(PERL) $< $(PERLASM_SCHEME) > $@
|
||
aes-armv4.o: aes-armv4.S
|
||
+bsaes-%.S: asm/bsaes-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
|
||
+bsaes-armv7.o: bsaes-armv7.S
|
||
|
||
files:
|
||
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
|
||
diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
|
||
index 86b86c4..4f89170 100644
|
||
--- a/crypto/aes/asm/aes-armv4.pl
|
||
+++ b/crypto/aes/asm/aes-armv4.pl
|
||
@@ -1,7 +1,7 @@
|
||
#!/usr/bin/env perl
|
||
|
||
# ====================================================================
|
||
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||
# project. The module is, however, dual licensed under OpenSSL and
|
||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||
@@ -51,9 +51,23 @@ $key="r11";
|
||
$rounds="r12";
|
||
|
||
$code=<<___;
|
||
-#include "arm_arch.h"
|
||
+#ifndef __KERNEL__
|
||
+# include "arm_arch.h"
|
||
+#else
|
||
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
|
||
+#endif
|
||
+
|
||
.text
|
||
+#if __ARM_ARCH__<7
|
||
+.code 32
|
||
+#else
|
||
+.syntax unified
|
||
+# ifdef __thumb2__
|
||
+.thumb
|
||
+# else
|
||
.code 32
|
||
+# endif
|
||
+#endif
|
||
|
||
.type AES_Te,%object
|
||
.align 5
|
||
@@ -167,7 +181,11 @@ AES_Te:
|
||
.type AES_encrypt,%function
|
||
.align 5
|
||
AES_encrypt:
|
||
+#if __ARM_ARCH__<7
|
||
sub r3,pc,#8 @ AES_encrypt
|
||
+#else
|
||
+ adr r3,AES_encrypt
|
||
+#endif
|
||
stmdb sp!,{r1,r4-r12,lr}
|
||
mov $rounds,r0 @ inp
|
||
mov $key,r2
|
||
@@ -409,11 +427,21 @@ _armv4_AES_encrypt:
|
||
.align 5
|
||
private_AES_set_encrypt_key:
|
||
_armv4_AES_set_encrypt_key:
|
||
+#if __ARM_ARCH__<7
|
||
sub r3,pc,#8 @ AES_set_encrypt_key
|
||
+#else
|
||
+ adr r3,private_AES_set_encrypt_key
|
||
+#endif
|
||
teq r0,#0
|
||
+#if __ARM_ARCH__>=7
|
||
+ itt eq @ Thumb2 thing, sanity check in ARM
|
||
+#endif
|
||
moveq r0,#-1
|
||
beq .Labrt
|
||
teq r2,#0
|
||
+#if __ARM_ARCH__>=7
|
||
+ itt eq @ Thumb2 thing, sanity check in ARM
|
||
+#endif
|
||
moveq r0,#-1
|
||
beq .Labrt
|
||
|
||
@@ -422,6 +450,9 @@ _armv4_AES_set_encrypt_key:
|
||
teq r1,#192
|
||
beq .Lok
|
||
teq r1,#256
|
||
+#if __ARM_ARCH__>=7
|
||
+ itt ne @ Thumb2 thing, sanity check in ARM
|
||
+#endif
|
||
movne r0,#-1
|
||
bne .Labrt
|
||
|
||
@@ -576,6 +607,9 @@ _armv4_AES_set_encrypt_key:
|
||
str $s2,[$key,#-16]
|
||
subs $rounds,$rounds,#1
|
||
str $s3,[$key,#-12]
|
||
+#if __ARM_ARCH__>=7
|
||
+ itt eq @ Thumb2 thing, sanity check in ARM
|
||
+#endif
|
||
subeq r2,$key,#216
|
||
beq .Ldone
|
||
|
||
@@ -645,6 +679,9 @@ _armv4_AES_set_encrypt_key:
|
||
str $s2,[$key,#-24]
|
||
subs $rounds,$rounds,#1
|
||
str $s3,[$key,#-20]
|
||
+#if __ARM_ARCH__>=7
|
||
+ itt eq @ Thumb2 thing, sanity check in ARM
|
||
+#endif
|
||
subeq r2,$key,#256
|
||
beq .Ldone
|
||
|
||
@@ -674,11 +711,17 @@ _armv4_AES_set_encrypt_key:
|
||
str $i3,[$key,#-4]
|
||
b .L256_loop
|
||
|
||
+.align 2
|
||
.Ldone: mov r0,#0
|
||
ldmia sp!,{r4-r12,lr}
|
||
-.Labrt: tst lr,#1
|
||
+.Labrt:
|
||
+#if __ARM_ARCH__>=5
|
||
+ ret @ bx lr
|
||
+#else
|
||
+ tst lr,#1
|
||
moveq pc,lr @ be binary compatible with V4, yet
|
||
bx lr @ interoperable with Thumb ISA:-)
|
||
+#endif
|
||
.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
|
||
|
||
.global private_AES_set_decrypt_key
|
||
@@ -688,34 +731,57 @@ private_AES_set_decrypt_key:
|
||
str lr,[sp,#-4]! @ push lr
|
||
bl _armv4_AES_set_encrypt_key
|
||
teq r0,#0
|
||
- ldrne lr,[sp],#4 @ pop lr
|
||
+ ldr lr,[sp],#4 @ pop lr
|
||
bne .Labrt
|
||
|
||
- stmdb sp!,{r4-r12}
|
||
+ mov r0,r2 @ AES_set_encrypt_key preserves r2,
|
||
+ mov r1,r2 @ which is AES_KEY *key
|
||
+ b _armv4_AES_set_enc2dec_key
|
||
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
|
||
|
||
- ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2,
|
||
- mov $key,r2 @ which is AES_KEY *key
|
||
- mov $i1,r2
|
||
- add $i2,r2,$rounds,lsl#4
|
||
+@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out)
|
||
+.global AES_set_enc2dec_key
|
||
+.type AES_set_enc2dec_key,%function
|
||
+.align 5
|
||
+AES_set_enc2dec_key:
|
||
+_armv4_AES_set_enc2dec_key:
|
||
+ stmdb sp!,{r4-r12,lr}
|
||
+
|
||
+ ldr $rounds,[r0,#240]
|
||
+ mov $i1,r0 @ input
|
||
+ add $i2,r0,$rounds,lsl#4
|
||
+ mov $key,r1 @ output
|
||
+ add $tbl,r1,$rounds,lsl#4
|
||
+ str $rounds,[r1,#240]
|
||
+
|
||
+.Linv: ldr $s0,[$i1],#16
|
||
+ ldr $s1,[$i1,#-12]
|
||
+ ldr $s2,[$i1,#-8]
|
||
+ ldr $s3,[$i1,#-4]
|
||
+ ldr $t1,[$i2],#-16
|
||
+ ldr $t2,[$i2,#16+4]
|
||
+ ldr $t3,[$i2,#16+8]
|
||
+ ldr $i3,[$i2,#16+12]
|
||
+ str $s0,[$tbl],#-16
|
||
+ str $s1,[$tbl,#16+4]
|
||
+ str $s2,[$tbl,#16+8]
|
||
+ str $s3,[$tbl,#16+12]
|
||
+ str $t1,[$key],#16
|
||
+ str $t2,[$key,#-12]
|
||
+ str $t3,[$key,#-8]
|
||
+ str $i3,[$key,#-4]
|
||
+ teq $i1,$i2
|
||
+ bne .Linv
|
||
|
||
-.Linv: ldr $s0,[$i1]
|
||
+ ldr $s0,[$i1]
|
||
ldr $s1,[$i1,#4]
|
||
ldr $s2,[$i1,#8]
|
||
ldr $s3,[$i1,#12]
|
||
- ldr $t1,[$i2]
|
||
- ldr $t2,[$i2,#4]
|
||
- ldr $t3,[$i2,#8]
|
||
- ldr $i3,[$i2,#12]
|
||
- str $s0,[$i2],#-16
|
||
- str $s1,[$i2,#16+4]
|
||
- str $s2,[$i2,#16+8]
|
||
- str $s3,[$i2,#16+12]
|
||
- str $t1,[$i1],#16
|
||
- str $t2,[$i1,#-12]
|
||
- str $t3,[$i1,#-8]
|
||
- str $i3,[$i1,#-4]
|
||
- teq $i1,$i2
|
||
- bne .Linv
|
||
+ str $s0,[$key]
|
||
+ str $s1,[$key,#4]
|
||
+ str $s2,[$key,#8]
|
||
+ str $s3,[$key,#12]
|
||
+ sub $key,$key,$rounds,lsl#3
|
||
___
|
||
$mask80=$i1;
|
||
$mask1b=$i2;
|
||
@@ -773,7 +839,7 @@ $code.=<<___;
|
||
moveq pc,lr @ be binary compatible with V4, yet
|
||
bx lr @ interoperable with Thumb ISA:-)
|
||
#endif
|
||
-.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
|
||
+.size AES_set_enc2dec_key,.-AES_set_enc2dec_key
|
||
|
||
.type AES_Td,%object
|
||
.align 5
|
||
@@ -883,7 +949,11 @@ AES_Td:
|
||
.type AES_decrypt,%function
|
||
.align 5
|
||
AES_decrypt:
|
||
+#if __ARM_ARCH__<7
|
||
sub r3,pc,#8 @ AES_decrypt
|
||
+#else
|
||
+ adr r3,AES_decrypt
|
||
+#endif
|
||
stmdb sp!,{r1,r4-r12,lr}
|
||
mov $rounds,r0 @ inp
|
||
mov $key,r2
|
||
@@ -1080,8 +1150,9 @@ _armv4_AES_decrypt:
|
||
ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
|
||
and $i3,lr,$s1,lsr#8
|
||
|
||
+ add $s1,$tbl,$s1,lsr#24
|
||
ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
|
||
- ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24]
|
||
+ ldrb $s1,[$s1] @ Td4[s1>>24]
|
||
ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
|
||
eor $s0,$i1,$s0,lsl#24
|
||
ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
|
||
@@ -1094,7 +1165,8 @@ _armv4_AES_decrypt:
|
||
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
|
||
and $i3,lr,$s2,lsr#16
|
||
|
||
- ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
|
||
+ add $s2,$tbl,$s2,lsr#24
|
||
+ ldrb $s2,[$s2] @ Td4[s2>>24]
|
||
eor $s0,$s0,$i1,lsl#8
|
||
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
|
||
eor $s1,$i2,$s1,lsl#16
|
||
@@ -1106,8 +1178,9 @@ _armv4_AES_decrypt:
|
||
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
|
||
and $i3,lr,$s3 @ i2
|
||
|
||
+ add $s3,$tbl,$s3,lsr#24
|
||
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
|
||
- ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
|
||
+ ldrb $s3,[$s3] @ Td4[s3>>24]
|
||
eor $s0,$s0,$i1,lsl#16
|
||
ldr $i1,[$key,#0]
|
||
eor $s1,$s1,$i2,lsl#8
|
||
@@ -1130,5 +1203,15 @@ _armv4_AES_decrypt:
|
||
___
|
||
|
||
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
|
||
+$code =~ s/\bret\b/bx\tlr/gm;
|
||
+
|
||
+open SELF,$0;
|
||
+while(<SELF>) {
|
||
+ next if (/^#!/);
|
||
+ last if (!s/^#/@/ and !/^$/);
|
||
+ print;
|
||
+}
|
||
+close SELF;
|
||
+
|
||
print $code;
|
||
close STDOUT; # enforce flush
|
||
diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl
|
||
new file mode 100755
|
||
index 0000000..415dc04
|
||
--- /dev/null
|
||
+++ b/crypto/aes/asm/aesv8-armx.pl
|
||
@@ -0,0 +1,980 @@
|
||
+#!/usr/bin/env perl
|
||
+#
|
||
+# ====================================================================
|
||
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||
+# project. The module is, however, dual licensed under OpenSSL and
|
||
+# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||
+# details see http://www.openssl.org/~appro/cryptogams/.
|
||
+# ====================================================================
|
||
+#
|
||
+# This module implements support for ARMv8 AES instructions. The
|
||
+# module is endian-agnostic in sense that it supports both big- and
|
||
+# little-endian cases. It also supports both 32- and 64-bit modes
|
||
+# of operation. Latter is achieved by limiting amount of utilized
|
||
+# registers to 16, which implies additional instructions. This has
|
||
+# no effect on mighty Apple A7, as results are literally equal to
|
||
+# the theoretical estimates based on instruction latencies and issue
|
||
+# rate. It remains to be seen how it affects other platforms...
|
||
+#
|
||
+# Performance in cycles per byte processed with 128-bit key:
|
||
+#
|
||
+# CBC enc CBC dec CTR
|
||
+# Apple A7 2.39 1.20 1.20
|
||
+# Cortex-A5x n/a n/a n/a
|
||
+
|
||
+$flavour = shift;
|
||
+open STDOUT,">".shift;
|
||
+
|
||
+$prefix="aes_v8";
|
||
+
|
||
+$code=<<___;
|
||
+#include "arm_arch.h"
|
||
+
|
||
+#if __ARM_ARCH__>=7
|
||
+.text
|
||
+___
|
||
+$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
|
||
+$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
|
||
+
|
||
+# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
|
||
+# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
|
||
+# maintain both 32- and 64-bit codes within single module and
|
||
+# transliterate common code to either flavour with regex voodoo.
|
||
+#
|
||
+{{{
|
||
+my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
|
||
+my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
|
||
+ $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
|
||
+
|
||
+
|
||
+$code.=<<___;
|
||
+.align 5
|
||
+rcon:
|
||
+.long 0x01,0x01,0x01,0x01
|
||
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
|
||
+.long 0x1b,0x1b,0x1b,0x1b
|
||
+
|
||
+.globl ${prefix}_set_encrypt_key
|
||
+.type ${prefix}_set_encrypt_key,%function
|
||
+.align 5
|
||
+${prefix}_set_encrypt_key:
|
||
+.Lenc_key:
|
||
+___
|
||
+$code.=<<___ if ($flavour =~ /64/);
|
||
+ stp x29,x30,[sp,#-16]!
|
||
+ add x29,sp,#0
|
||
+___
|
||
+$code.=<<___;
|
||
+ adr $ptr,rcon
|
||
+ cmp $bits,#192
|
||
+
|
||
+ veor $zero,$zero,$zero
|
||
+ vld1.8 {$in0},[$inp],#16
|
||
+ mov $bits,#8 // reuse $bits
|
||
+ vld1.32 {$rcon,$mask},[$ptr],#32
|
||
+
|
||
+ b.lt .Loop128
|
||
+ b.eq .L192
|
||
+ b .L256
|
||
+
|
||
+.align 4
|
||
+.Loop128:
|
||
+ vtbl.8 $key,{$in0},$mask
|
||
+ vext.8 $tmp,$zero,$in0,#12
|
||
+ vst1.32 {$in0},[$out],#16
|
||
+ aese $key,$zero
|
||
+ subs $bits,$bits,#1
|
||
+
|
||
+ veor $in0,$in0,$tmp
|
||
+ vext.8 $tmp,$zero,$tmp,#12
|
||
+ veor $in0,$in0,$tmp
|
||
+ vext.8 $tmp,$zero,$tmp,#12
|
||
+ veor $key,$key,$rcon
|
||
+ veor $in0,$in0,$tmp
|
||
+ vshl.u8 $rcon,$rcon,#1
|
||
+ veor $in0,$in0,$key
|
||
+ b.ne .Loop128
|
||
+
|
||
+ vld1.32 {$rcon},[$ptr]
|
||
+
|
||
+ vtbl.8 $key,{$in0},$mask
|
||
+ vext.8 $tmp,$zero,$in0,#12
|
||
+ vst1.32 {$in0},[$out],#16
|
||
+ aese $key,$zero
|
||
+
|
||
+ veor $in0,$in0,$tmp
|
||
+ vext.8 $tmp,$zero,$tmp,#12
|
||
+ veor $in0,$in0,$tmp
|
||
+ vext.8 $tmp,$zero,$tmp,#12
|
||
+ veor $key,$key,$rcon
|
||
+ veor $in0,$in0,$tmp
|
||
+ vshl.u8 $rcon,$rcon,#1
|
||
+ veor $in0,$in0,$key
|
||
+
|
||
+ vtbl.8 $key,{$in0},$mask
|
||
+ vext.8 $tmp,$zero,$in0,#12
|
||
+ vst1.32 {$in0},[$out],#16
|
||
+ aese $key,$zero
|
||
+
|
||
+ veor $in0,$in0,$tmp
|
||
+ vext.8 $tmp,$zero,$tmp,#12
|
||
+ veor $in0,$in0,$tmp
|
||
+ vext.8 $tmp,$zero,$tmp,#12
|
||
+ veor $key,$key,$rcon
|
||
+ veor $in0,$in0,$tmp
|
||
+ veor $in0,$in0,$key
|
||
+ vst1.32 {$in0},[$out]
|
||
+ add $out,$out,#0x50
|
||
+
|
||
+ mov $rounds,#10
|
||
+ b .Ldone
|
||
+
|
||
+.align 4
|
||
+.L192:
|
||
+ vld1.8 {$in1},[$inp],#8
|
||
+ vmov.i8 $key,#8 // borrow $key
|
||
+ vst1.32 {$in0},[$out],#16
|
||
+ vsub.i8 $mask,$mask,$key // adjust the mask
|
||
+
|
||
+.Loop192:
|
||
+ vtbl.8 $key,{$in1},$mask
|
||
+ vext.8 $tmp,$zero,$in0,#12
|
||
+ vst1.32 {$in1},[$out],#8
|
||
+ aese $key,$zero
|
||
+ subs $bits,$bits,#1
|
||
+
|
||
+ veor $in0,$in0,$tmp
|
||
+ vext.8 $tmp,$zero,$tmp,#12
|
||
+ veor $in0,$in0,$tmp
|
||
+ vext.8 $tmp,$zero,$tmp,#12
|
||
+ veor $in0,$in0,$tmp
|
||
+
|
||
+ vdup.32 $tmp,${in0}[3]
|
||
+ veor $tmp,$tmp,$in1
|
||
+ veor $key,$key,$rcon
|
||
+ vext.8 $in1,$zero,$in1,#12
|
||
+ vshl.u8 $rcon,$rcon,#1
|
||
+ veor $in1,$in1,$tmp
|
||
+ veor $in0,$in0,$key
|
||
+ veor $in1,$in1,$key
|
||
+ vst1.32 {$in0},[$out],#16
|
||
+ b.ne .Loop192
|
||
+
|
||
+ mov $rounds,#12
|
||
+ add $out,$out,#0x20
|
||
+ b .Ldone
|
||
+
|
||
+.align 4
|
||
+.L256:
|
||
+ vld1.8 {$in1},[$inp]
|
||
+ mov $bits,#7
|
||
+ mov $rounds,#14
|
||
+ vst1.32 {$in0},[$out],#16
|
||
+
|
||
+.Loop256:
|
||
+ vtbl.8 $key,{$in1},$mask
|
||
+ vext.8 $tmp,$zero,$in0,#12
|
||
+ vst1.32 {$in1},[$out],#16
|
||
+ aese $key,$zero
|
||
+ subs $bits,$bits,#1
|
||
+
|
||
+ veor $in0,$in0,$tmp
|
||
+ vext.8 $tmp,$zero,$tmp,#12
|
||
+ veor $in0,$in0,$tmp
|
||
+ vext.8 $tmp,$zero,$tmp,#12
|
||
+ veor $key,$key,$rcon
|
||
+ veor $in0,$in0,$tmp
|
||
+ vshl.u8 $rcon,$rcon,#1
|
||
+ veor $in0,$in0,$key
|
||
+ vst1.32 {$in0},[$out],#16
|
||
+ b.eq .Ldone
|
||
+
|
||
+ vdup.32 $key,${in0}[3] // just splat
|
||
+ vext.8 $tmp,$zero,$in1,#12
|
||
+ aese $key,$zero
|
||
+
|
||
+ veor $in1,$in1,$tmp
|
||
+ vext.8 $tmp,$zero,$tmp,#12
|
||
+ veor $in1,$in1,$tmp
|
||
+ vext.8 $tmp,$zero,$tmp,#12
|
||
+ veor $in1,$in1,$tmp
|
||
+
|
||
+ veor $in1,$in1,$key
|
||
+ b .Loop256
|
||
+
|
||
+.Ldone:
|
||
+ str $rounds,[$out]
|
||
+
|
||
+ eor x0,x0,x0 // return value
|
||
+ `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
|
||
+ ret
|
||
+.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
|
||
+
|
||
+.globl ${prefix}_set_decrypt_key
|
||
+.type ${prefix}_set_decrypt_key,%function
|
||
+.align 5
|
||
+${prefix}_set_decrypt_key:
|
||
+___
|
||
+$code.=<<___ if ($flavour =~ /64/);
|
||
+ stp x29,x30,[sp,#-16]!
|
||
+ add x29,sp,#0
|
||
+___
|
||
+$code.=<<___ if ($flavour !~ /64/);
|
||
+ stmdb sp!,{r4,lr}
|
||
+___
|
||
+$code.=<<___;
|
||
+ bl .Lenc_key
|
||
+
|
||
+ sub $out,$out,#240 // restore original $out
|
||
+ mov x4,#-16
|
||
+ add $inp,$out,x12,lsl#4 // end of key schedule
|
||
+
|
||
+ vld1.32 {v0.16b},[$out]
|
||
+ vld1.32 {v1.16b},[$inp]
|
||
+ vst1.32 {v0.16b},[$inp],x4
|
||
+ vst1.32 {v1.16b},[$out],#16
|
||
+
|
||
+.Loop_imc:
|
||
+ vld1.32 {v0.16b},[$out]
|
||
+ vld1.32 {v1.16b},[$inp]
|
||
+ aesimc v0.16b,v0.16b
|
||
+ aesimc v1.16b,v1.16b
|
||
+ vst1.32 {v0.16b},[$inp],x4
|
||
+ vst1.32 {v1.16b},[$out],#16
|
||
+ cmp $inp,$out
|
||
+ b.hi .Loop_imc
|
||
+
|
||
+ vld1.32 {v0.16b},[$out]
|
||
+ aesimc v0.16b,v0.16b
|
||
+ vst1.32 {v0.16b},[$inp]
|
||
+
|
||
+ eor x0,x0,x0 // return value
|
||
+___
|
||
+$code.=<<___ if ($flavour !~ /64/);
|
||
+ ldmia sp!,{r4,pc}
|
||
+___
|
||
+$code.=<<___ if ($flavour =~ /64/);
|
||
+ ldp x29,x30,[sp],#16
|
||
+ ret
|
||
+___
|
||
+$code.=<<___;
|
||
+.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
|
||
+___
|
||
+}}}
|
||
+{{{
|
||
+sub gen_block () {
|
||
+my $dir = shift;
|
||
+my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
|
||
+my ($inp,$out,$key)=map("x$_",(0..2));
|
||
+my $rounds="w3";
|
||
+my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
|
||
+
|
||
+$code.=<<___;
|
||
+.globl ${prefix}_${dir}crypt
|
||
+.type ${prefix}_${dir}crypt,%function
|
||
+.align 5
|
||
+${prefix}_${dir}crypt:
|
||
+ ldr $rounds,[$key,#240]
|
||
+ vld1.32 {$rndkey0},[$key],#16
|
||
+ vld1.8 {$inout},[$inp]
|
||
+ sub $rounds,$rounds,#2
|
||
+ vld1.32 {$rndkey1},[$key],#16
|
||
+
|
||
+.Loop_${dir}c:
|
||
+ aes$e $inout,$rndkey0
|
||
+ vld1.32 {$rndkey0},[$key],#16
|
||
+ aes$mc $inout,$inout
|
||
+ subs $rounds,$rounds,#2
|
||
+ aes$e $inout,$rndkey1
|
||
+ vld1.32 {$rndkey1},[$key],#16
|
||
+ aes$mc $inout,$inout
|
||
+ b.gt .Loop_${dir}c
|
||
+
|
||
+ aes$e $inout,$rndkey0
|
||
+ vld1.32 {$rndkey0},[$key]
|
||
+ aes$mc $inout,$inout
|
||
+ aes$e $inout,$rndkey1
|
||
+ veor $inout,$inout,$rndkey0
|
||
+
|
||
+ vst1.8 {$inout},[$out]
|
||
+ ret
|
||
+.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
|
||
+___
|
||
+}
|
||
+&gen_block("en");
|
||
+&gen_block("de");
|
||
+}}}
|
||
+{{{
|
||
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
|
||
+my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
|
||
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
|
||
+
|
||
+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
|
||
+
|
||
+### q8-q15 preloaded key schedule
|
||
+
|
||
+$code.=<<___;
|
||
+.globl ${prefix}_cbc_encrypt
|
||
+.type ${prefix}_cbc_encrypt,%function
|
||
+.align 5
|
||
+${prefix}_cbc_encrypt:
|
||
+___
|
||
+$code.=<<___ if ($flavour =~ /64/);
|
||
+ stp x29,x30,[sp,#-16]!
|
||
+ add x29,sp,#0
|
||
+___
|
||
+$code.=<<___ if ($flavour !~ /64/);
|
||
+ mov ip,sp
|
||
+ stmdb sp!,{r4-r8,lr}
|
||
+ vstmdb sp!,{d8-d15} @ ABI specification says so
|
||
+ ldmia ip,{r4-r5} @ load remaining args
|
||
+___
|
||
+$code.=<<___;
|
||
+ subs $len,$len,#16
|
||
+ mov $step,#16
|
||
+ b.lo .Lcbc_abort
|
||
+ cclr $step,eq
|
||
+
|
||
+ cmp $enc,#0 // en- or decrypting?
|
||
+ ldr $rounds,[$key,#240]
|
||
+ and $len,$len,#-16
|
||
+ vld1.8 {$ivec},[$ivp]
|
||
+ vld1.8 {$dat},[$inp],$step
|
||
+
|
||
+ vld1.32 {q8-q9},[$key] // load key schedule...
|
||
+ sub $rounds,$rounds,#6
|
||
+ add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
|
||
+ sub $rounds,$rounds,#2
|
||
+ vld1.32 {q10-q11},[$key_],#32
|
||
+ vld1.32 {q12-q13},[$key_],#32
|
||
+ vld1.32 {q14-q15},[$key_],#32
|
||
+ vld1.32 {$rndlast},[$key_]
|
||
+
|
||
+ add $key_,$key,#32
|
||
+ mov $cnt,$rounds
|
||
+ b.eq .Lcbc_dec
|
||
+
|
||
+ cmp $rounds,#2
|
||
+ veor $dat,$dat,$ivec
|
||
+ veor $rndzero_n_last,q8,$rndlast
|
||
+ b.eq .Lcbc_enc128
|
||
+
|
||
+.Loop_cbc_enc:
|
||
+ aese $dat,q8
|
||
+ vld1.32 {q8},[$key_],#16
|
||
+ aesmc $dat,$dat
|
||
+ subs $cnt,$cnt,#2
|
||
+ aese $dat,q9
|
||
+ vld1.32 {q9},[$key_],#16
|
||
+ aesmc $dat,$dat
|
||
+ b.gt .Loop_cbc_enc
|
||
+
|
||
+ aese $dat,q8
|
||
+ aesmc $dat,$dat
|
||
+ subs $len,$len,#16
|
||
+ aese $dat,q9
|
||
+ aesmc $dat,$dat
|
||
+ cclr $step,eq
|
||
+ aese $dat,q10
|
||
+ aesmc $dat,$dat
|
||
+ add $key_,$key,#16
|
||
+ aese $dat,q11
|
||
+ aesmc $dat,$dat
|
||
+ vld1.8 {q8},[$inp],$step
|
||
+ aese $dat,q12
|
||
+ aesmc $dat,$dat
|
||
+ veor q8,q8,$rndzero_n_last
|
||
+ aese $dat,q13
|
||
+ aesmc $dat,$dat
|
||
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
|
||
+ aese $dat,q14
|
||
+ aesmc $dat,$dat
|
||
+ aese $dat,q15
|
||
+
|
||
+ mov $cnt,$rounds
|
||
+ veor $ivec,$dat,$rndlast
|
||
+ vst1.8 {$ivec},[$out],#16
|
||
+ b.hs .Loop_cbc_enc
|
||
+
|
||
+ b .Lcbc_done
|
||
+
|
||
+.align 5
|
||
+.Lcbc_enc128:
|
||
+ vld1.32 {$in0-$in1},[$key_]
|
||
+ aese $dat,q8
|
||
+ aesmc $dat,$dat
|
||
+ b .Lenter_cbc_enc128
|
||
+.Loop_cbc_enc128:
|
||
+ aese $dat,q8
|
||
+ aesmc $dat,$dat
|
||
+ vst1.8 {$ivec},[$out],#16
|
||
+.Lenter_cbc_enc128:
|
||
+ aese $dat,q9
|
||
+ aesmc $dat,$dat
|
||
+ subs $len,$len,#16
|
||
+ aese $dat,$in0
|
||
+ aesmc $dat,$dat
|
||
+ cclr $step,eq
|
||
+ aese $dat,$in1
|
||
+ aesmc $dat,$dat
|
||
+ aese $dat,q10
|
||
+ aesmc $dat,$dat
|
||
+ aese $dat,q11
|
||
+ aesmc $dat,$dat
|
||
+ vld1.8 {q8},[$inp],$step
|
||
+ aese $dat,q12
|
||
+ aesmc $dat,$dat
|
||
+ aese $dat,q13
|
||
+ aesmc $dat,$dat
|
||
+ aese $dat,q14
|
||
+ aesmc $dat,$dat
|
||
+ veor q8,q8,$rndzero_n_last
|
||
+ aese $dat,q15
|
||
+ veor $ivec,$dat,$rndlast
|
||
+ b.hs .Loop_cbc_enc128
|
||
+
|
||
+ vst1.8 {$ivec},[$out],#16
|
||
+ b .Lcbc_done
|
||
+
|
||
+.align 5
|
||
+.Lcbc_dec128:
|
||
+ vld1.32 {$tmp0-$tmp1},[$key_]
|
||
+ veor $ivec,$ivec,$rndlast
|
||
+ veor $in0,$dat0,$rndlast
|
||
+ mov $step1,$step
|
||
+
|
||
+.Loop2x_cbc_dec128:
|
||
+ aesd $dat0,q8
|
||
+ aesd $dat1,q8
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ subs $len,$len,#32
|
||
+ aesd $dat0,q9
|
||
+ aesd $dat1,q9
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ cclr $step,lo
|
||
+ aesd $dat0,$tmp0
|
||
+ aesd $dat1,$tmp0
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ cclr $step1,ls
|
||
+ aesd $dat0,$tmp1
|
||
+ aesd $dat1,$tmp1
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ aesd $dat0,q10
|
||
+ aesd $dat1,q10
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ aesd $dat0,q11
|
||
+ aesd $dat1,q11
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ aesd $dat0,q12
|
||
+ aesd $dat1,q12
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ aesd $dat0,q13
|
||
+ aesd $dat1,q13
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ aesd $dat0,q14
|
||
+ aesd $dat1,q14
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ aesd $dat0,q15
|
||
+ aesd $dat1,q15
|
||
+
|
||
+ veor $ivec,$ivec,$dat0
|
||
+ vld1.8 {$dat0},[$inp],$step
|
||
+ veor $in0,$in0,$dat1
|
||
+ vld1.8 {$dat1},[$inp],$step1
|
||
+ vst1.8 {$ivec},[$out],#16
|
||
+ veor $ivec,$in1,$rndlast
|
||
+ vst1.8 {$in0},[$out],#16
|
||
+ veor $in0,$dat0,$rndlast
|
||
+ vorr $in1,$dat1,$dat1
|
||
+ b.hs .Loop2x_cbc_dec128
|
||
+
|
||
+ adds $len,$len,#32
|
||
+ veor $ivec,$ivec,$rndlast
|
||
+ b.eq .Lcbc_done
|
||
+ veor $in0,$in0,$rndlast
|
||
+ b .Lcbc_dec_tail
|
||
+
|
||
+.align 5
|
||
+.Lcbc_dec:
|
||
+ subs $len,$len,#16
|
||
+ vorr $in0,$dat,$dat
|
||
+ b.lo .Lcbc_dec_tail
|
||
+
|
||
+ cclr $step,eq
|
||
+ cmp $rounds,#2
|
||
+ vld1.8 {$dat1},[$inp],$step
|
||
+ vorr $in1,$dat1,$dat1
|
||
+ b.eq .Lcbc_dec128
|
||
+
|
||
+.Loop2x_cbc_dec:
|
||
+ aesd $dat0,q8
|
||
+ aesd $dat1,q8
|
||
+ vld1.32 {q8},[$key_],#16
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ subs $cnt,$cnt,#2
|
||
+ aesd $dat0,q9
|
||
+ aesd $dat1,q9
|
||
+ vld1.32 {q9},[$key_],#16
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ b.gt .Loop2x_cbc_dec
|
||
+
|
||
+ aesd $dat0,q8
|
||
+ aesd $dat1,q8
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ veor $tmp0,$ivec,$rndlast
|
||
+ veor $tmp1,$in0,$rndlast
|
||
+ aesd $dat0,q9
|
||
+ aesd $dat1,q9
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ vorr $ivec,$in1,$in1
|
||
+ subs $len,$len,#32
|
||
+ aesd $dat0,q10
|
||
+ aesd $dat1,q10
|
||
+ aesimc $dat0,$dat0
|
||
+ cclr $step,lo
|
||
+ aesimc $dat1,$dat1
|
||
+ mov $key_,$key
|
||
+ aesd $dat0,q11
|
||
+ aesd $dat1,q11
|
||
+ aesimc $dat0,$dat0
|
||
+ vld1.8 {$in0},[$inp],$step
|
||
+ aesimc $dat1,$dat1
|
||
+ cclr $step,ls
|
||
+ aesd $dat0,q12
|
||
+ aesd $dat1,q12
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ vld1.8 {$in1},[$inp],$step
|
||
+ aesd $dat0,q13
|
||
+ aesd $dat1,q13
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
|
||
+ aesd $dat0,q14
|
||
+ aesd $dat1,q14
|
||
+ aesimc $dat0,$dat0
|
||
+ aesimc $dat1,$dat1
|
||
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
|
||
+ aesd $dat0,q15
|
||
+ aesd $dat1,q15
|
||
+
|
||
+ mov $cnt,$rounds
|
||
+ veor $tmp0,$tmp0,$dat0
|
||
+ veor $tmp1,$tmp1,$dat1
|
||
+ vorr $dat0,$in0,$in0
|
||
+ vst1.8 {$tmp0},[$out],#16
|
||
+ vorr $dat1,$in1,$in1
|
||
+ vst1.8 {$tmp1},[$out],#16
|
||
+ b.hs .Loop2x_cbc_dec
|
||
+
|
||
+ adds $len,$len,#32
|
||
+ b.eq .Lcbc_done
|
||
+
|
||
+.Lcbc_dec_tail:
|
||
+ aesd $dat,q8
|
||
+ vld1.32 {q8},[$key_],#16
|
||
+ aesimc $dat,$dat
|
||
+ subs $cnt,$cnt,#2
|
||
+ aesd $dat,q9
|
||
+ vld1.32 {q9},[$key_],#16
|
||
+ aesimc $dat,$dat
|
||
+ b.gt .Lcbc_dec_tail
|
||
+
|
||
+ aesd $dat,q8
|
||
+ aesimc $dat,$dat
|
||
+ aesd $dat,q9
|
||
+ aesimc $dat,$dat
|
||
+ veor $tmp,$ivec,$rndlast
|
||
+ aesd $dat,q10
|
||
+ aesimc $dat,$dat
|
||
+ vorr $ivec,$in0,$in0
|
||
+ aesd $dat,q11
|
||
+ aesimc $dat,$dat
|
||
+ aesd $dat,q12
|
||
+ aesimc $dat,$dat
|
||
+ aesd $dat,q13
|
||
+ aesimc $dat,$dat
|
||
+ aesd $dat,q14
|
||
+ aesimc $dat,$dat
|
||
+ aesd $dat,q15
|
||
+
|
||
+ veor $tmp,$tmp,$dat
|
||
+ vst1.8 {$tmp},[$out],#16
|
||
+
|
||
+.Lcbc_done:
|
||
+ vst1.8 {$ivec},[$ivp]
|
||
+.Lcbc_abort:
|
||
+___
|
||
+$code.=<<___ if ($flavour !~ /64/);
|
||
+ vldmia sp!,{d8-d15}
|
||
+ ldmia sp!,{r4-r8,pc}
|
||
+___
|
||
+$code.=<<___ if ($flavour =~ /64/);
|
||
+ ldr x29,[sp],#16
|
||
+ ret
|
||
+___
|
||
+$code.=<<___;
|
||
+.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
|
||
+___
|
||
+}}}
|
||
+{{{
|
||
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
|
||
+my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10");
|
||
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
|
||
+
|
||
+my ($dat,$tmp)=($dat0,$tmp0);
|
||
+
|
||
+### q8-q15 preloaded key schedule
|
||
+
|
||
+$code.=<<___;
|
||
+.globl ${prefix}_ctr32_encrypt_blocks
|
||
+.type ${prefix}_ctr32_encrypt_blocks,%function
|
||
+.align 5
|
||
+${prefix}_ctr32_encrypt_blocks:
|
||
+___
|
||
+$code.=<<___ if ($flavour =~ /64/);
|
||
+ stp x29,x30,[sp,#-16]!
|
||
+ add x29,sp,#0
|
||
+___
|
||
+$code.=<<___ if ($flavour !~ /64/);
|
||
+ mov ip,sp
|
||
+ stmdb sp!,{r4-r10,lr}
|
||
+ vstmdb sp!,{d8-d15} @ ABI specification says so
|
||
+ ldr r4, [ip] @ load remaining arg
|
||
+___
|
||
+$code.=<<___;
|
||
+ ldr $rounds,[$key,#240]
|
||
+
|
||
+ ldr $ctr, [$ivp, #12]
|
||
+ vld1.32 {$dat0},[$ivp]
|
||
+
|
||
+ vld1.32 {q8-q9},[$key] // load key schedule...
|
||
+ sub $rounds,$rounds,#6
|
||
+ add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
|
||
+ sub $rounds,$rounds,#2
|
||
+ vld1.32 {q10-q11},[$key_],#32
|
||
+ vld1.32 {q12-q13},[$key_],#32
|
||
+ vld1.32 {q14-q15},[$key_],#32
|
||
+ vld1.32 {$rndlast},[$key_]
|
||
+
|
||
+ add $key_,$key,#32
|
||
+ mov $cnt,$rounds
|
||
+
|
||
+ subs $len,$len,#2
|
||
+ b.lo .Lctr32_tail
|
||
+
|
||
+#ifndef __ARMEB__
|
||
+ rev $ctr, $ctr
|
||
+#endif
|
||
+ vorr $dat1,$dat0,$dat0
|
||
+ add $ctr, $ctr, #1
|
||
+ vorr $ivec,$dat0,$dat0
|
||
+ rev $tctr1, $ctr
|
||
+ cmp $rounds,#2
|
||
+ vmov.32 ${dat1}[3],$tctr1
|
||
+ b.eq .Lctr32_128
|
||
+
|
||
+.Loop2x_ctr32:
|
||
+ aese $dat0,q8
|
||
+ aese $dat1,q8
|
||
+ vld1.32 {q8},[$key_],#16
|
||
+ aesmc $dat0,$dat0
|
||
+ aesmc $dat1,$dat1
|
||
+ subs $cnt,$cnt,#2
|
||
+ aese $dat0,q9
|
||
+ aese $dat1,q9
|
||
+ vld1.32 {q9},[$key_],#16
|
||
+ aesmc $dat0,$dat0
|
||
+ aesmc $dat1,$dat1
|
||
+ b.gt .Loop2x_ctr32
|
||
+
|
||
+ aese $dat0,q8
|
||
+ aese $dat1,q8
|
||
+ aesmc $tmp0,$dat0
|
||
+ vorr $dat0,$ivec,$ivec
|
||
+ aesmc $tmp1,$dat1
|
||
+ vorr $dat1,$ivec,$ivec
|
||
+ aese $tmp0,q9
|
||
+ aese $tmp1,q9
|
||
+ vld1.8 {$in0},[$inp],#16
|
||
+ aesmc $tmp0,$tmp0
|
||
+ vld1.8 {$in1},[$inp],#16
|
||
+ aesmc $tmp1,$tmp1
|
||
+ add $ctr,$ctr,#1
|
||
+ aese $tmp0,q10
|
||
+ aese $tmp1,q10
|
||
+ rev $tctr,$ctr
|
||
+ aesmc $tmp0,$tmp0
|
||
+ aesmc $tmp1,$tmp1
|
||
+ add $ctr,$ctr,#1
|
||
+ aese $tmp0,q11
|
||
+ aese $tmp1,q11
|
||
+ veor $in0,$in0,$rndlast
|
||
+ rev $tctr1,$ctr
|
||
+ aesmc $tmp0,$tmp0
|
||
+ aesmc $tmp1,$tmp1
|
||
+ veor $in1,$in1,$rndlast
|
||
+ mov $key_,$key
|
||
+ aese $tmp0,q12
|
||
+ aese $tmp1,q12
|
||
+ subs $len,$len,#2
|
||
+ aesmc $tmp0,$tmp0
|
||
+ aesmc $tmp1,$tmp1
|
||
+ vld1.32 {q8-q9},[$key_],#32 // re-pre-load rndkey[0-1]
|
||
+ aese $tmp0,q13
|
||
+ aese $tmp1,q13
|
||
+ aesmc $tmp0,$tmp0
|
||
+ aesmc $tmp1,$tmp1
|
||
+ aese $tmp0,q14
|
||
+ aese $tmp1,q14
|
||
+ vmov.32 ${dat0}[3], $tctr
|
||
+ aesmc $tmp0,$tmp0
|
||
+ vmov.32 ${dat1}[3], $tctr1
|
||
+ aesmc $tmp1,$tmp1
|
||
+ aese $tmp0,q15
|
||
+ aese $tmp1,q15
|
||
+
|
||
+ mov $cnt,$rounds
|
||
+ veor $in0,$in0,$tmp0
|
||
+ veor $in1,$in1,$tmp1
|
||
+ vst1.8 {$in0},[$out],#16
|
||
+ vst1.8 {$in1},[$out],#16
|
||
+ b.hs .Loop2x_ctr32
|
||
+
|
||
+ adds $len,$len,#2
|
||
+ b.eq .Lctr32_done
|
||
+ b .Lctr32_tail
|
||
+
|
||
+.Lctr32_128:
|
||
+ vld1.32 {$tmp0-$tmp1},[$key_]
|
||
+
|
||
+.Loop2x_ctr32_128:
|
||
+ aese $dat0,q8
|
||
+ aese $dat1,q8
|
||
+ aesmc $dat0,$dat0
|
||
+ vld1.8 {$in0},[$inp],#16
|
||
+ aesmc $dat1,$dat1
|
||
+ vld1.8 {$in1},[$inp],#16
|
||
+ aese $dat0,q9
|
||
+ aese $dat1,q9
|
||
+ add $ctr,$ctr,#1
|
||
+ aesmc $dat0,$dat0
|
||
+ aesmc $dat1,$dat1
|
||
+ rev $tctr,$ctr
|
||
+ aese $dat0,$tmp0
|
||
+ aese $dat1,$tmp0
|
||
+ add $ctr,$ctr,#1
|
||
+ aesmc $dat0,$dat0
|
||
+ aesmc $dat1,$dat1
|
||
+ rev $tctr1,$ctr
|
||
+ aese $dat0,$tmp1
|
||
+ aese $dat1,$tmp1
|
||
+ subs $len,$len,#2
|
||
+ aesmc $dat0,$dat0
|
||
+ aesmc $dat1,$dat1
|
||
+ aese $dat0,q10
|
||
+ aese $dat1,q10
|
||
+ aesmc $dat0,$dat0
|
||
+ aesmc $dat1,$dat1
|
||
+ aese $dat0,q11
|
||
+ aese $dat1,q11
|
||
+ aesmc $dat0,$dat0
|
||
+ aesmc $dat1,$dat1
|
||
+ aese $dat0,q12
|
||
+ aese $dat1,q12
|
||
+ aesmc $dat0,$dat0
|
||
+ aesmc $dat1,$dat1
|
||
+ aese $dat0,q13
|
||
+ aese $dat1,q13
|
||
+ aesmc $dat0,$dat0
|
||
+ aesmc $dat1,$dat1
|
||
+ aese $dat0,q14
|
||
+ aese $dat1,q14
|
||
+ aesmc $dat0,$dat0
|
||
+ aesmc $dat1,$dat1
|
||
+ veor $in0,$in0,$rndlast
|
||
+ aese $dat0,q15
|
||
+ veor $in1,$in1,$rndlast
|
||
+ aese $dat1,q15
|
||
+
|
||
+ veor $in0,$in0,$dat0
|
||
+ vorr $dat0,$ivec,$ivec
|
||
+ veor $in1,$in1,$dat1
|
||
+ vorr $dat1,$ivec,$ivec
|
||
+ vst1.8 {$in0},[$out],#16
|
||
+ vmov.32 ${dat0}[3], $tctr
|
||
+ vst1.8 {$in1},[$out],#16
|
||
+ vmov.32 ${dat1}[3], $tctr1
|
||
+ b.hs .Loop2x_ctr32_128
|
||
+
|
||
+ adds $len,$len,#2
|
||
+ b.eq .Lctr32_done
|
||
+
|
||
+.Lctr32_tail:
|
||
+ aese $dat,q8
|
||
+ vld1.32 {q8},[$key_],#16
|
||
+ aesmc $dat,$dat
|
||
+ subs $cnt,$cnt,#2
|
||
+ aese $dat,q9
|
||
+ vld1.32 {q9},[$key_],#16
|
||
+ aesmc $dat,$dat
|
||
+ b.gt .Lctr32_tail
|
||
+
|
||
+ aese $dat,q8
|
||
+ aesmc $dat,$dat
|
||
+ aese $dat,q9
|
||
+ aesmc $dat,$dat
|
||
+ vld1.8 {$in0},[$inp]
|
||
+ aese $dat,q10
|
||
+ aesmc $dat,$dat
|
||
+ aese $dat,q11
|
||
+ aesmc $dat,$dat
|
||
+ aese $dat,q12
|
||
+ aesmc $dat,$dat
|
||
+ aese $dat,q13
|
||
+ aesmc $dat,$dat
|
||
+ aese $dat,q14
|
||
+ aesmc $dat,$dat
|
||
+ veor $in0,$in0,$rndlast
|
||
+ aese $dat,q15
|
||
+
|
||
+ veor $in0,$in0,$dat
|
||
+ vst1.8 {$in0},[$out]
|
||
+
|
||
+.Lctr32_done:
|
||
+___
|
||
+$code.=<<___ if ($flavour !~ /64/);
|
||
+ vldmia sp!,{d8-d15}
|
||
+ ldmia sp!,{r4-r10,pc}
|
||
+___
|
||
+$code.=<<___ if ($flavour =~ /64/);
|
||
+ ldr x29,[sp],#16
|
||
+ ret
|
||
+___
|
||
+$code.=<<___;
|
||
+.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
|
||
+___
|
||
+}}}
|
||
+$code.=<<___;
|
||
+#endif
|
||
+___
|
||
+########################################
|
||
+if ($flavour =~ /64/) { ######## 64-bit code
|
||
+ my %opcode = (
|
||
+ "aesd" => 0x4e285800, "aese" => 0x4e284800,
|
||
+ "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
|
||
+
|
||
+ local *unaes = sub {
|
||
+ my ($mnemonic,$arg)=@_;
|
||
+
|
||
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
|
||
+ sprintf ".inst\t0x%08x\t//%s %s",
|
||
+ $opcode{$mnemonic}|$1|($2<<5),
|
||
+ $mnemonic,$arg;
|
||
+ };
|
||
+
|
||
+ foreach(split("\n",$code)) {
|
||
+ s/\`([^\`]*)\`/eval($1)/geo;
|
||
+
|
||
+ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
|
||
+ s/@\s/\/\//o; # old->new style commentary
|
||
+
|
||
+ #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
|
||
+ s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
|
||
+ s/vmov\.i8/movi/o or # fix up legacy mnemonics
|
||
+ s/vext\.8/ext/o or
|
||
+ s/vrev32\.8/rev32/o or
|
||
+ s/vtst\.8/cmtst/o or
|
||
+ s/vshr/ushr/o or
|
||
+ s/^(\s+)v/$1/o or # strip off v prefix
|
||
+ s/\bbx\s+lr\b/ret/o;
|
||
+
|
||
+ # fix up remaining legacy suffixes
|
||
+ s/\.[ui]?8//o;
|
||
+ m/\],#8/o and s/\.16b/\.8b/go;
|
||
+ s/\.[ui]?32//o and s/\.16b/\.4s/go;
|
||
+ s/\.[ui]?64//o and s/\.16b/\.2d/go;
|
||
+ s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
|
||
+
|
||
+ print $_,"\n";
|
||
+ }
|
||
+} else { ######## 32-bit code
|
||
+ my %opcode = (
|
||
+ "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
|
||
+ "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
|
||
+
|
||
+ local *unaes = sub {
|
||
+ my ($mnemonic,$arg)=@_;
|
||
+
|
||
+ if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
|
||
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|
||
+ |(($2&7)<<1) |(($2&8)<<2);
|
||
+ # ARMv7 instructions are always encoded little-endian, so emit
|
||
+ # the bytes explicitly. The correct solution is the .inst directive,
|
||
+ # but older assemblers don't implement it :-(
|
||
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
|
||
+ $word&0xff,($word>>8)&0xff,
|
||
+ ($word>>16)&0xff,($word>>24)&0xff,
|
||
+ $mnemonic,$arg;
|
||
+ }
|
||
+ };
|
||
+
|
||
+ sub unvtbl {
|
||
+ my $arg=shift;
|
||
+
|
||
+ $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
|
||
+ sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
|
||
+ "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
|
||
+ }
|
||
+
|
||
+ sub unvdup32 {
|
||
+ my $arg=shift;
|
||
+
|
||
+ $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
|
||
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
|
||
+ }
|
||
+
|
||
+ sub unvmov32 {
|
||
+ my $arg=shift;
|
||
+
|
||
+ $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
|
||
+ sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
|
||
+ }
|
||
+
|
||
+ foreach(split("\n",$code)) {
|
||
+ s/\`([^\`]*)\`/eval($1)/geo;
|
||
+
|
||
+ s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
|
||
+ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
|
||
+ s/\/\/\s?/@ /o; # new->old style commentary
|
||
+
|
||
+ # fix up remaining new-style suffixes
|
||
+ s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
|
||
+ s/\],#[0-9]+/]!/o;
|
||
+
|
||
+ s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
|
||
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
|
||
+ s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
|
||
+ s/vdup\.32\s+(.*)/unvdup32($1)/geo or
|
||
+ s/vmov\.32\s+(.*)/unvmov32($1)/geo or
|
||
+ s/^(\s+)b\./$1b/o or
|
||
+ s/^(\s+)ret/$1bx\tlr/o;
|
||
+
|
||
+ print $_,"\n";
|
||
+ }
|
||
+}
|
||
+
|
||
+close STDOUT or die "error closing STDOUT: $!"; # buffered write errors surface only at close
|
||
diff --git a/crypto/aes/asm/bsaes-armv7.pl b/crypto/aes/asm/bsaes-armv7.pl
|
||
new file mode 100644
|
||
index 0000000..f3d96d9
|
||
--- /dev/null
|
||
+++ b/crypto/aes/asm/bsaes-armv7.pl
|
||
@@ -0,0 +1,2467 @@
|
||
+#!/usr/bin/env perl
|
||
+
|
||
+# ====================================================================
|
||
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||
+# project. The module is, however, dual licensed under OpenSSL and
|
||
+# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||
+# details see http://www.openssl.org/~appro/cryptogams/.
|
||
+#
|
||
+# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
|
||
+# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
|
||
+# granted.
|
||
+# ====================================================================
|
||
+
|
||
+# Bit-sliced AES for ARM NEON
|
||
+#
|
||
+# February 2012.
|
||
+#
|
||
+# This implementation is a direct adaptation of the bsaes-x86_64 module for
|
||
+# ARM NEON, except that this module is endian-neutral [in the sense that
|
||
+# it can be compiled for either endianness] by courtesy of vld1.8's
|
||
+# neutrality. Initial version doesn't implement interface to OpenSSL,
|
||
+# only low-level primitives and unsupported entry points, just enough
|
||
+# to collect performance results, which for Cortex-A8 core are:
|
||
+#
|
||
+# encrypt 19.5 cycles per byte processed with 128-bit key
|
||
+# decrypt 22.1 cycles per byte processed with 128-bit key
|
||
+# key conv. 440 cycles per 128-bit key/0.18 of 8x block
|
||
+#
|
||
+# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
|
||
+# which is [much] worse than anticipated (for further details see
|
||
+# http://www.openssl.org/~appro/Snapdragon-S4.html).
|
||
+#
|
||
+# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
|
||
+# manages in 20.0 cycles].
|
||
+#
|
||
+# When comparing to x86_64 results keep in mind that NEON unit is
|
||
+# [mostly] single-issue and thus can't [fully] benefit from
|
||
+# instruction-level parallelism. And when comparing to aes-armv4
|
||
+# results keep in mind key schedule conversion overhead (see
|
||
+# bsaes-x86_64.pl for further details)...
|
||
+#
|
||
+# <appro@openssl.org>
|
||
+
|
||
+# April-August 2013
|
||
+#
|
||
+# Add CBC, CTR and XTS subroutines, adapt for kernel use.
|
||
+#
|
||
+# <ard.biesheuvel@linaro.org>
|
||
+
|
||
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||
+open STDOUT,">$output" or die "can't open $output: $!"; # a failed re-open leaves STDOUT closed, losing all output
|
||
+
|
||
+my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
|
||
+my @XMM=map("q$_",(0..15));
|
||
+
|
||
+{
|
||
+my ($key,$rounds,$const)=("r4","r5","r6");
|
||
+
|
||
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
|
||
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
|
||
+
|
||
+sub Sbox {
|
||
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
|
||
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
|
||
+my @b=@_[0..7];
|
||
+my @t=@_[8..11];
|
||
+my @s=@_[12..15];
|
||
+ &InBasisChange (@b);
|
||
+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
|
||
+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
|
||
+}
|
||
+
|
||
+sub InBasisChange {
|
||
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
|
||
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
|
||
+my @b=@_[0..7];
|
||
+$code.=<<___;
|
||
+ veor @b[2], @b[2], @b[1]
|
||
+ veor @b[5], @b[5], @b[6]
|
||
+ veor @b[3], @b[3], @b[0]
|
||
+ veor @b[6], @b[6], @b[2]
|
||
+ veor @b[5], @b[5], @b[0]
|
||
+
|
||
+ veor @b[6], @b[6], @b[3]
|
||
+ veor @b[3], @b[3], @b[7]
|
||
+ veor @b[7], @b[7], @b[5]
|
||
+ veor @b[3], @b[3], @b[4]
|
||
+ veor @b[4], @b[4], @b[5]
|
||
+
|
||
+ veor @b[2], @b[2], @b[7]
|
||
+ veor @b[3], @b[3], @b[1]
|
||
+ veor @b[1], @b[1], @b[5]
|
||
+___
|
||
+}
|
||
+
|
||
+sub OutBasisChange {
|
||
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
|
||
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
|
||
+my @b=@_[0..7];
|
||
+$code.=<<___;
|
||
+ veor @b[0], @b[0], @b[6]
|
||
+ veor @b[1], @b[1], @b[4]
|
||
+ veor @b[4], @b[4], @b[6]
|
||
+ veor @b[2], @b[2], @b[0]
|
||
+ veor @b[6], @b[6], @b[1]
|
||
+
|
||
+ veor @b[1], @b[1], @b[5]
|
||
+ veor @b[5], @b[5], @b[3]
|
||
+ veor @b[3], @b[3], @b[7]
|
||
+ veor @b[7], @b[7], @b[5]
|
||
+ veor @b[2], @b[2], @b[5]
|
||
+
|
||
+ veor @b[4], @b[4], @b[7]
|
||
+___
|
||
+}
|
||
+
|
||
+sub InvSbox {
|
||
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
|
||
+# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
|
||
+my @b=@_[0..7];
|
||
+my @t=@_[8..11];
|
||
+my @s=@_[12..15];
|
||
+ &InvInBasisChange (@b);
|
||
+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
|
||
+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
|
||
+}
|
||
+
|
||
+sub InvInBasisChange { # OutBasisChange in reverse (with twist)
|
||
+my @b=@_[5,1,2,6,3,7,0,4];
|
||
+$code.=<<___
|
||
+ veor @b[1], @b[1], @b[7]
|
||
+ veor @b[4], @b[4], @b[7]
|
||
+
|
||
+ veor @b[7], @b[7], @b[5]
|
||
+ veor @b[1], @b[1], @b[3]
|
||
+ veor @b[2], @b[2], @b[5]
|
||
+ veor @b[3], @b[3], @b[7]
|
||
+
|
||
+ veor @b[6], @b[6], @b[1]
|
||
+ veor @b[2], @b[2], @b[0]
|
||
+ veor @b[5], @b[5], @b[3]
|
||
+ veor @b[4], @b[4], @b[6]
|
||
+ veor @b[0], @b[0], @b[6]
|
||
+ veor @b[1], @b[1], @b[4]
|
||
+___
|
||
+}
|
||
+
|
||
+sub InvOutBasisChange { # InBasisChange in reverse
|
||
+my @b=@_[2,5,7,3,6,1,0,4];
|
||
+$code.=<<___;
|
||
+ veor @b[1], @b[1], @b[5]
|
||
+ veor @b[2], @b[2], @b[7]
|
||
+
|
||
+ veor @b[3], @b[3], @b[1]
|
||
+ veor @b[4], @b[4], @b[5]
|
||
+ veor @b[7], @b[7], @b[5]
|
||
+ veor @b[3], @b[3], @b[4]
|
||
+ veor @b[5], @b[5], @b[0]
|
||
+ veor @b[3], @b[3], @b[7]
|
||
+ veor @b[6], @b[6], @b[2]
|
||
+ veor @b[2], @b[2], @b[1]
|
||
+ veor @b[6], @b[6], @b[3]
|
||
+
|
||
+ veor @b[3], @b[3], @b[0]
|
||
+ veor @b[5], @b[5], @b[6]
|
||
+___
|
||
+}
|
||
+
|
||
+sub Mul_GF4 {
|
||
+#;*************************************************************
|
||
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
|
||
+#;*************************************************************
|
||
+my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
|
||
+$code.=<<___;
|
||
+ veor $t0, $y0, $y1
|
||
+ vand $t0, $t0, $x0
|
||
+ veor $x0, $x0, $x1
|
||
+ vand $t1, $x1, $y0
|
||
+ vand $x0, $x0, $y1
|
||
+ veor $x1, $t1, $t0
|
||
+ veor $x0, $x0, $t1
|
||
+___
|
||
+}
|
||
+
|
||
+sub Mul_GF4_N { # not used, see next subroutine
|
||
+# multiply and scale by N
|
||
+my ($x0,$x1,$y0,$y1,$t0)=@_;
|
||
+$code.=<<___;
|
||
+ veor $t0, $y0, $y1
|
||
+ vand $t0, $t0, $x0
|
||
+ veor $x0, $x0, $x1
|
||
+ vand $x1, $x1, $y0
|
||
+ vand $x0, $x0, $y1
|
||
+ veor $x1, $x1, $x0
|
||
+ veor $x0, $x0, $t0
|
||
+___
|
||
+}
|
||
+
|
||
+sub Mul_GF4_N_GF4 {
|
||
+# interleaved Mul_GF4_N and Mul_GF4
|
||
+my ($x0,$x1,$y0,$y1,$t0,
|
||
+ $x2,$x3,$y2,$y3,$t1)=@_;
|
||
+$code.=<<___;
|
||
+ veor $t0, $y0, $y1
|
||
+ veor $t1, $y2, $y3
|
||
+ vand $t0, $t0, $x0
|
||
+ vand $t1, $t1, $x2
|
||
+ veor $x0, $x0, $x1
|
||
+ veor $x2, $x2, $x3
|
||
+ vand $x1, $x1, $y0
|
||
+ vand $x3, $x3, $y2
|
||
+ vand $x0, $x0, $y1
|
||
+ vand $x2, $x2, $y3
|
||
+ veor $x1, $x1, $x0
|
||
+ veor $x2, $x2, $x3
|
||
+ veor $x0, $x0, $t0
|
||
+ veor $x3, $x3, $t1
|
||
+___
|
||
+}
|
||
+sub Mul_GF16_2 {
|
||
+my @x=@_[0..7];
|
||
+my @y=@_[8..11];
|
||
+my @t=@_[12..15];
|
||
+$code.=<<___;
|
||
+ veor @t[0], @x[0], @x[2]
|
||
+ veor @t[1], @x[1], @x[3]
|
||
+___
|
||
+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
|
||
+$code.=<<___;
|
||
+ veor @y[0], @y[0], @y[2]
|
||
+ veor @y[1], @y[1], @y[3]
|
||
+___
|
||
+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
|
||
+ @x[2], @x[3], @y[2], @y[3], @t[2]);
|
||
+$code.=<<___;
|
||
+ veor @x[0], @x[0], @t[0]
|
||
+ veor @x[2], @x[2], @t[0]
|
||
+ veor @x[1], @x[1], @t[1]
|
||
+ veor @x[3], @x[3], @t[1]
|
||
+
|
||
+ veor @t[0], @x[4], @x[6]
|
||
+ veor @t[1], @x[5], @x[7]
|
||
+___
|
||
+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
|
||
+ @x[6], @x[7], @y[2], @y[3], @t[2]);
|
||
+$code.=<<___;
|
||
+ veor @y[0], @y[0], @y[2]
|
||
+ veor @y[1], @y[1], @y[3]
|
||
+___
|
||
+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
|
||
+$code.=<<___;
|
||
+ veor @x[4], @x[4], @t[0]
|
||
+ veor @x[6], @x[6], @t[0]
|
||
+ veor @x[5], @x[5], @t[1]
|
||
+ veor @x[7], @x[7], @t[1]
|
||
+___
|
||
+}
|
||
+sub Inv_GF256 {
|
||
+#;********************************************************************
|
||
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
|
||
+#;********************************************************************
|
||
+my @x=@_[0..7];
|
||
+my @t=@_[8..11];
|
||
+my @s=@_[12..15];
|
||
+# direct optimizations from hardware
|
||
+$code.=<<___;
|
||
+ veor @t[3], @x[4], @x[6]
|
||
+ veor @t[2], @x[5], @x[7]
|
||
+ veor @t[1], @x[1], @x[3]
|
||
+ veor @s[1], @x[7], @x[6]
|
||
+ vmov @t[0], @t[2]
|
||
+ veor @s[0], @x[0], @x[2]
|
||
+
|
||
+ vorr @t[2], @t[2], @t[1]
|
||
+ veor @s[3], @t[3], @t[0]
|
||
+ vand @s[2], @t[3], @s[0]
|
||
+ vorr @t[3], @t[3], @s[0]
|
||
+ veor @s[0], @s[0], @t[1]
|
||
+ vand @t[0], @t[0], @t[1]
|
||
+ veor @t[1], @x[3], @x[2]
|
||
+ vand @s[3], @s[3], @s[0]
|
||
+ vand @s[1], @s[1], @t[1]
|
||
+ veor @t[1], @x[4], @x[5]
|
||
+ veor @s[0], @x[1], @x[0]
|
||
+ veor @t[3], @t[3], @s[1]
|
||
+ veor @t[2], @t[2], @s[1]
|
||
+ vand @s[1], @t[1], @s[0]
|
||
+ vorr @t[1], @t[1], @s[0]
|
||
+ veor @t[3], @t[3], @s[3]
|
||
+ veor @t[0], @t[0], @s[1]
|
||
+ veor @t[2], @t[2], @s[2]
|
||
+ veor @t[1], @t[1], @s[3]
|
||
+ veor @t[0], @t[0], @s[2]
|
||
+ vand @s[0], @x[7], @x[3]
|
||
+ veor @t[1], @t[1], @s[2]
|
||
+ vand @s[1], @x[6], @x[2]
|
||
+ vand @s[2], @x[5], @x[1]
|
||
+ vorr @s[3], @x[4], @x[0]
|
||
+ veor @t[3], @t[3], @s[0]
|
||
+ veor @t[1], @t[1], @s[2]
|
||
+ veor @t[0], @t[0], @s[3]
|
||
+ veor @t[2], @t[2], @s[1]
|
||
+
|
||
+ @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
|
||
+
|
||
+ @ new smaller inversion
|
||
+
|
||
+ vand @s[2], @t[3], @t[1]
|
||
+ vmov @s[0], @t[0]
|
||
+
|
||
+ veor @s[1], @t[2], @s[2]
|
||
+ veor @s[3], @t[0], @s[2]
|
||
+ veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
|
||
+
|
||
+ vbsl @s[1], @t[1], @t[0]
|
||
+ vbsl @s[3], @t[3], @t[2]
|
||
+ veor @t[3], @t[3], @t[2]
|
||
+
|
||
+ vbsl @s[0], @s[1], @s[2]
|
||
+ vbsl @t[0], @s[2], @s[1]
|
||
+
|
||
+ vand @s[2], @s[0], @s[3]
|
||
+ veor @t[1], @t[1], @t[0]
|
||
+
|
||
+ veor @s[2], @s[2], @t[3]
|
||
+___
|
||
+# output in s3, s2, s1, t1
|
||
+
|
||
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
|
||
+
|
||
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
|
||
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
|
||
+
|
||
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
|
||
+}
|
||
+
|
||
+# AES linear components
|
||
+
|
||
+sub ShiftRows {
|
||
+my @x=@_[0..7];
|
||
+my @t=@_[8..11];
|
||
+my $mask=pop;
|
||
+$code.=<<___;
|
||
+ vldmia $key!, {@t[0]-@t[3]}
|
||
+ veor @t[0], @t[0], @x[0]
|
||
+ veor @t[1], @t[1], @x[1]
|
||
+ vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
|
||
+ vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
|
||
+ vldmia $key!, {@t[0]}
|
||
+ veor @t[2], @t[2], @x[2]
|
||
+ vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
|
||
+ vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
|
||
+ vldmia $key!, {@t[1]}
|
||
+ veor @t[3], @t[3], @x[3]
|
||
+ vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
|
||
+ vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
|
||
+ vldmia $key!, {@t[2]}
|
||
+ vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
|
||
+ vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
|
||
+ vldmia $key!, {@t[3]}
|
||
+ veor @t[0], @t[0], @x[4]
|
||
+ veor @t[1], @t[1], @x[5]
|
||
+ vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
|
||
+ vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
|
||
+ veor @t[2], @t[2], @x[6]
|
||
+ vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
|
||
+ vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
|
||
+ veor @t[3], @t[3], @x[7]
|
||
+ vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
|
||
+ vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
|
||
+ vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
|
||
+ vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
|
||
+___
|
||
+}
|
||
+
|
||
+sub MixColumns {
|
||
+# modified to emit output in order suitable for feeding back to aesenc[last]
|
||
+my @x=@_[0..7];
|
||
+my @t=@_[8..15];
|
||
+my $inv=@_[16]; # optional
|
||
+$code.=<<___;
|
||
+ vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
|
||
+ vext.8 @t[1], @x[1], @x[1], #12
|
||
+ veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
|
||
+ vext.8 @t[2], @x[2], @x[2], #12
|
||
+ veor @x[1], @x[1], @t[1]
|
||
+ vext.8 @t[3], @x[3], @x[3], #12
|
||
+ veor @x[2], @x[2], @t[2]
|
||
+ vext.8 @t[4], @x[4], @x[4], #12
|
||
+ veor @x[3], @x[3], @t[3]
|
||
+ vext.8 @t[5], @x[5], @x[5], #12
|
||
+ veor @x[4], @x[4], @t[4]
|
||
+ vext.8 @t[6], @x[6], @x[6], #12
|
||
+ veor @x[5], @x[5], @t[5]
|
||
+ vext.8 @t[7], @x[7], @x[7], #12
|
||
+ veor @x[6], @x[6], @t[6]
|
||
+
|
||
+ veor @t[1], @t[1], @x[0]
|
||
+ veor @x[7], @x[7], @t[7]
|
||
+ vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
|
||
+ veor @t[2], @t[2], @x[1]
|
||
+ veor @t[0], @t[0], @x[7]
|
||
+ veor @t[1], @t[1], @x[7]
|
||
+ vext.8 @x[1], @x[1], @x[1], #8
|
||
+ veor @t[5], @t[5], @x[4]
|
||
+ veor @x[0], @x[0], @t[0]
|
||
+ veor @t[6], @t[6], @x[5]
|
||
+ veor @x[1], @x[1], @t[1]
|
||
+ vext.8 @t[0], @x[4], @x[4], #8
|
||
+ veor @t[4], @t[4], @x[3]
|
||
+ vext.8 @t[1], @x[5], @x[5], #8
|
||
+ veor @t[7], @t[7], @x[6]
|
||
+ vext.8 @x[4], @x[3], @x[3], #8
|
||
+ veor @t[3], @t[3], @x[2]
|
||
+ vext.8 @x[5], @x[7], @x[7], #8
|
||
+ veor @t[4], @t[4], @x[7]
|
||
+ vext.8 @x[3], @x[6], @x[6], #8
|
||
+ veor @t[3], @t[3], @x[7]
|
||
+ vext.8 @x[6], @x[2], @x[2], #8
|
||
+ veor @x[7], @t[1], @t[5]
|
||
+___
|
||
+$code.=<<___ if (!$inv);
|
||
+ veor @x[2], @t[0], @t[4]
|
||
+ veor @x[4], @x[4], @t[3]
|
||
+ veor @x[5], @x[5], @t[7]
|
||
+ veor @x[3], @x[3], @t[6]
|
||
+ @ vmov @x[2], @t[0]
|
||
+ veor @x[6], @x[6], @t[2]
|
||
+ @ vmov @x[7], @t[1]
|
||
+___
|
||
+$code.=<<___ if ($inv);
|
||
+ veor @t[3], @t[3], @x[4]
|
||
+ veor @x[5], @x[5], @t[7]
|
||
+ veor @x[2], @x[3], @t[6]
|
||
+ veor @x[3], @t[0], @t[4]
|
||
+ veor @x[4], @x[6], @t[2]
|
||
+ vmov @x[6], @t[3]
|
||
+ @ vmov @x[7], @t[1]
|
||
+___
|
||
+}
|
||
+
|
||
+sub InvMixColumns_orig {
|
||
+my @x=@_[0..7];
|
||
+my @t=@_[8..15];
|
||
+
|
||
+$code.=<<___;
|
||
+ @ multiplication by 0x0e
|
||
+ vext.8 @t[7], @x[7], @x[7], #12
|
||
+ vmov @t[2], @x[2]
|
||
+ veor @x[2], @x[2], @x[5] @ 2 5
|
||
+ veor @x[7], @x[7], @x[5] @ 7 5
|
||
+ vext.8 @t[0], @x[0], @x[0], #12
|
||
+ vmov @t[5], @x[5]
|
||
+ veor @x[5], @x[5], @x[0] @ 5 0 [1]
|
||
+ veor @x[0], @x[0], @x[1] @ 0 1
|
||
+ vext.8 @t[1], @x[1], @x[1], #12
|
||
+ veor @x[1], @x[1], @x[2] @ 1 25
|
||
+ veor @x[0], @x[0], @x[6] @ 01 6 [2]
|
||
+ vext.8 @t[3], @x[3], @x[3], #12
|
||
+ veor @x[1], @x[1], @x[3] @ 125 3 [4]
|
||
+ veor @x[2], @x[2], @x[0] @ 25 016 [3]
|
||
+ veor @x[3], @x[3], @x[7] @ 3 75
|
||
+ veor @x[7], @x[7], @x[6] @ 75 6 [0]
|
||
+ vext.8 @t[6], @x[6], @x[6], #12
|
||
+ vmov @t[4], @x[4]
|
||
+ veor @x[6], @x[6], @x[4] @ 6 4
|
||
+ veor @x[4], @x[4], @x[3] @ 4 375 [6]
|
||
+ veor @x[3], @x[3], @x[7] @ 375 756=36
|
||
+ veor @x[6], @x[6], @t[5] @ 64 5 [7]
|
||
+ veor @x[3], @x[3], @t[2] @ 36 2
|
||
+ vext.8 @t[5], @t[5], @t[5], #12
|
||
+ veor @x[3], @x[3], @t[4] @ 362 4 [5]
|
||
+___
|
||
+ my @y = @x[7,5,0,2,1,3,4,6];
|
||
+$code.=<<___;
|
||
+ @ multiplication by 0x0b
|
||
+ veor @y[1], @y[1], @y[0]
|
||
+ veor @y[0], @y[0], @t[0]
|
||
+ vext.8 @t[2], @t[2], @t[2], #12
|
||
+ veor @y[1], @y[1], @t[1]
|
||
+ veor @y[0], @y[0], @t[5]
|
||
+ vext.8 @t[4], @t[4], @t[4], #12
|
||
+ veor @y[1], @y[1], @t[6]
|
||
+ veor @y[0], @y[0], @t[7]
|
||
+ veor @t[7], @t[7], @t[6] @ clobber t[7]
|
||
+
|
||
+ veor @y[3], @y[3], @t[0]
|
||
+ veor @y[1], @y[1], @y[0]
|
||
+ vext.8 @t[0], @t[0], @t[0], #12
|
||
+ veor @y[2], @y[2], @t[1]
|
||
+ veor @y[4], @y[4], @t[1]
|
||
+ vext.8 @t[1], @t[1], @t[1], #12
|
||
+ veor @y[2], @y[2], @t[2]
|
||
+ veor @y[3], @y[3], @t[2]
|
||
+ veor @y[5], @y[5], @t[2]
|
||
+ veor @y[2], @y[2], @t[7]
|
||
+ vext.8 @t[2], @t[2], @t[2], #12
|
||
+ veor @y[3], @y[3], @t[3]
|
||
+ veor @y[6], @y[6], @t[3]
|
||
+ veor @y[4], @y[4], @t[3]
|
||
+ veor @y[7], @y[7], @t[4]
|
||
+ vext.8 @t[3], @t[3], @t[3], #12
|
||
+ veor @y[5], @y[5], @t[4]
|
||
+ veor @y[7], @y[7], @t[7]
|
||
+ veor @t[7], @t[7], @t[5] @ clobber t[7] even more
|
||
+ veor @y[3], @y[3], @t[5]
|
||
+ veor @y[4], @y[4], @t[4]
|
||
+
|
||
+ veor @y[5], @y[5], @t[7]
|
||
+ vext.8 @t[4], @t[4], @t[4], #12
|
||
+ veor @y[6], @y[6], @t[7]
|
||
+ veor @y[4], @y[4], @t[7]
|
||
+
|
||
+ veor @t[7], @t[7], @t[5]
|
||
+ vext.8 @t[5], @t[5], @t[5], #12
|
||
+
|
||
+ @ multiplication by 0x0d
|
||
+ veor @y[4], @y[4], @y[7]
|
||
+ veor @t[7], @t[7], @t[6] @ restore t[7]
|
||
+ veor @y[7], @y[7], @t[4]
|
||
+ vext.8 @t[6], @t[6], @t[6], #12
|
||
+ veor @y[2], @y[2], @t[0]
|
||
+ veor @y[7], @y[7], @t[5]
|
||
+ vext.8 @t[7], @t[7], @t[7], #12
|
||
+ veor @y[2], @y[2], @t[2]
|
||
+
|
||
+ veor @y[3], @y[3], @y[1]
|
||
+ veor @y[1], @y[1], @t[1]
|
||
+ veor @y[0], @y[0], @t[0]
|
||
+ veor @y[3], @y[3], @t[0]
|
||
+ veor @y[1], @y[1], @t[5]
|
||
+ veor @y[0], @y[0], @t[5]
|
||
+ vext.8 @t[0], @t[0], @t[0], #12
|
||
+ veor @y[1], @y[1], @t[7]
|
||
+ veor @y[0], @y[0], @t[6]
|
||
+ veor @y[3], @y[3], @y[1]
|
||
+ veor @y[4], @y[4], @t[1]
|
||
+ vext.8 @t[1], @t[1], @t[1], #12
|
||
+
|
||
+ veor @y[7], @y[7], @t[7]
|
||
+ veor @y[4], @y[4], @t[2]
|
||
+ veor @y[5], @y[5], @t[2]
|
||
+ veor @y[2], @y[2], @t[6]
|
||
+ veor @t[6], @t[6], @t[3] @ clobber t[6]
|
||
+ vext.8 @t[2], @t[2], @t[2], #12
|
||
+ veor @y[4], @y[4], @y[7]
|
||
+ veor @y[3], @y[3], @t[6]
|
||
+
|
||
+ veor @y[6], @y[6], @t[6]
|
||
+ veor @y[5], @y[5], @t[5]
|
||
+ vext.8 @t[5], @t[5], @t[5], #12
|
||
+ veor @y[6], @y[6], @t[4]
|
||
+ vext.8 @t[4], @t[4], @t[4], #12
|
||
+ veor @y[5], @y[5], @t[6]
|
||
+ veor @y[6], @y[6], @t[7]
|
||
+ vext.8 @t[7], @t[7], @t[7], #12
|
||
+ veor @t[6], @t[6], @t[3] @ restore t[6]
|
||
+ vext.8 @t[3], @t[3], @t[3], #12
|
||
+
|
||
+ @ multiplication by 0x09
|
||
+ veor @y[4], @y[4], @y[1]
|
||
+ veor @t[1], @t[1], @y[1] @ t[1]=y[1]
|
||
+ veor @t[0], @t[0], @t[5] @ clobber t[0]
|
||
+ vext.8 @t[6], @t[6], @t[6], #12
|
||
+ veor @t[1], @t[1], @t[5]
|
||
+ veor @y[3], @y[3], @t[0]
|
||
+ veor @t[0], @t[0], @y[0] @ t[0]=y[0]
|
||
+ veor @t[1], @t[1], @t[6]
|
||
+ veor @t[6], @t[6], @t[7] @ clobber t[6]
|
||
+ veor @y[4], @y[4], @t[1]
|
||
+ veor @y[7], @y[7], @t[4]
|
||
+ veor @y[6], @y[6], @t[3]
|
||
+ veor @y[5], @y[5], @t[2]
|
||
+ veor @t[4], @t[4], @y[4] @ t[4]=y[4]
|
||
+ veor @t[3], @t[3], @y[3] @ t[3]=y[3]
|
||
+ veor @t[5], @t[5], @y[5] @ t[5]=y[5]
|
||
+ veor @t[2], @t[2], @y[2] @ t[2]=y[2]
|
||
+ veor @t[3], @t[3], @t[7]
|
||
+ veor @XMM[5], @t[5], @t[6]
|
||
+ veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
|
||
+ veor @XMM[2], @t[2], @t[6]
|
||
+ veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
|
||
+
|
||
+ vmov @XMM[0], @t[0]
|
||
+ vmov @XMM[1], @t[1]
|
||
+ @ vmov @XMM[2], @t[2]
|
||
+ vmov @XMM[3], @t[3]
|
||
+ vmov @XMM[4], @t[4]
|
||
+ @ vmov @XMM[5], @t[5]
|
||
+ @ vmov @XMM[6], @t[6]
|
||
+ @ vmov @XMM[7], @t[7]
|
||
+___
|
||
+}
|
||
+
|
||
+sub InvMixColumns {
|
||
+my @x=@_[0..7];
|
||
+my @t=@_[8..15];
|
||
+
|
||
+# Thanks to Jussi Kivilinna for providing pointer to
|
||
+#
|
||
+# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
|
||
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
|
||
+# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
|
||
+# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
|
||
+
|
||
+$code.=<<___;
|
||
+ @ multiplication by 0x05-0x00-0x04-0x00
|
||
+ vext.8 @t[0], @x[0], @x[0], #8
|
||
+ vext.8 @t[6], @x[6], @x[6], #8
|
||
+ vext.8 @t[7], @x[7], @x[7], #8
|
||
+ veor @t[0], @t[0], @x[0]
|
||
+ vext.8 @t[1], @x[1], @x[1], #8
|
||
+ veor @t[6], @t[6], @x[6]
|
||
+ vext.8 @t[2], @x[2], @x[2], #8
|
||
+ veor @t[7], @t[7], @x[7]
|
||
+ vext.8 @t[3], @x[3], @x[3], #8
|
||
+ veor @t[1], @t[1], @x[1]
|
||
+ vext.8 @t[4], @x[4], @x[4], #8
|
||
+ veor @t[2], @t[2], @x[2]
|
||
+ vext.8 @t[5], @x[5], @x[5], #8
|
||
+ veor @t[3], @t[3], @x[3]
|
||
+ veor @t[4], @t[4], @x[4]
|
||
+ veor @t[5], @t[5], @x[5]
|
||
+
|
||
+ veor @x[0], @x[0], @t[6]
|
||
+ veor @x[1], @x[1], @t[6]
|
||
+ veor @x[2], @x[2], @t[0]
|
||
+ veor @x[4], @x[4], @t[2]
|
||
+ veor @x[3], @x[3], @t[1]
|
||
+ veor @x[1], @x[1], @t[7]
|
||
+ veor @x[2], @x[2], @t[7]
|
||
+ veor @x[4], @x[4], @t[6]
|
||
+ veor @x[5], @x[5], @t[3]
|
||
+ veor @x[3], @x[3], @t[6]
|
||
+ veor @x[6], @x[6], @t[4]
|
||
+ veor @x[4], @x[4], @t[7]
|
||
+ veor @x[5], @x[5], @t[7]
|
||
+ veor @x[7], @x[7], @t[5]
|
||
+___
|
||
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
|
||
+}
|
||
+
|
||
+sub swapmove {
|
||
+my ($a,$b,$n,$mask,$t)=@_;
|
||
+$code.=<<___;
|
||
+ vshr.u64 $t, $b, #$n
|
||
+ veor $t, $t, $a
|
||
+ vand $t, $t, $mask
|
||
+ veor $a, $a, $t
|
||
+ vshl.u64 $t, $t, #$n
|
||
+ veor $b, $b, $t
|
||
+___
|
||
+}
|
||
+sub swapmove2x {
|
||
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
|
||
+$code.=<<___;
|
||
+ vshr.u64 $t0, $b0, #$n
|
||
+ vshr.u64 $t1, $b1, #$n
|
||
+ veor $t0, $t0, $a0
|
||
+ veor $t1, $t1, $a1
|
||
+ vand $t0, $t0, $mask
|
||
+ vand $t1, $t1, $mask
|
||
+ veor $a0, $a0, $t0
|
||
+ vshl.u64 $t0, $t0, #$n
|
||
+ veor $a1, $a1, $t1
|
||
+ vshl.u64 $t1, $t1, #$n
|
||
+ veor $b0, $b0, $t0
|
||
+ veor $b1, $b1, $t1
|
||
+___
|
||
+}
|
||
+
|
||
+sub bitslice {
|
||
+my @x=reverse(@_[0..7]);
|
||
+my ($t0,$t1,$t2,$t3)=@_[8..11];
|
||
+$code.=<<___;
|
||
+ vmov.i8 $t0,#0x55 @ compose .LBS0
|
||
+ vmov.i8 $t1,#0x33 @ compose .LBS1
|
||
+___
|
||
+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
|
||
+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
|
||
+$code.=<<___;
|
||
+ vmov.i8 $t0,#0x0f @ compose .LBS2
|
||
+___
|
||
+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
|
||
+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
|
||
+
|
||
+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
|
||
+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
|
||
+}
|
||
+
|
||
+$code.=<<___;
|
||
+#ifndef __KERNEL__
|
||
+# include "arm_arch.h"
|
||
+
|
||
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
|
||
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
|
||
+# define VFP_ABI_FRAME 0x40
|
||
+#else
|
||
+# define VFP_ABI_PUSH
|
||
+# define VFP_ABI_POP
|
||
+# define VFP_ABI_FRAME 0
|
||
+# define BSAES_ASM_EXTENDED_KEY
|
||
+# define XTS_CHAIN_TWEAK
|
||
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
|
||
+#endif
|
||
+
|
||
+#ifdef __thumb__
|
||
+# define adrl adr
|
||
+#endif
|
||
+
|
||
+#if __ARM_ARCH__>=7
|
||
+.text
|
||
+.syntax unified @ ARMv7-capable assembler is expected to handle this
|
||
+#ifdef __thumb2__
|
||
+.thumb
|
||
+#else
|
||
+.code 32
|
||
+#endif
|
||
+
|
||
+.fpu neon
|
||
+
|
||
+.type _bsaes_decrypt8,%function
|
||
+.align 4
|
||
+_bsaes_decrypt8:
|
||
+ adr $const,_bsaes_decrypt8
|
||
+ vldmia $key!, {@XMM[9]} @ round 0 key
|
||
+ add $const,$const,#.LM0ISR-_bsaes_decrypt8
|
||
+
|
||
+ vldmia $const!, {@XMM[8]} @ .LM0ISR
|
||
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
|
||
+ veor @XMM[11], @XMM[1], @XMM[9]
|
||
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
|
||
+ veor @XMM[12], @XMM[2], @XMM[9]
|
||
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
|
||
+ veor @XMM[13], @XMM[3], @XMM[9]
|
||
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
|
||
+ veor @XMM[14], @XMM[4], @XMM[9]
|
||
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
|
||
+ veor @XMM[15], @XMM[5], @XMM[9]
|
||
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
|
||
+ veor @XMM[10], @XMM[6], @XMM[9]
|
||
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
|
||
+ veor @XMM[11], @XMM[7], @XMM[9]
|
||
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
|
||
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
|
||
+___
|
||
+ &bitslice (@XMM[0..7, 8..11]);
|
||
+$code.=<<___;
|
||
+ sub $rounds,$rounds,#1
|
||
+ b .Ldec_sbox
|
||
+.align 4
|
||
+.Ldec_loop:
|
||
+___
|
||
+ &ShiftRows (@XMM[0..7, 8..12]);
|
||
+$code.=".Ldec_sbox:\n";
|
||
+ &InvSbox (@XMM[0..7, 8..15]);
|
||
+$code.=<<___;
|
||
+ subs $rounds,$rounds,#1
|
||
+ bcc .Ldec_done
|
||
+___
|
||
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
|
||
+$code.=<<___;
|
||
+ vldmia $const, {@XMM[12]} @ .LISR
|
||
+ ite eq @ Thumb2 thing, sanity check in ARM
|
||
+ addeq $const,$const,#0x10
|
||
+ bne .Ldec_loop
|
||
+ vldmia $const, {@XMM[12]} @ .LISRM0
|
||
+ b .Ldec_loop
|
||
+.align 4
|
||
+.Ldec_done:
|
||
+___
|
||
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
|
||
+$code.=<<___;
|
||
+ vldmia $key, {@XMM[8]} @ last round key
|
||
+ veor @XMM[6], @XMM[6], @XMM[8]
|
||
+ veor @XMM[4], @XMM[4], @XMM[8]
|
||
+ veor @XMM[2], @XMM[2], @XMM[8]
|
||
+ veor @XMM[7], @XMM[7], @XMM[8]
|
||
+ veor @XMM[3], @XMM[3], @XMM[8]
|
||
+ veor @XMM[5], @XMM[5], @XMM[8]
|
||
+ veor @XMM[0], @XMM[0], @XMM[8]
|
||
+ veor @XMM[1], @XMM[1], @XMM[8]
|
||
+ bx lr
|
||
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
|
||
+
|
||
+.type _bsaes_const,%object
|
||
+.align 6
|
||
+_bsaes_const:
|
||
+.LM0ISR: @ InvShiftRows constants
|
||
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
|
||
+.LISR:
|
||
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
|
||
+.LISRM0:
|
||
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
|
||
+.LM0SR: @ ShiftRows constants
|
||
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
|
||
+.LSR:
|
||
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
|
||
+.LSRM0:
|
||
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
|
||
+.LM0:
|
||
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
|
||
+.LREVM0SR:
|
||
+ .quad 0x090d01050c000408, 0x03070b0f060a0e02
|
||
+.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
|
||
+.align 6
|
||
+.size _bsaes_const,.-_bsaes_const
|
||
+
|
||
+.type _bsaes_encrypt8,%function
|
||
+.align 4
|
||
+_bsaes_encrypt8:
|
||
+ adr $const,_bsaes_encrypt8
|
||
+ vldmia $key!, {@XMM[9]} @ round 0 key
|
||
+ sub $const,$const,#_bsaes_encrypt8-.LM0SR
|
||
+
|
||
+ vldmia $const!, {@XMM[8]} @ .LM0SR
|
||
+_bsaes_encrypt8_alt:
|
||
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
|
||
+ veor @XMM[11], @XMM[1], @XMM[9]
|
||
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
|
||
+ veor @XMM[12], @XMM[2], @XMM[9]
|
||
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
|
||
+ veor @XMM[13], @XMM[3], @XMM[9]
|
||
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
|
||
+ veor @XMM[14], @XMM[4], @XMM[9]
|
||
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
|
||
+ veor @XMM[15], @XMM[5], @XMM[9]
|
||
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
|
||
+ veor @XMM[10], @XMM[6], @XMM[9]
|
||
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
|
||
+ veor @XMM[11], @XMM[7], @XMM[9]
|
||
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
|
||
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
|
||
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
|
||
+_bsaes_encrypt8_bitslice:
|
||
+___
|
||
+ &bitslice (@XMM[0..7, 8..11]);
|
||
+$code.=<<___;
|
||
+ sub $rounds,$rounds,#1
|
||
+ b .Lenc_sbox
|
||
+.align 4
|
||
+.Lenc_loop:
|
||
+___
|
||
+ &ShiftRows (@XMM[0..7, 8..12]);
|
||
+$code.=".Lenc_sbox:\n";
|
||
+ &Sbox (@XMM[0..7, 8..15]);
|
||
+$code.=<<___;
|
||
+ subs $rounds,$rounds,#1
|
||
+ bcc .Lenc_done
|
||
+___
|
||
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
|
||
+$code.=<<___;
|
||
+ vldmia $const, {@XMM[12]} @ .LSR
|
||
+ ite eq @ Thumb2 thing, sanity check in ARM
|
||
+ addeq $const,$const,#0x10
|
||
+ bne .Lenc_loop
|
||
+ vldmia $const, {@XMM[12]} @ .LSRM0
|
||
+ b .Lenc_loop
|
||
+.align 4
|
||
+.Lenc_done:
|
||
+___
|
||
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
|
||
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
|
||
+$code.=<<___;
|
||
+ vldmia $key, {@XMM[8]} @ last round key
|
||
+ veor @XMM[4], @XMM[4], @XMM[8]
|
||
+ veor @XMM[6], @XMM[6], @XMM[8]
|
||
+ veor @XMM[3], @XMM[3], @XMM[8]
|
||
+ veor @XMM[7], @XMM[7], @XMM[8]
|
||
+ veor @XMM[2], @XMM[2], @XMM[8]
|
||
+ veor @XMM[5], @XMM[5], @XMM[8]
|
||
+ veor @XMM[0], @XMM[0], @XMM[8]
|
||
+ veor @XMM[1], @XMM[1], @XMM[8]
|
||
+ bx lr
|
||
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
|
||
+___
|
||
+}
|
||
+{
|
||
+my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
|
||
+
|
||
+sub bitslice_key { # appends generated code to $code; expects 8 operands followed by 5 more (see below)
|
||
+my @x=reverse(@_[0..7]); # first eight arguments, in reversed order
|
||
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; # arguments 8..12; presumably three bit masks and two temporaries -- TODO confirm against swapmove
|
||
+
|
||
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3); # helper defined outside this view; only the @x[0,1] pair is passed, @x[2,3] are vmov copies emitted below
|
||
+$code.=<<___;
|
||
+ @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
|
||
+ vmov @x[2], @x[0]
|
||
+ vmov @x[3], @x[1]
|
||
+___
|
||
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
|
||
+
|
||
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); # helper defined outside this view; only @x[0..3] are passed, @x[4..7] are vmov copies emitted below
|
||
+$code.=<<___;
|
||
+ @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
|
||
+ vmov @x[4], @x[0]
|
||
+ vmov @x[6], @x[2]
|
||
+ vmov @x[5], @x[1]
|
||
+ vmov @x[7], @x[3]
|
||
+___
|
||
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); # last stage covers all eight entries: this call and the next one
|
||
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); # same constant (4) and mask ($bs2) as the call above
|
||
+}
|
||
+
|
||
+$code.=<<___;
|
||
+.type _bsaes_key_convert,%function
|
||
+.align 4
|
||
+_bsaes_key_convert:
|
||
+ adr $const,_bsaes_key_convert
|
||
+ vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
|
||
+ sub $const,$const,#_bsaes_key_convert-.LM0
|
||
+ vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
|
||
+
|
||
+ vmov.i8 @XMM[8], #0x01 @ bit masks
|
||
+ vmov.i8 @XMM[9], #0x02
|
||
+ vmov.i8 @XMM[10], #0x04
|
||
+ vmov.i8 @XMM[11], #0x08
|
||
+ vmov.i8 @XMM[12], #0x10
|
||
+ vmov.i8 @XMM[13], #0x20
|
||
+ vldmia $const, {@XMM[14]} @ .LM0
|
||
+
|
||
+#ifdef __ARMEL__
|
||
+ vrev32.8 @XMM[7], @XMM[7]
|
||
+ vrev32.8 @XMM[15], @XMM[15]
|
||
+#endif
|
||
+ sub $rounds,$rounds,#1
|
||
+ vstmia $out!, {@XMM[7]} @ save round 0 key
|
||
+ b .Lkey_loop
|
||
+
|
||
+.align 4
|
||
+.Lkey_loop:
|
||
+ vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
|
||
+ vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
|
||
+ vmov.i8 @XMM[6], #0x40
|
||
+ vmov.i8 @XMM[15], #0x80
|
||
+
|
||
+ vtst.8 @XMM[0], @XMM[7], @XMM[8]
|
||
+ vtst.8 @XMM[1], @XMM[7], @XMM[9]
|
||
+ vtst.8 @XMM[2], @XMM[7], @XMM[10]
|
||
+ vtst.8 @XMM[3], @XMM[7], @XMM[11]
|
||
+ vtst.8 @XMM[4], @XMM[7], @XMM[12]
|
||
+ vtst.8 @XMM[5], @XMM[7], @XMM[13]
|
||
+ vtst.8 @XMM[6], @XMM[7], @XMM[6]
|
||
+ vtst.8 @XMM[7], @XMM[7], @XMM[15]
|
||
+ vld1.8 {@XMM[15]}, [$inp]! @ load next round key
|
||
+ vmvn @XMM[0], @XMM[0] @ "pnot"
|
||
+ vmvn @XMM[1], @XMM[1]
|
||
+ vmvn @XMM[5], @XMM[5]
|
||
+ vmvn @XMM[6], @XMM[6]
|
||
+#ifdef __ARMEL__
|
||
+ vrev32.8 @XMM[15], @XMM[15]
|
||
+#endif
|
||
+ subs $rounds,$rounds,#1
|
||
+ vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
|
||
+ bne .Lkey_loop
|
||
+
|
||
+ vmov.i8 @XMM[7],#0x63 @ compose .L63
|
||
+ @ don't save last round key
|
||
+ bx lr
|
||
+.size _bsaes_key_convert,.-_bsaes_key_convert
|
||
+___
|
||
+}
|
||
+
|
||
+if (0) { # following four functions are unsupported interface
|
||
+ # used for benchmarking...
|
||
+$code.=<<___;
|
||
+.globl bsaes_enc_key_convert
|
||
+.type bsaes_enc_key_convert,%function
|
||
+.align 4
|
||
+bsaes_enc_key_convert:
|
||
+ stmdb sp!,{r4-r6,lr}
|
||
+ vstmdb sp!,{d8-d15} @ ABI specification says so
|
||
+
|
||
+ ldr r5,[$inp,#240] @ pass rounds
|
||
+ mov r4,$inp @ pass key
|
||
+ mov r12,$out @ pass key schedule
|
||
+ bl _bsaes_key_convert
|
||
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
|
||
+ vstmia r12, {@XMM[7]} @ save last round key
|
||
+
|
||
+ vldmia sp!,{d8-d15}
|
||
+ ldmia sp!,{r4-r6,pc}
|
||
+.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
|
||
+
|
||
+.globl bsaes_encrypt_128
|
||
+.type bsaes_encrypt_128,%function
|
||
+.align 4
|
||
+bsaes_encrypt_128:
|
||
+ stmdb sp!,{r4-r6,lr}
|
||
+ vstmdb sp!,{d8-d15} @ ABI specification says so
|
||
+.Lenc128_loop:
|
||
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
|
||
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
|
||
+ mov r4,$key @ pass the key
|
||
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
|
||
+ mov r5,#10 @ pass rounds
|
||
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
|
||
+
|
||
+ bl _bsaes_encrypt8
|
||
+
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
|
||
+ vst1.8 {@XMM[4]}, [$out]!
|
||
+ vst1.8 {@XMM[6]}, [$out]!
|
||
+ vst1.8 {@XMM[3]}, [$out]!
|
||
+ vst1.8 {@XMM[7]}, [$out]!
|
||
+ vst1.8 {@XMM[2]}, [$out]!
|
||
+ subs $len,$len,#0x80
|
||
+ vst1.8 {@XMM[5]}, [$out]!
|
||
+ bhi .Lenc128_loop
|
||
+
|
||
+ vldmia sp!,{d8-d15}
|
||
+ ldmia sp!,{r4-r6,pc}
|
||
+.size bsaes_encrypt_128,.-bsaes_encrypt_128
|
||
+
|
||
+.globl bsaes_dec_key_convert
|
||
+.type bsaes_dec_key_convert,%function
|
||
+.align 4
|
||
+bsaes_dec_key_convert:
|
||
+ stmdb sp!,{r4-r6,lr}
|
||
+ vstmdb sp!,{d8-d15} @ ABI specification says so
|
||
+
|
||
+ ldr r5,[$inp,#240] @ pass rounds
|
||
+ mov r4,$inp @ pass key
|
||
+ mov r12,$out @ pass key schedule
|
||
+ bl _bsaes_key_convert
|
||
+ vldmia $out, {@XMM[6]}
|
||
+ vstmia r12, {@XMM[15]} @ save last round key
|
||
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
|
||
+ vstmia $out, {@XMM[7]}
|
||
+
|
||
+ vldmia sp!,{d8-d15}
|
||
+ ldmia sp!,{r4-r6,pc}
|
||
+.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
|
||
+
|
||
+.globl bsaes_decrypt_128
|
||
+.type bsaes_decrypt_128,%function
|
||
+.align 4
|
||
+bsaes_decrypt_128:
|
||
+ stmdb sp!,{r4-r6,lr}
|
||
+ vstmdb sp!,{d8-d15} @ ABI specification says so
|
||
+.Ldec128_loop:
|
||
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
|
||
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
|
||
+ mov r4,$key @ pass the key
|
||
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
|
||
+ mov r5,#10 @ pass rounds
|
||
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
|
||
+
|
||
+ bl _bsaes_decrypt8
|
||
+
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
|
||
+ vst1.8 {@XMM[6]}, [$out]!
|
||
+ vst1.8 {@XMM[4]}, [$out]!
|
||
+ vst1.8 {@XMM[2]}, [$out]!
|
||
+ vst1.8 {@XMM[7]}, [$out]!
|
||
+ vst1.8 {@XMM[3]}, [$out]!
|
||
+ subs $len,$len,#0x80
|
||
+ vst1.8 {@XMM[5]}, [$out]!
|
||
+ bhi .Ldec128_loop
|
||
+
|
||
+ vldmia sp!,{d8-d15}
|
||
+ ldmia sp!,{r4-r6,pc}
|
||
+.size bsaes_decrypt_128,.-bsaes_decrypt_128
|
||
+___
|
||
+}
|
||
+{
|
||
+my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
|
||
+my ($keysched)=("sp");
|
||
+
|
||
+$code.=<<___;
|
||
+.extern AES_cbc_encrypt
|
||
+.extern AES_decrypt
|
||
+
|
||
+.global bsaes_cbc_encrypt
|
||
+.type bsaes_cbc_encrypt,%function
|
||
+.align 5
|
||
+bsaes_cbc_encrypt:
|
||
+#ifndef __KERNEL__
|
||
+ cmp $len, #128
|
||
+#ifndef __thumb__
|
||
+ blo AES_cbc_encrypt
|
||
+#else
|
||
+ bhs 1f
|
||
+ b AES_cbc_encrypt
|
||
+1:
|
||
+#endif
|
||
+#endif
|
||
+
|
||
+ @ it is up to the caller to make sure we are called with enc == 0
|
||
+
|
||
+ mov ip, sp
|
||
+ stmdb sp!, {r4-r10, lr}
|
||
+ VFP_ABI_PUSH
|
||
+ ldr $ivp, [ip] @ IV is 1st arg on the stack
|
||
+ mov $len, $len, lsr#4 @ len in 16 byte blocks
|
||
+ sub sp, #0x10 @ scratch space to carry over the IV
|
||
+ mov $fp, sp @ save sp
|
||
+
|
||
+ ldr $rounds, [$key, #240] @ get # of rounds
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ @ allocate the key schedule on the stack
|
||
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
|
||
+ add r12, #`128-32` @ size of bit-sliced key schedule
|
||
+
|
||
+ @ populate the key schedule
|
||
+ mov r4, $key @ pass key
|
||
+ mov r5, $rounds @ pass # of rounds
|
||
+ mov sp, r12 @ sp is $keysched
|
||
+ bl _bsaes_key_convert
|
||
+ vldmia $keysched, {@XMM[6]}
|
||
+ vstmia r12, {@XMM[15]} @ save last round key
|
||
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
|
||
+ vstmia $keysched, {@XMM[7]}
|
||
+#else
|
||
+ ldr r12, [$key, #244]
|
||
+ eors r12, #1
|
||
+ beq 0f
|
||
+
|
||
+ @ populate the key schedule
|
||
+ str r12, [$key, #244]
|
||
+ mov r4, $key @ pass key
|
||
+ mov r5, $rounds @ pass # of rounds
|
||
+ add r12, $key, #248 @ pass key schedule
|
||
+ bl _bsaes_key_convert
|
||
+ add r4, $key, #248
|
||
+ vldmia r4, {@XMM[6]}
|
||
+ vstmia r12, {@XMM[15]} @ save last round key
|
||
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
|
||
+ vstmia r4, {@XMM[7]}
|
||
+
|
||
+.align 2
|
||
+0:
|
||
+#endif
|
||
+
|
||
+ vld1.8 {@XMM[15]}, [$ivp] @ load IV
|
||
+ b .Lcbc_dec_loop
|
||
+
|
||
+.align 4
|
||
+.Lcbc_dec_loop:
|
||
+ subs $len, $len, #0x8
|
||
+ bmi .Lcbc_dec_loop_finish
|
||
+
|
||
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
|
||
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ mov r4, $keysched @ pass the key
|
||
+#else
|
||
+ add r4, $key, #248
|
||
+#endif
|
||
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
|
||
+ mov r5, $rounds
|
||
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
|
||
+ sub $inp, $inp, #0x60
|
||
+ vstmia $fp, {@XMM[15]} @ put aside IV
|
||
+
|
||
+ bl _bsaes_decrypt8
|
||
+
|
||
+ vldmia $fp, {@XMM[14]} @ reload IV
|
||
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
|
||
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
|
||
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
|
||
+ veor @XMM[1], @XMM[1], @XMM[8]
|
||
+ veor @XMM[6], @XMM[6], @XMM[9]
|
||
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
|
||
+ veor @XMM[4], @XMM[4], @XMM[10]
|
||
+ veor @XMM[2], @XMM[2], @XMM[11]
|
||
+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
|
||
+ veor @XMM[7], @XMM[7], @XMM[12]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
|
||
+ veor @XMM[3], @XMM[3], @XMM[13]
|
||
+ vst1.8 {@XMM[6]}, [$out]!
|
||
+ veor @XMM[5], @XMM[5], @XMM[14]
|
||
+ vst1.8 {@XMM[4]}, [$out]!
|
||
+ vst1.8 {@XMM[2]}, [$out]!
|
||
+ vst1.8 {@XMM[7]}, [$out]!
|
||
+ vst1.8 {@XMM[3]}, [$out]!
|
||
+ vst1.8 {@XMM[5]}, [$out]!
|
||
+
|
||
+ b .Lcbc_dec_loop
|
||
+
|
||
+.Lcbc_dec_loop_finish:
|
||
+ adds $len, $len, #8
|
||
+ beq .Lcbc_dec_done
|
||
+
|
||
+ vld1.8 {@XMM[0]}, [$inp]! @ load input
|
||
+ cmp $len, #2
|
||
+ blo .Lcbc_dec_one
|
||
+ vld1.8 {@XMM[1]}, [$inp]!
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ mov r4, $keysched @ pass the key
|
||
+#else
|
||
+ add r4, $key, #248
|
||
+#endif
|
||
+ mov r5, $rounds
|
||
+ vstmia $fp, {@XMM[15]} @ put aside IV
|
||
+ beq .Lcbc_dec_two
|
||
+ vld1.8 {@XMM[2]}, [$inp]!
|
||
+ cmp $len, #4
|
||
+ blo .Lcbc_dec_three
|
||
+ vld1.8 {@XMM[3]}, [$inp]!
|
||
+ beq .Lcbc_dec_four
|
||
+ vld1.8 {@XMM[4]}, [$inp]!
|
||
+ cmp $len, #6
|
||
+ blo .Lcbc_dec_five
|
||
+ vld1.8 {@XMM[5]}, [$inp]!
|
||
+ beq .Lcbc_dec_six
|
||
+ vld1.8 {@XMM[6]}, [$inp]!
|
||
+ sub $inp, $inp, #0x70
|
||
+
|
||
+ bl _bsaes_decrypt8
|
||
+
|
||
+ vldmia $fp, {@XMM[14]} @ reload IV
|
||
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
|
||
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
|
||
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
|
||
+ veor @XMM[1], @XMM[1], @XMM[8]
|
||
+ veor @XMM[6], @XMM[6], @XMM[9]
|
||
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
|
||
+ veor @XMM[4], @XMM[4], @XMM[10]
|
||
+ veor @XMM[2], @XMM[2], @XMM[11]
|
||
+ vld1.8 {@XMM[15]}, [$inp]!
|
||
+ veor @XMM[7], @XMM[7], @XMM[12]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
|
||
+ veor @XMM[3], @XMM[3], @XMM[13]
|
||
+ vst1.8 {@XMM[6]}, [$out]!
|
||
+ vst1.8 {@XMM[4]}, [$out]!
|
||
+ vst1.8 {@XMM[2]}, [$out]!
|
||
+ vst1.8 {@XMM[7]}, [$out]!
|
||
+ vst1.8 {@XMM[3]}, [$out]!
|
||
+ b .Lcbc_dec_done
|
||
+.align 4
|
||
+.Lcbc_dec_six:
|
||
+ sub $inp, $inp, #0x60
|
||
+ bl _bsaes_decrypt8
|
||
+ vldmia $fp,{@XMM[14]} @ reload IV
|
||
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
|
||
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
|
||
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
|
||
+ veor @XMM[1], @XMM[1], @XMM[8]
|
||
+ veor @XMM[6], @XMM[6], @XMM[9]
|
||
+ vld1.8 {@XMM[12]}, [$inp]!
|
||
+ veor @XMM[4], @XMM[4], @XMM[10]
|
||
+ veor @XMM[2], @XMM[2], @XMM[11]
|
||
+ vld1.8 {@XMM[15]}, [$inp]!
|
||
+ veor @XMM[7], @XMM[7], @XMM[12]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
|
||
+ vst1.8 {@XMM[6]}, [$out]!
|
||
+ vst1.8 {@XMM[4]}, [$out]!
|
||
+ vst1.8 {@XMM[2]}, [$out]!
|
||
+ vst1.8 {@XMM[7]}, [$out]!
|
||
+ b .Lcbc_dec_done
|
||
+.align 4
|
||
+.Lcbc_dec_five:
|
||
+ sub $inp, $inp, #0x50
|
||
+ bl _bsaes_decrypt8
|
||
+ vldmia $fp, {@XMM[14]} @ reload IV
|
||
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
|
||
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
|
||
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
|
||
+ veor @XMM[1], @XMM[1], @XMM[8]
|
||
+ veor @XMM[6], @XMM[6], @XMM[9]
|
||
+ vld1.8 {@XMM[15]}, [$inp]!
|
||
+ veor @XMM[4], @XMM[4], @XMM[10]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
|
||
+ veor @XMM[2], @XMM[2], @XMM[11]
|
||
+ vst1.8 {@XMM[6]}, [$out]!
|
||
+ vst1.8 {@XMM[4]}, [$out]!
|
||
+ vst1.8 {@XMM[2]}, [$out]!
|
||
+ b .Lcbc_dec_done
|
||
+.align 4
|
||
+.Lcbc_dec_four:
|
||
+ sub $inp, $inp, #0x40
|
||
+ bl _bsaes_decrypt8
|
||
+ vldmia $fp, {@XMM[14]} @ reload IV
|
||
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
|
||
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
|
||
+ vld1.8 {@XMM[10]}, [$inp]!
|
||
+ veor @XMM[1], @XMM[1], @XMM[8]
|
||
+ veor @XMM[6], @XMM[6], @XMM[9]
|
||
+ vld1.8 {@XMM[15]}, [$inp]!
|
||
+ veor @XMM[4], @XMM[4], @XMM[10]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
|
||
+ vst1.8 {@XMM[6]}, [$out]!
|
||
+ vst1.8 {@XMM[4]}, [$out]!
|
||
+ b .Lcbc_dec_done
|
||
+.align 4
|
||
+.Lcbc_dec_three:
|
||
+ sub $inp, $inp, #0x30
|
||
+ bl _bsaes_decrypt8
|
||
+ vldmia $fp, {@XMM[14]} @ reload IV
|
||
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
|
||
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
|
||
+ vld1.8 {@XMM[15]}, [$inp]!
|
||
+ veor @XMM[1], @XMM[1], @XMM[8]
|
||
+ veor @XMM[6], @XMM[6], @XMM[9]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
|
||
+ vst1.8 {@XMM[6]}, [$out]!
|
||
+ b .Lcbc_dec_done
|
||
+.align 4
|
||
+.Lcbc_dec_two:
|
||
+ sub $inp, $inp, #0x20
|
||
+ bl _bsaes_decrypt8
|
||
+ vldmia $fp, {@XMM[14]} @ reload IV
|
||
+ vld1.8 {@XMM[8]}, [$inp]! @ reload input
|
||
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
|
||
+ vld1.8 {@XMM[15]}, [$inp]! @ reload input
|
||
+ veor @XMM[1], @XMM[1], @XMM[8]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
|
||
+ b .Lcbc_dec_done
|
||
+.align 4
|
||
+.Lcbc_dec_one:
|
||
+ sub $inp, $inp, #0x10
|
||
+ mov $rounds, $out @ save original out pointer
|
||
+ mov $out, $fp @ use the iv scratch space as out buffer
|
||
+ mov r2, $key
|
||
+ vmov @XMM[4],@XMM[15] @ just in case ensure that IV
|
||
+ vmov @XMM[5],@XMM[0] @ and input are preserved
|
||
+ bl AES_decrypt
|
||
+ vld1.8 {@XMM[0]}, [$fp,:64] @ load result
|
||
+ veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
|
||
+ vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
|
||
+ vst1.8 {@XMM[0]}, [$rounds] @ write output
|
||
+
|
||
+.Lcbc_dec_done:
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ vmov.i32 q0, #0
|
||
+ vmov.i32 q1, #0
|
||
+.Lcbc_dec_bzero: @ wipe key schedule [if any]
|
||
+ vstmia $keysched!, {q0-q1}
|
||
+ cmp $keysched, $fp
|
||
+ bne .Lcbc_dec_bzero
|
||
+#endif
|
||
+
|
||
+ mov sp, $fp
|
||
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
|
||
+ vst1.8 {@XMM[15]}, [$ivp] @ return IV
|
||
+ VFP_ABI_POP
|
||
+ ldmia sp!, {r4-r10, pc}
|
||
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
|
||
+___
|
||
+}
|
||
+{
|
||
+my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
|
||
+my $const = "r6"; # shared with _bsaes_encrypt8_alt
|
||
+my $keysched = "sp";
|
||
+
|
||
+$code.=<<___;
|
||
+.extern AES_encrypt
|
||
+.global bsaes_ctr32_encrypt_blocks
|
||
+.type bsaes_ctr32_encrypt_blocks,%function
|
||
+.align 5
|
||
+bsaes_ctr32_encrypt_blocks:
|
||
+ cmp $len, #8 @ use plain AES for
|
||
+ blo .Lctr_enc_short @ small sizes
|
||
+
|
||
+ mov ip, sp
|
||
+ stmdb sp!, {r4-r10, lr}
|
||
+ VFP_ABI_PUSH
|
||
+ ldr $ctr, [ip] @ ctr is 1st arg on the stack
|
||
+ sub sp, sp, #0x10 @ scratch space to carry over the ctr
|
||
+ mov $fp, sp @ save sp
|
||
+
|
||
+ ldr $rounds, [$key, #240] @ get # of rounds
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ @ allocate the key schedule on the stack
|
||
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
|
||
+ add r12, #`128-32` @ size of bit-sliced key schedule
|
||
+
|
||
+ @ populate the key schedule
|
||
+ mov r4, $key @ pass key
|
||
+ mov r5, $rounds @ pass # of rounds
|
||
+ mov sp, r12 @ sp is $keysched
|
||
+ bl _bsaes_key_convert
|
||
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
|
||
+ vstmia r12, {@XMM[7]} @ save last round key
|
||
+
|
||
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
|
||
+ add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
|
||
+ vldmia $keysched, {@XMM[4]} @ load round0 key
|
||
+#else
|
||
+ ldr r12, [$key, #244]
|
||
+ eors r12, #1
|
||
+ beq 0f
|
||
+
|
||
+ @ populate the key schedule
|
||
+ str r12, [$key, #244]
|
||
+ mov r4, $key @ pass key
|
||
+ mov r5, $rounds @ pass # of rounds
|
||
+ add r12, $key, #248 @ pass key schedule
|
||
+ bl _bsaes_key_convert
|
||
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
|
||
+ vstmia r12, {@XMM[7]} @ save last round key
|
||
+
|
||
+.align 2
|
||
+0: add r12, $key, #248
|
||
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
|
||
+ adrl $ctr, .LREVM0SR @ borrow $ctr
|
||
+ vldmia r12, {@XMM[4]} @ load round0 key
|
||
+ sub sp, #0x10 @ place for adjusted round0 key
|
||
+#endif
|
||
+
|
||
+ vmov.i32 @XMM[8],#1 @ compose 1<<96
|
||
+ veor @XMM[9],@XMM[9],@XMM[9]
|
||
+ vrev32.8 @XMM[0],@XMM[0]
|
||
+ vext.8 @XMM[8],@XMM[9],@XMM[8],#4
|
||
+ vrev32.8 @XMM[4],@XMM[4]
|
||
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
|
||
+ vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
|
||
+ b .Lctr_enc_loop
|
||
+
|
||
+.align 4
|
||
+.Lctr_enc_loop:
|
||
+ vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
|
||
+ vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
|
||
+ vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
|
||
+ vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
|
||
+ vadd.u32 @XMM[4], @XMM[1], @XMM[10]
|
||
+ vadd.u32 @XMM[5], @XMM[2], @XMM[10]
|
||
+ vadd.u32 @XMM[6], @XMM[3], @XMM[10]
|
||
+ vadd.u32 @XMM[7], @XMM[4], @XMM[10]
|
||
+ vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
|
||
+
|
||
+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
|
||
+ @ to flip byte order in 32-bit counter
|
||
+
|
||
+ vldmia $keysched, {@XMM[9]} @ load round0 key
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ add r4, $keysched, #0x10 @ pass next round key
|
||
+#else
|
||
+ add r4, $key, #`248+16`
|
||
+#endif
|
||
+ vldmia $ctr, {@XMM[8]} @ .LREVM0SR
|
||
+ mov r5, $rounds @ pass rounds
|
||
+ vstmia $fp, {@XMM[10]} @ save next counter
|
||
+ sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
|
||
+
|
||
+ bl _bsaes_encrypt8_alt
|
||
+
|
||
+ subs $len, $len, #8
|
||
+ blo .Lctr_enc_loop_done
|
||
+
|
||
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
|
||
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
|
||
+ veor @XMM[0], @XMM[8]
|
||
+ veor @XMM[1], @XMM[9]
|
||
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
|
||
+ veor @XMM[4], @XMM[10]
|
||
+ veor @XMM[6], @XMM[11]
|
||
+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
|
||
+ veor @XMM[3], @XMM[12]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
|
||
+ veor @XMM[7], @XMM[13]
|
||
+ veor @XMM[2], @XMM[14]
|
||
+ vst1.8 {@XMM[4]}, [$out]!
|
||
+ veor @XMM[5], @XMM[15]
|
||
+ vst1.8 {@XMM[6]}, [$out]!
|
||
+ vmov.i32 @XMM[8], #1 @ compose 1<<96
|
||
+ vst1.8 {@XMM[3]}, [$out]!
|
||
+ veor @XMM[9], @XMM[9], @XMM[9]
|
||
+ vst1.8 {@XMM[7]}, [$out]!
|
||
+ vext.8 @XMM[8], @XMM[9], @XMM[8], #4
|
||
+ vst1.8 {@XMM[2]}, [$out]!
|
||
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
|
||
+ vst1.8 {@XMM[5]}, [$out]!
|
||
+ vldmia $fp, {@XMM[0]} @ load counter
|
||
+
|
||
+ bne .Lctr_enc_loop
|
||
+ b .Lctr_enc_done
|
||
+
|
||
+.align 4
|
||
+.Lctr_enc_loop_done:
|
||
+ add $len, $len, #8
|
||
+ vld1.8 {@XMM[8]}, [$inp]! @ load input
|
||
+ veor @XMM[0], @XMM[8]
|
||
+ vst1.8 {@XMM[0]}, [$out]! @ write output
|
||
+ cmp $len, #2
|
||
+ blo .Lctr_enc_done
|
||
+ vld1.8 {@XMM[9]}, [$inp]!
|
||
+ veor @XMM[1], @XMM[9]
|
||
+ vst1.8 {@XMM[1]}, [$out]!
|
||
+ beq .Lctr_enc_done
|
||
+ vld1.8 {@XMM[10]}, [$inp]!
|
||
+ veor @XMM[4], @XMM[10]
|
||
+ vst1.8 {@XMM[4]}, [$out]!
|
||
+ cmp $len, #4
|
||
+ blo .Lctr_enc_done
|
||
+ vld1.8 {@XMM[11]}, [$inp]!
|
||
+ veor @XMM[6], @XMM[11]
|
||
+ vst1.8 {@XMM[6]}, [$out]!
|
||
+ beq .Lctr_enc_done
|
||
+ vld1.8 {@XMM[12]}, [$inp]!
|
||
+ veor @XMM[3], @XMM[12]
|
||
+ vst1.8 {@XMM[3]}, [$out]!
|
||
+ cmp $len, #6
|
||
+ blo .Lctr_enc_done
|
||
+ vld1.8 {@XMM[13]}, [$inp]!
|
||
+ veor @XMM[7], @XMM[13]
|
||
+ vst1.8 {@XMM[7]}, [$out]!
|
||
+ beq .Lctr_enc_done
|
||
+ vld1.8 {@XMM[14]}, [$inp]
|
||
+ veor @XMM[2], @XMM[14]
|
||
+ vst1.8 {@XMM[2]}, [$out]!
|
||
+
|
||
+.Lctr_enc_done:
|
||
+ vmov.i32 q0, #0
|
||
+ vmov.i32 q1, #0
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+.Lctr_enc_bzero: @ wipe key schedule [if any]
|
||
+ vstmia $keysched!, {q0-q1}
|
||
+ cmp $keysched, $fp
|
||
+ bne .Lctr_enc_bzero
|
||
+#else
|
||
+ vstmia $keysched, {q0-q1}
|
||
+#endif
|
||
+
|
||
+ mov sp, $fp
|
||
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
|
||
+ VFP_ABI_POP
|
||
+ ldmia sp!, {r4-r10, pc} @ return
|
||
+
|
||
+.align 4
|
||
+.Lctr_enc_short:
|
||
+ ldr ip, [sp] @ ctr pointer is passed on stack
|
||
+ stmdb sp!, {r4-r8, lr}
|
||
+
|
||
+ mov r4, $inp @ copy arguments
|
||
+ mov r5, $out
|
||
+ mov r6, $len
|
||
+ mov r7, $key
|
||
+ ldr r8, [ip, #12] @ load counter LSW
|
||
+ vld1.8 {@XMM[1]}, [ip] @ load whole counter value
|
||
+#ifdef __ARMEL__
|
||
+ rev r8, r8
|
||
+#endif
|
||
+ sub sp, sp, #0x10
|
||
+ vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
|
||
+ sub sp, sp, #0x10
|
||
+
|
||
+.Lctr_enc_short_loop:
|
||
+ add r0, sp, #0x10 @ input counter value
|
||
+ mov r1, sp @ output on the stack
|
||
+ mov r2, r7 @ key
|
||
+
|
||
+ bl AES_encrypt
|
||
+
|
||
+ vld1.8 {@XMM[0]}, [r4]! @ load input
|
||
+ vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
|
||
+ add r8, r8, #1
|
||
+#ifdef __ARMEL__
|
||
+ rev r0, r8
|
||
+ str r0, [sp, #0x1c] @ next counter value
|
||
+#else
|
||
+ str r8, [sp, #0x1c] @ next counter value
|
||
+#endif
|
||
+ veor @XMM[0],@XMM[0],@XMM[1]
|
||
+ vst1.8 {@XMM[0]}, [r5]! @ store output
|
||
+ subs r6, r6, #1
|
||
+ bne .Lctr_enc_short_loop
|
||
+
|
||
+ vmov.i32 q0, #0
|
||
+ vmov.i32 q1, #0
|
||
+ vstmia sp!, {q0-q1}
|
||
+
|
||
+ ldmia sp!, {r4-r8, pc}
|
||
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
|
||
+___
|
||
+}
|
||
+{
|
||
+######################################################################
|
||
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
|
||
+# const AES_KEY *key1, const AES_KEY *key2,
|
||
+# const unsigned char iv[16]);
|
||
+#
|
||
+my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
|
||
+my $const="r6"; # returned by _bsaes_key_convert
|
||
+my $twmask=@XMM[5];
|
||
+my @T=@XMM[6..7];
|
||
+
|
||
+$code.=<<___;
|
||
+.globl bsaes_xts_encrypt
|
||
+.type bsaes_xts_encrypt,%function
|
||
+.align 4
|
||
+bsaes_xts_encrypt:
|
||
+ mov ip, sp
|
||
+ stmdb sp!, {r4-r10, lr} @ 0x20
|
||
+ VFP_ABI_PUSH
|
||
+ mov r6, sp @ future $fp
|
||
+
|
||
+ mov $inp, r0
|
||
+ mov $out, r1
|
||
+ mov $len, r2
|
||
+ mov $key, r3
|
||
+
|
||
+ sub r0, sp, #0x10 @ 0x10
|
||
+ bic r0, #0xf @ align at 16 bytes
|
||
+ mov sp, r0
|
||
+
|
||
+#ifdef XTS_CHAIN_TWEAK
|
||
+ ldr r0, [ip] @ pointer to input tweak
|
||
+#else
|
||
+ @ generate initial tweak
|
||
+ ldr r0, [ip, #4] @ iv[]
|
||
+ mov r1, sp
|
||
+ ldr r2, [ip, #0] @ key2
|
||
+ bl AES_encrypt
|
||
+ mov r0,sp @ pointer to initial tweak
|
||
+#endif
|
||
+
|
||
+ ldr $rounds, [$key, #240] @ get # of rounds
|
||
+ mov $fp, r6
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ @ allocate the key schedule on the stack
|
||
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
|
||
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
|
||
+ sub r12, #`32+16` @ place for tweak[9]
|
||
+
|
||
+ @ populate the key schedule
|
||
+ mov r4, $key @ pass key
|
||
+ mov r5, $rounds @ pass # of rounds
|
||
+ mov sp, r12
|
||
+ add r12, #0x90 @ pass key schedule
|
||
+ bl _bsaes_key_convert
|
||
+ veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
|
||
+ vstmia r12, {@XMM[7]} @ save last round key
|
||
+#else
|
||
+ ldr r12, [$key, #244]
|
||
+ eors r12, #1
|
||
+ beq 0f
|
||
+
|
||
+ str r12, [$key, #244]
|
||
+ mov r4, $key @ pass key
|
||
+ mov r5, $rounds @ pass # of rounds
|
||
+ add r12, $key, #248 @ pass key schedule
|
||
+ bl _bsaes_key_convert
|
||
+ veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
|
||
+ vstmia r12, {@XMM[7]}
|
||
+
|
||
+.align 2
|
||
+0: sub sp, #0x90 @ place for tweak[9]
|
||
+#endif
|
||
+
|
||
+ vld1.8 {@XMM[8]}, [r0] @ initial tweak
|
||
+ adr $magic, .Lxts_magic
|
||
+
|
||
+ subs $len, #0x80
|
||
+ blo .Lxts_enc_short
|
||
+ b .Lxts_enc_loop
|
||
+
|
||
+.align 4
|
||
+.Lxts_enc_loop:
|
||
+ vldmia $magic, {$twmask} @ load XTS magic
|
||
+ vshr.s64 @T[0], @XMM[8], #63
|
||
+ mov r0, sp
|
||
+ vand @T[0], @T[0], $twmask
|
||
+___
|
||
+for($i=9;$i<16;$i++) {
|
||
+$code.=<<___;
|
||
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
|
||
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
|
||
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
|
||
+ vshr.s64 @T[1], @XMM[$i], #63
|
||
+ veor @XMM[$i], @XMM[$i], @T[0]
|
||
+ vand @T[1], @T[1], $twmask
|
||
+___
|
||
+ @T=reverse(@T);
|
||
+
|
||
+$code.=<<___ if ($i>=10);
|
||
+ vld1.8 {@XMM[$i-10]}, [$inp]!
|
||
+___
|
||
+$code.=<<___ if ($i>=11);
|
||
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
|
||
+___
|
||
+}
|
||
+$code.=<<___;
|
||
+ vadd.u64 @XMM[8], @XMM[15], @XMM[15]
|
||
+ vst1.64 {@XMM[15]}, [r0,:128]!
|
||
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
|
||
+ veor @XMM[8], @XMM[8], @T[0]
|
||
+ vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
|
||
+ veor @XMM[5], @XMM[5], @XMM[13]
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ add r4, sp, #0x90 @ pass key schedule
|
||
+#else
|
||
+ add r4, $key, #248 @ pass key schedule
|
||
+#endif
|
||
+ veor @XMM[6], @XMM[6], @XMM[14]
|
||
+ mov r5, $rounds @ pass rounds
|
||
+ veor @XMM[7], @XMM[7], @XMM[15]
|
||
+ mov r0, sp
|
||
+
|
||
+ bl _bsaes_encrypt8
|
||
+
|
||
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
|
||
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
|
||
+ veor @XMM[0], @XMM[0], @XMM[ 8]
|
||
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
|
||
+ veor @XMM[1], @XMM[1], @XMM[ 9]
|
||
+ veor @XMM[8], @XMM[4], @XMM[10]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
|
||
+ veor @XMM[9], @XMM[6], @XMM[11]
|
||
+ vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
|
||
+ veor @XMM[10], @XMM[3], @XMM[12]
|
||
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
|
||
+ veor @XMM[11], @XMM[7], @XMM[13]
|
||
+ veor @XMM[12], @XMM[2], @XMM[14]
|
||
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
|
||
+ veor @XMM[13], @XMM[5], @XMM[15]
|
||
+ vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
|
||
+
|
||
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ subs $len, #0x80
|
||
+ bpl .Lxts_enc_loop
|
||
+
|
||
+.Lxts_enc_short:
|
||
+ adds $len, #0x70
|
||
+ bmi .Lxts_enc_done
|
||
+
|
||
+ vldmia $magic, {$twmask} @ load XTS magic
|
||
+ vshr.s64 @T[0], @XMM[8], #63
|
||
+ mov r0, sp
|
||
+ vand @T[0], @T[0], $twmask
|
||
+___
|
||
+for($i=9;$i<16;$i++) {
|
||
+$code.=<<___;
|
||
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
|
||
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
|
||
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
|
||
+ vshr.s64 @T[1], @XMM[$i], #63
|
||
+ veor @XMM[$i], @XMM[$i], @T[0]
|
||
+ vand @T[1], @T[1], $twmask
|
||
+___
|
||
+ @T=reverse(@T);
|
||
+
|
||
+$code.=<<___ if ($i>=10);
|
||
+ vld1.8 {@XMM[$i-10]}, [$inp]!
|
||
+ subs $len, #0x10
|
||
+ bmi .Lxts_enc_`$i-9`
|
||
+___
|
||
+$code.=<<___ if ($i>=11);
|
||
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
|
||
+___
|
||
+}
|
||
+$code.=<<___;
|
||
+ sub $len, #0x10
|
||
+ vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ vld1.8 {@XMM[6]}, [$inp]!
|
||
+ veor @XMM[5], @XMM[5], @XMM[13]
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ add r4, sp, #0x90 @ pass key schedule
|
||
+#else
|
||
+ add r4, $key, #248 @ pass key schedule
|
||
+#endif
|
||
+ veor @XMM[6], @XMM[6], @XMM[14]
|
||
+ mov r5, $rounds @ pass rounds
|
||
+ mov r0, sp
|
||
+
|
||
+ bl _bsaes_encrypt8
|
||
+
|
||
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
|
||
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
|
||
+ veor @XMM[0], @XMM[0], @XMM[ 8]
|
||
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
|
||
+ veor @XMM[1], @XMM[1], @XMM[ 9]
|
||
+ veor @XMM[8], @XMM[4], @XMM[10]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
|
||
+ veor @XMM[9], @XMM[6], @XMM[11]
|
||
+ vld1.64 {@XMM[14]}, [r0,:128]!
|
||
+ veor @XMM[10], @XMM[3], @XMM[12]
|
||
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
|
||
+ veor @XMM[11], @XMM[7], @XMM[13]
|
||
+ veor @XMM[12], @XMM[2], @XMM[14]
|
||
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
|
||
+ vst1.8 {@XMM[12]}, [$out]!
|
||
+
|
||
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+ b .Lxts_enc_done
|
||
+.align 4
|
||
+.Lxts_enc_6:
|
||
+ vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ veor @XMM[4], @XMM[4], @XMM[12]
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ add r4, sp, #0x90 @ pass key schedule
|
||
+#else
|
||
+ add r4, $key, #248 @ pass key schedule
|
||
+#endif
|
||
+ veor @XMM[5], @XMM[5], @XMM[13]
|
||
+ mov r5, $rounds @ pass rounds
|
||
+ mov r0, sp
|
||
+
|
||
+ bl _bsaes_encrypt8
|
||
+
|
||
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
|
||
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
|
||
+ veor @XMM[0], @XMM[0], @XMM[ 8]
|
||
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
|
||
+ veor @XMM[1], @XMM[1], @XMM[ 9]
|
||
+ veor @XMM[8], @XMM[4], @XMM[10]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
|
||
+ veor @XMM[9], @XMM[6], @XMM[11]
|
||
+ veor @XMM[10], @XMM[3], @XMM[12]
|
||
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
|
||
+ veor @XMM[11], @XMM[7], @XMM[13]
|
||
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
|
||
+
|
||
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+ b .Lxts_enc_done
|
||
+
|
||
+@ put this in range for both ARM and Thumb mode adr instructions
|
||
+.align 5
|
||
+.Lxts_magic:
|
||
+ .quad 1, 0x87
|
||
+
|
||
+.align 5
|
||
+.Lxts_enc_5:
|
||
+ vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ veor @XMM[3], @XMM[3], @XMM[11]
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ add r4, sp, #0x90 @ pass key schedule
|
||
+#else
|
||
+ add r4, $key, #248 @ pass key schedule
|
||
+#endif
|
||
+ veor @XMM[4], @XMM[4], @XMM[12]
|
||
+ mov r5, $rounds @ pass rounds
|
||
+ mov r0, sp
|
||
+
|
||
+ bl _bsaes_encrypt8
|
||
+
|
||
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
|
||
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
|
||
+ veor @XMM[0], @XMM[0], @XMM[ 8]
|
||
+ vld1.64 {@XMM[12]}, [r0,:128]!
|
||
+ veor @XMM[1], @XMM[1], @XMM[ 9]
|
||
+ veor @XMM[8], @XMM[4], @XMM[10]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
|
||
+ veor @XMM[9], @XMM[6], @XMM[11]
|
||
+ veor @XMM[10], @XMM[3], @XMM[12]
|
||
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
|
||
+ vst1.8 {@XMM[10]}, [$out]!
|
||
+
|
||
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+ b .Lxts_enc_done
|
||
+.align 4
|
||
+.Lxts_enc_4:
|
||
+ vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ veor @XMM[2], @XMM[2], @XMM[10]
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ add r4, sp, #0x90 @ pass key schedule
|
||
+#else
|
||
+ add r4, $key, #248 @ pass key schedule
|
||
+#endif
|
||
+ veor @XMM[3], @XMM[3], @XMM[11]
|
||
+ mov r5, $rounds @ pass rounds
|
||
+ mov r0, sp
|
||
+
|
||
+ bl _bsaes_encrypt8
|
||
+
|
||
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
|
||
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
|
||
+ veor @XMM[0], @XMM[0], @XMM[ 8]
|
||
+ veor @XMM[1], @XMM[1], @XMM[ 9]
|
||
+ veor @XMM[8], @XMM[4], @XMM[10]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
|
||
+ veor @XMM[9], @XMM[6], @XMM[11]
|
||
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
|
||
+
|
||
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+ b .Lxts_enc_done
|
||
+.align 4
|
||
+.Lxts_enc_3:
|
||
+ vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ veor @XMM[1], @XMM[1], @XMM[9]
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ add r4, sp, #0x90 @ pass key schedule
|
||
+#else
|
||
+ add r4, $key, #248 @ pass key schedule
|
||
+#endif
|
||
+ veor @XMM[2], @XMM[2], @XMM[10]
|
||
+ mov r5, $rounds @ pass rounds
|
||
+ mov r0, sp
|
||
+
|
||
+ bl _bsaes_encrypt8
|
||
+
|
||
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
|
||
+ vld1.64 {@XMM[10]}, [r0,:128]!
|
||
+ veor @XMM[0], @XMM[0], @XMM[ 8]
|
||
+ veor @XMM[1], @XMM[1], @XMM[ 9]
|
||
+ veor @XMM[8], @XMM[4], @XMM[10]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
|
||
+ vst1.8 {@XMM[8]}, [$out]!
|
||
+
|
||
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+ b .Lxts_enc_done
|
||
+.align 4
|
||
+.Lxts_enc_2:
|
||
+ vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ veor @XMM[0], @XMM[0], @XMM[8]
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ add r4, sp, #0x90 @ pass key schedule
|
||
+#else
|
||
+ add r4, $key, #248 @ pass key schedule
|
||
+#endif
|
||
+ veor @XMM[1], @XMM[1], @XMM[9]
|
||
+ mov r5, $rounds @ pass rounds
|
||
+ mov r0, sp
|
||
+
|
||
+ bl _bsaes_encrypt8
|
||
+
|
||
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
|
||
+ veor @XMM[0], @XMM[0], @XMM[ 8]
|
||
+ veor @XMM[1], @XMM[1], @XMM[ 9]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
|
||
+
|
||
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+ b .Lxts_enc_done
|
||
+.align 4
|
||
+.Lxts_enc_1:
|
||
+ mov r0, sp
|
||
+ veor @XMM[0], @XMM[8]
|
||
+ mov r1, sp
|
||
+ vst1.8 {@XMM[0]}, [sp,:128]
|
||
+ mov r2, $key
|
||
+ mov r4, $fp @ preserve fp
|
||
+
|
||
+ bl AES_encrypt
|
||
+
|
||
+ vld1.8 {@XMM[0]}, [sp,:128]
|
||
+ veor @XMM[0], @XMM[0], @XMM[8]
|
||
+ vst1.8 {@XMM[0]}, [$out]!
|
||
+ mov $fp, r4
|
||
+
|
||
+ vmov @XMM[8], @XMM[9] @ next round tweak
|
||
+
|
||
+.Lxts_enc_done:
|
||
+#ifndef XTS_CHAIN_TWEAK
|
||
+ adds $len, #0x10
|
||
+ beq .Lxts_enc_ret
|
||
+ sub r6, $out, #0x10
|
||
+
|
||
+.Lxts_enc_steal:
|
||
+ ldrb r0, [$inp], #1
|
||
+ ldrb r1, [$out, #-0x10]
|
||
+ strb r0, [$out, #-0x10]
|
||
+ strb r1, [$out], #1
|
||
+
|
||
+ subs $len, #1
|
||
+ bhi .Lxts_enc_steal
|
||
+
|
||
+ vld1.8 {@XMM[0]}, [r6]
|
||
+ mov r0, sp
|
||
+ veor @XMM[0], @XMM[0], @XMM[8]
|
||
+ mov r1, sp
|
||
+ vst1.8 {@XMM[0]}, [sp,:128]
|
||
+ mov r2, $key
|
||
+ mov r4, $fp @ preserve fp
|
||
+
|
||
+ bl AES_encrypt
|
||
+
|
||
+ vld1.8 {@XMM[0]}, [sp,:128]
|
||
+ veor @XMM[0], @XMM[0], @XMM[8]
|
||
+ vst1.8 {@XMM[0]}, [r6]
|
||
+ mov $fp, r4
|
||
+#endif
|
||
+
|
||
+.Lxts_enc_ret:
|
||
+ bic r0, $fp, #0xf
|
||
+ vmov.i32 q0, #0
|
||
+ vmov.i32 q1, #0
|
||
+#ifdef XTS_CHAIN_TWEAK
|
||
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
|
||
+#endif
|
||
+.Lxts_enc_bzero: @ wipe key schedule [if any]
|
||
+ vstmia sp!, {q0-q1}
|
||
+ cmp sp, r0
|
||
+ bne .Lxts_enc_bzero
|
||
+
|
||
+ mov sp, $fp
|
||
+#ifdef XTS_CHAIN_TWEAK
|
||
+ vst1.8 {@XMM[8]}, [r1]
|
||
+#endif
|
||
+ VFP_ABI_POP
|
||
+ ldmia sp!, {r4-r10, pc} @ return
|
||
+
|
||
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
|
||
+
|
||
+.globl bsaes_xts_decrypt
|
||
+.type bsaes_xts_decrypt,%function
|
||
+.align 4
|
||
+bsaes_xts_decrypt:
|
||
+ mov ip, sp
|
||
+ stmdb sp!, {r4-r10, lr} @ 0x20
|
||
+ VFP_ABI_PUSH
|
||
+ mov r6, sp @ future $fp
|
||
+
|
||
+ mov $inp, r0
|
||
+ mov $out, r1
|
||
+ mov $len, r2
|
||
+ mov $key, r3
|
||
+
|
||
+ sub r0, sp, #0x10 @ 0x10
|
||
+ bic r0, #0xf @ align at 16 bytes
|
||
+ mov sp, r0
|
||
+
|
||
+#ifdef XTS_CHAIN_TWEAK
|
||
+ ldr r0, [ip] @ pointer to input tweak
|
||
+#else
|
||
+ @ generate initial tweak
|
||
+ ldr r0, [ip, #4] @ iv[]
|
||
+ mov r1, sp
|
||
+ ldr r2, [ip, #0] @ key2
|
||
+ bl AES_encrypt
|
||
+ mov r0, sp @ pointer to initial tweak
|
||
+#endif
|
||
+
|
||
+ ldr $rounds, [$key, #240] @ get # of rounds
|
||
+ mov $fp, r6
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ @ allocate the key schedule on the stack
|
||
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
|
||
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
|
||
+ sub r12, #`32+16` @ place for tweak[9]
|
||
+
|
||
+ @ populate the key schedule
|
||
+ mov r4, $key @ pass key
|
||
+ mov r5, $rounds @ pass # of rounds
|
||
+ mov sp, r12
|
||
+ add r12, #0x90 @ pass key schedule
|
||
+ bl _bsaes_key_convert
|
||
+ add r4, sp, #0x90
|
||
+ vldmia r4, {@XMM[6]}
|
||
+ vstmia r12, {@XMM[15]} @ save last round key
|
||
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
|
||
+ vstmia r4, {@XMM[7]}
|
||
+#else
|
||
+ ldr r12, [$key, #244]
|
||
+ eors r12, #1
|
||
+ beq 0f
|
||
+
|
||
+ str r12, [$key, #244]
|
||
+ mov r4, $key @ pass key
|
||
+ mov r5, $rounds @ pass # of rounds
|
||
+ add r12, $key, #248 @ pass key schedule
|
||
+ bl _bsaes_key_convert
|
||
+ add r4, $key, #248
|
||
+ vldmia r4, {@XMM[6]}
|
||
+ vstmia r12, {@XMM[15]} @ save last round key
|
||
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
|
||
+ vstmia r4, {@XMM[7]}
|
||
+
|
||
+.align 2
|
||
+0: sub sp, #0x90 @ place for tweak[9]
|
||
+#endif
|
||
+ vld1.8 {@XMM[8]}, [r0] @ initial tweak
|
||
+ adr $magic, .Lxts_magic
|
||
+
|
||
+ tst $len, #0xf @ if not multiple of 16
|
||
+ it ne @ Thumb2 thing, sanity check in ARM
|
||
+ subne $len, #0x10 @ subtract another 16 bytes
|
||
+ subs $len, #0x80
|
||
+
|
||
+ blo .Lxts_dec_short
|
||
+ b .Lxts_dec_loop
|
||
+
|
||
+.align 4
|
||
+.Lxts_dec_loop:
|
||
+ vldmia $magic, {$twmask} @ load XTS magic
|
||
+ vshr.s64 @T[0], @XMM[8], #63
|
||
+ mov r0, sp
|
||
+ vand @T[0], @T[0], $twmask
|
||
+___
|
||
+for($i=9;$i<16;$i++) {
|
||
+$code.=<<___;
|
||
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
|
||
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
|
||
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
|
||
+ vshr.s64 @T[1], @XMM[$i], #63
|
||
+ veor @XMM[$i], @XMM[$i], @T[0]
|
||
+ vand @T[1], @T[1], $twmask
|
||
+___
|
||
+ @T=reverse(@T);
|
||
+
|
||
+$code.=<<___ if ($i>=10);
|
||
+ vld1.8 {@XMM[$i-10]}, [$inp]!
|
||
+___
|
||
+$code.=<<___ if ($i>=11);
|
||
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
|
||
+___
|
||
+}
|
||
+$code.=<<___;
|
||
+ vadd.u64 @XMM[8], @XMM[15], @XMM[15]
|
||
+ vst1.64 {@XMM[15]}, [r0,:128]!
|
||
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
|
||
+ veor @XMM[8], @XMM[8], @T[0]
|
||
+ vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
|
||
+ veor @XMM[5], @XMM[5], @XMM[13]
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ add r4, sp, #0x90 @ pass key schedule
|
||
+#else
|
||
+ add r4, $key, #248 @ pass key schedule
|
||
+#endif
|
||
+ veor @XMM[6], @XMM[6], @XMM[14]
|
||
+ mov r5, $rounds @ pass rounds
|
||
+ veor @XMM[7], @XMM[7], @XMM[15]
|
||
+ mov r0, sp
|
||
+
|
||
+ bl _bsaes_decrypt8
|
||
+
|
||
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
|
||
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
|
||
+ veor @XMM[0], @XMM[0], @XMM[ 8]
|
||
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
|
||
+ veor @XMM[1], @XMM[1], @XMM[ 9]
|
||
+ veor @XMM[8], @XMM[6], @XMM[10]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
|
||
+ veor @XMM[9], @XMM[4], @XMM[11]
|
||
+ vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
|
||
+ veor @XMM[10], @XMM[2], @XMM[12]
|
||
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
|
||
+ veor @XMM[11], @XMM[7], @XMM[13]
|
||
+ veor @XMM[12], @XMM[3], @XMM[14]
|
||
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
|
||
+ veor @XMM[13], @XMM[5], @XMM[15]
|
||
+ vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
|
||
+
|
||
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ subs $len, #0x80
|
||
+ bpl .Lxts_dec_loop
|
||
+
|
||
+.Lxts_dec_short:
|
||
+ adds $len, #0x70
|
||
+ bmi .Lxts_dec_done
|
||
+
|
||
+ vldmia $magic, {$twmask} @ load XTS magic
|
||
+ vshr.s64 @T[0], @XMM[8], #63
|
||
+ mov r0, sp
|
||
+ vand @T[0], @T[0], $twmask
|
||
+___
|
||
+for($i=9;$i<16;$i++) {
|
||
+$code.=<<___;
|
||
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
|
||
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
|
||
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
|
||
+ vshr.s64 @T[1], @XMM[$i], #63
|
||
+ veor @XMM[$i], @XMM[$i], @T[0]
|
||
+ vand @T[1], @T[1], $twmask
|
||
+___
|
||
+ @T=reverse(@T);
|
||
+
|
||
+$code.=<<___ if ($i>=10);
|
||
+ vld1.8 {@XMM[$i-10]}, [$inp]!
|
||
+ subs $len, #0x10
|
||
+ bmi .Lxts_dec_`$i-9`
|
||
+___
|
||
+$code.=<<___ if ($i>=11);
|
||
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
|
||
+___
|
||
+}
|
||
+$code.=<<___;
|
||
+ sub $len, #0x10
|
||
+ vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ vld1.8 {@XMM[6]}, [$inp]!
|
||
+ veor @XMM[5], @XMM[5], @XMM[13]
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ add r4, sp, #0x90 @ pass key schedule
|
||
+#else
|
||
+ add r4, $key, #248 @ pass key schedule
|
||
+#endif
|
||
+ veor @XMM[6], @XMM[6], @XMM[14]
|
||
+ mov r5, $rounds @ pass rounds
|
||
+ mov r0, sp
|
||
+
|
||
+ bl _bsaes_decrypt8
|
||
+
|
||
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
|
||
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
|
||
+ veor @XMM[0], @XMM[0], @XMM[ 8]
|
||
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
|
||
+ veor @XMM[1], @XMM[1], @XMM[ 9]
|
||
+ veor @XMM[8], @XMM[6], @XMM[10]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
|
||
+ veor @XMM[9], @XMM[4], @XMM[11]
|
||
+ vld1.64 {@XMM[14]}, [r0,:128]!
|
||
+ veor @XMM[10], @XMM[2], @XMM[12]
|
||
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
|
||
+ veor @XMM[11], @XMM[7], @XMM[13]
|
||
+ veor @XMM[12], @XMM[3], @XMM[14]
|
||
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
|
||
+ vst1.8 {@XMM[12]}, [$out]!
|
||
+
|
||
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+ b .Lxts_dec_done
|
||
+.align 4
|
||
+.Lxts_dec_6:
|
||
+ vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ veor @XMM[4], @XMM[4], @XMM[12]
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ add r4, sp, #0x90 @ pass key schedule
|
||
+#else
|
||
+ add r4, $key, #248 @ pass key schedule
|
||
+#endif
|
||
+ veor @XMM[5], @XMM[5], @XMM[13]
|
||
+ mov r5, $rounds @ pass rounds
|
||
+ mov r0, sp
|
||
+
|
||
+ bl _bsaes_decrypt8
|
||
+
|
||
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
|
||
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
|
||
+ veor @XMM[0], @XMM[0], @XMM[ 8]
|
||
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
|
||
+ veor @XMM[1], @XMM[1], @XMM[ 9]
|
||
+ veor @XMM[8], @XMM[6], @XMM[10]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
|
||
+ veor @XMM[9], @XMM[4], @XMM[11]
|
||
+ veor @XMM[10], @XMM[2], @XMM[12]
|
||
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
|
||
+ veor @XMM[11], @XMM[7], @XMM[13]
|
||
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
|
||
+
|
||
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+ b .Lxts_dec_done
|
||
+.align 4
|
||
+.Lxts_dec_5:
|
||
+ vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ veor @XMM[3], @XMM[3], @XMM[11]
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ add r4, sp, #0x90 @ pass key schedule
|
||
+#else
|
||
+ add r4, $key, #248 @ pass key schedule
|
||
+#endif
|
||
+ veor @XMM[4], @XMM[4], @XMM[12]
|
||
+ mov r5, $rounds @ pass rounds
|
||
+ mov r0, sp
|
||
+
|
||
+ bl _bsaes_decrypt8
|
||
+
|
||
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
|
||
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
|
||
+ veor @XMM[0], @XMM[0], @XMM[ 8]
|
||
+ vld1.64 {@XMM[12]}, [r0,:128]!
|
||
+ veor @XMM[1], @XMM[1], @XMM[ 9]
|
||
+ veor @XMM[8], @XMM[6], @XMM[10]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
|
||
+ veor @XMM[9], @XMM[4], @XMM[11]
|
||
+ veor @XMM[10], @XMM[2], @XMM[12]
|
||
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
|
||
+ vst1.8 {@XMM[10]}, [$out]!
|
||
+
|
||
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+ b .Lxts_dec_done
|
||
+.align 4
|
||
+.Lxts_dec_4:
|
||
+ vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ veor @XMM[2], @XMM[2], @XMM[10]
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ add r4, sp, #0x90 @ pass key schedule
|
||
+#else
|
||
+ add r4, $key, #248 @ pass key schedule
|
||
+#endif
|
||
+ veor @XMM[3], @XMM[3], @XMM[11]
|
||
+ mov r5, $rounds @ pass rounds
|
||
+ mov r0, sp
|
||
+
|
||
+ bl _bsaes_decrypt8
|
||
+
|
||
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
|
||
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
|
||
+ veor @XMM[0], @XMM[0], @XMM[ 8]
|
||
+ veor @XMM[1], @XMM[1], @XMM[ 9]
|
||
+ veor @XMM[8], @XMM[6], @XMM[10]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
|
||
+ veor @XMM[9], @XMM[4], @XMM[11]
|
||
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
|
||
+
|
||
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+ b .Lxts_dec_done
|
||
+.align 4
|
||
+.Lxts_dec_3:
|
||
+ vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ veor @XMM[1], @XMM[1], @XMM[9]
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ add r4, sp, #0x90 @ pass key schedule
|
||
+#else
|
||
+ add r4, $key, #248 @ pass key schedule
|
||
+#endif
|
||
+ veor @XMM[2], @XMM[2], @XMM[10]
|
||
+ mov r5, $rounds @ pass rounds
|
||
+ mov r0, sp
|
||
+
|
||
+ bl _bsaes_decrypt8
|
||
+
|
||
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
|
||
+ vld1.64 {@XMM[10]}, [r0,:128]!
|
||
+ veor @XMM[0], @XMM[0], @XMM[ 8]
|
||
+ veor @XMM[1], @XMM[1], @XMM[ 9]
|
||
+ veor @XMM[8], @XMM[6], @XMM[10]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
|
||
+ vst1.8 {@XMM[8]}, [$out]!
|
||
+
|
||
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+ b .Lxts_dec_done
|
||
+.align 4
|
||
+.Lxts_dec_2:
|
||
+ vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
|
||
+
|
||
+ veor @XMM[0], @XMM[0], @XMM[8]
|
||
+#ifndef BSAES_ASM_EXTENDED_KEY
|
||
+ add r4, sp, #0x90 @ pass key schedule
|
||
+#else
|
||
+ add r4, $key, #248 @ pass key schedule
|
||
+#endif
|
||
+ veor @XMM[1], @XMM[1], @XMM[9]
|
||
+ mov r5, $rounds @ pass rounds
|
||
+ mov r0, sp
|
||
+
|
||
+ bl _bsaes_decrypt8
|
||
+
|
||
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
|
||
+ veor @XMM[0], @XMM[0], @XMM[ 8]
|
||
+ veor @XMM[1], @XMM[1], @XMM[ 9]
|
||
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
|
||
+
|
||
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
|
||
+ b .Lxts_dec_done
|
||
+.align 4
|
||
+.Lxts_dec_1:
|
||
+ mov r0, sp
|
||
+ veor @XMM[0], @XMM[8]
|
||
+ mov r1, sp
|
||
+ vst1.8 {@XMM[0]}, [sp,:128]
|
||
+ mov r2, $key
|
||
+ mov r4, $fp @ preserve fp
|
||
+ mov r5, $magic @ preserve magic
|
||
+
|
||
+ bl AES_decrypt
|
||
+
|
||
+ vld1.8 {@XMM[0]}, [sp,:128]
|
||
+ veor @XMM[0], @XMM[0], @XMM[8]
|
||
+ vst1.8 {@XMM[0]}, [$out]!
|
||
+ mov $fp, r4
|
||
+ mov $magic, r5
|
||
+
|
||
+ vmov @XMM[8], @XMM[9] @ next round tweak
|
||
+
|
||
+.Lxts_dec_done:
|
||
+#ifndef XTS_CHAIN_TWEAK
|
||
+ adds $len, #0x10
|
||
+ beq .Lxts_dec_ret
|
||
+
|
||
+ @ calculate one round of extra tweak for the stolen ciphertext
|
||
+ vldmia $magic, {$twmask}
|
||
+ vshr.s64 @XMM[6], @XMM[8], #63
|
||
+ vand @XMM[6], @XMM[6], $twmask
|
||
+ vadd.u64 @XMM[9], @XMM[8], @XMM[8]
|
||
+ vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
|
||
+ veor @XMM[9], @XMM[9], @XMM[6]
|
||
+
|
||
+ @ perform the final decryption with the last tweak value
|
||
+ vld1.8 {@XMM[0]}, [$inp]!
|
||
+ mov r0, sp
|
||
+ veor @XMM[0], @XMM[0], @XMM[9]
|
||
+ mov r1, sp
|
||
+ vst1.8 {@XMM[0]}, [sp,:128]
|
||
+ mov r2, $key
|
||
+ mov r4, $fp @ preserve fp
|
||
+
|
||
+ bl AES_decrypt
|
||
+
|
||
+ vld1.8 {@XMM[0]}, [sp,:128]
|
||
+ veor @XMM[0], @XMM[0], @XMM[9]
|
||
+ vst1.8 {@XMM[0]}, [$out]
|
||
+
|
||
+ mov r6, $out
|
||
+.Lxts_dec_steal:
|
||
+ ldrb r1, [$out]
|
||
+ ldrb r0, [$inp], #1
|
||
+ strb r1, [$out, #0x10]
|
||
+ strb r0, [$out], #1
|
||
+
|
||
+ subs $len, #1
|
||
+ bhi .Lxts_dec_steal
|
||
+
|
||
+ vld1.8 {@XMM[0]}, [r6]
|
||
+ mov r0, sp
|
||
+ veor @XMM[0], @XMM[8]
|
||
+ mov r1, sp
|
||
+ vst1.8 {@XMM[0]}, [sp,:128]
|
||
+ mov r2, $key
|
||
+
|
||
+ bl AES_decrypt
|
||
+
|
||
+ vld1.8 {@XMM[0]}, [sp,:128]
|
||
+ veor @XMM[0], @XMM[0], @XMM[8]
|
||
+ vst1.8 {@XMM[0]}, [r6]
|
||
+ mov $fp, r4
|
||
+#endif
|
||
+
|
||
+.Lxts_dec_ret:
|
||
+ bic r0, $fp, #0xf
|
||
+ vmov.i32 q0, #0
|
||
+ vmov.i32 q1, #0
|
||
+#ifdef XTS_CHAIN_TWEAK
|
||
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
|
||
+#endif
|
||
+.Lxts_dec_bzero: @ wipe key schedule [if any]
|
||
+ vstmia sp!, {q0-q1}
|
||
+ cmp sp, r0
|
||
+ bne .Lxts_dec_bzero
|
||
+
|
||
+ mov sp, $fp
|
||
+#ifdef XTS_CHAIN_TWEAK
|
||
+ vst1.8 {@XMM[8]}, [r1]
|
||
+#endif
|
||
+ VFP_ABI_POP
|
||
+ ldmia sp!, {r4-r10, pc} @ return
|
||
+
|
||
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
|
||
+___
|
||
+}
|
||
+$code.=<<___;
|
||
+#endif
|
||
+___
|
||
+
|
||
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
||
+
|
||
+open SELF,$0;
|
||
+while(<SELF>) {
|
||
+ next if (/^#!/);
|
||
+ last if (!s/^#/@/ and !/^$/);
|
||
+ print;
|
||
+}
|
||
+close SELF;
|
||
+
|
||
+print $code;
|
||
+
|
||
+close STDOUT;
|
||
diff --git a/crypto/arm64cpuid.S b/crypto/arm64cpuid.S
|
||
new file mode 100644
|
||
index 0000000..4778ac1
|
||
--- /dev/null
|
||
+++ b/crypto/arm64cpuid.S
|
||
@@ -0,0 +1,46 @@
|
||
+#include "arm_arch.h"
|
||
+
|
||
+.text
|
||
+.arch armv8-a+crypto
|
||
+
|
||
+.align 5
|
||
+.global _armv7_neon_probe
|
||
+.type _armv7_neon_probe,%function
|
||
+_armv7_neon_probe:
|
||
+ orr v15.16b, v15.16b, v15.16b
|
||
+ ret
|
||
+.size _armv7_neon_probe,.-_armv7_neon_probe
|
||
+
|
||
+.global _armv7_tick
|
||
+.type _armv7_tick,%function
|
||
+_armv7_tick:
|
||
+ mrs x0, CNTVCT_EL0
|
||
+ ret
|
||
+.size _armv7_tick,.-_armv7_tick
|
||
+
|
||
+.global _armv8_aes_probe
|
||
+.type _armv8_aes_probe,%function
|
||
+_armv8_aes_probe:
|
||
+ aese v0.16b, v0.16b
|
||
+ ret
|
||
+.size _armv8_aes_probe,.-_armv8_aes_probe
|
||
+
|
||
+.global _armv8_sha1_probe
|
||
+.type _armv8_sha1_probe,%function
|
||
+_armv8_sha1_probe:
|
||
+ sha1h s0, s0
|
||
+ ret
|
||
+.size _armv8_sha1_probe,.-_armv8_sha1_probe
|
||
+
|
||
+.global _armv8_sha256_probe
|
||
+.type _armv8_sha256_probe,%function
|
||
+_armv8_sha256_probe:
|
||
+ sha256su0 v0.4s, v0.4s
|
||
+ ret
|
||
+.size _armv8_sha256_probe,.-_armv8_sha256_probe
|
||
+.global _armv8_pmull_probe
|
||
+.type _armv8_pmull_probe,%function
|
||
+_armv8_pmull_probe:
|
||
+ pmull v0.1q, v0.1d, v0.1d
|
||
+ ret
|
||
+.size _armv8_pmull_probe,.-_armv8_pmull_probe
|
||
diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
|
||
index 5a83107..6fa8724 100644
|
||
--- a/crypto/arm_arch.h
|
||
+++ b/crypto/arm_arch.h
|
||
@@ -10,13 +10,24 @@
|
||
# define __ARMEL__
|
||
# endif
|
||
# elif defined(__GNUC__)
|
||
+# if defined(__aarch64__)
|
||
+# define __ARM_ARCH__ 8
|
||
+# if __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
|
||
+# define __ARMEB__
|
||
+# else
|
||
+# define __ARMEL__
|
||
+# endif
|
||
/*
|
||
* Why doesn't gcc define __ARM_ARCH__? Instead it defines
|
||
* bunch of below macros. See all_architectires[] table in
|
||
* gcc/config/arm/arm.c. On a side note it defines
|
||
* __ARMEL__/__ARMEB__ for little-/big-endian.
|
||
*/
|
||
-# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
|
||
+# elif defined(__ARM_ARCH)
|
||
+# define __ARM_ARCH__ __ARM_ARCH
|
||
+# elif defined(__ARM_ARCH_8A__)
|
||
+# define __ARM_ARCH__ 8
|
||
+# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
|
||
defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \
|
||
defined(__ARM_ARCH_7EM__)
|
||
# define __ARM_ARCH__ 7
|
||
@@ -43,9 +54,13 @@
|
||
|
||
#if !__ASSEMBLER__
|
||
extern unsigned int OPENSSL_armcap_P;
|
||
+#endif
|
||
|
||
#define ARMV7_NEON (1<<0)
|
||
#define ARMV7_TICK (1<<1)
|
||
-#endif
|
||
+#define ARMV8_AES (1<<2)
|
||
+#define ARMV8_SHA1 (1<<3)
|
||
+#define ARMV8_SHA256 (1<<4)
|
||
+#define ARMV8_PMULL (1<<5)
|
||
|
||
#endif
|
||
diff --git a/crypto/armcap.c b/crypto/armcap.c
|
||
index 9abaf39..7e46d07 100644
|
||
--- a/crypto/armcap.c
|
||
+++ b/crypto/armcap.c
|
||
@@ -19,9 +19,13 @@ static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
|
||
* ARM compilers support inline assembler...
|
||
*/
|
||
void _armv7_neon_probe(void);
|
||
-unsigned int _armv7_tick(void);
|
||
+void _armv8_aes_probe(void);
|
||
+void _armv8_sha1_probe(void);
|
||
+void _armv8_sha256_probe(void);
|
||
+void _armv8_pmull_probe(void);
|
||
+unsigned long _armv7_tick(void);
|
||
|
||
-unsigned int OPENSSL_rdtsc(void)
|
||
+unsigned long OPENSSL_rdtsc(void)
|
||
{
|
||
if (OPENSSL_armcap_P & ARMV7_TICK)
|
||
return _armv7_tick();
|
||
@@ -29,9 +33,41 @@ unsigned int OPENSSL_rdtsc(void)
|
||
return 0;
|
||
}
|
||
|
||
+/*
|
||
+ * Use a weak reference to getauxval() so we can use it if it is available but
|
||
+ * don't break the build if it is not.
|
||
+ */
|
||
#if defined(__GNUC__) && __GNUC__>=2
|
||
void OPENSSL_cpuid_setup(void) __attribute__((constructor));
|
||
+extern unsigned long getauxval(unsigned long type) __attribute__((weak));
|
||
+#else
|
||
+static unsigned long (*getauxval)(unsigned long) = NULL;
|
||
#endif
|
||
+
|
||
+/*
|
||
+ * ARM puts the feature bits for Crypto Extensions in AT_HWCAP2, whereas
|
||
+ * AArch64 uses AT_HWCAP.
|
||
+ */
|
||
+#if defined(__arm__) || defined (__arm)
|
||
+# define HWCAP 16 /* AT_HWCAP */
|
||
+# define HWCAP_NEON (1 << 12)
|
||
+
|
||
+# define HWCAP_CE 26 /* AT_HWCAP2 */
|
||
+# define HWCAP_CE_AES (1 << 0)
|
||
+# define HWCAP_CE_PMULL (1 << 1)
|
||
+# define HWCAP_CE_SHA1 (1 << 2)
|
||
+# define HWCAP_CE_SHA256 (1 << 3)
|
||
+#elif defined(__aarch64__)
|
||
+# define HWCAP 16 /* AT_HWCAP */
|
||
+# define HWCAP_NEON (1 << 1)
|
||
+
|
||
+# define HWCAP_CE HWCAP
|
||
+# define HWCAP_CE_AES (1 << 3)
|
||
+# define HWCAP_CE_PMULL (1 << 4)
|
||
+# define HWCAP_CE_SHA1 (1 << 5)
|
||
+# define HWCAP_CE_SHA256 (1 << 6)
|
||
+#endif
|
||
+
|
||
void OPENSSL_cpuid_setup(void)
|
||
{
|
||
char *e;
|
||
@@ -44,7 +80,7 @@ void OPENSSL_cpuid_setup(void)
|
||
|
||
if ((e=getenv("OPENSSL_armcap")))
|
||
{
|
||
- OPENSSL_armcap_P=strtoul(e,NULL,0);
|
||
+ OPENSSL_armcap_P=(unsigned int)strtoul(e,NULL,0);
|
||
return;
|
||
}
|
||
|
||
@@ -64,10 +100,51 @@ void OPENSSL_cpuid_setup(void)
|
||
sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset);
|
||
sigaction(SIGILL,&ill_act,&ill_oact);
|
||
|
||
- if (sigsetjmp(ill_jmp,1) == 0)
|
||
+ if (getauxval != NULL)
|
||
+ {
|
||
+ if (getauxval(HWCAP) & HWCAP_NEON)
|
||
+ {
|
||
+ unsigned long hwcap = getauxval(HWCAP_CE);
|
||
+
|
||
+ OPENSSL_armcap_P |= ARMV7_NEON;
|
||
+
|
||
+ if (hwcap & HWCAP_CE_AES)
|
||
+ OPENSSL_armcap_P |= ARMV8_AES;
|
||
+
|
||
+ if (hwcap & HWCAP_CE_PMULL)
|
||
+ OPENSSL_armcap_P |= ARMV8_PMULL;
|
||
+
|
||
+ if (hwcap & HWCAP_CE_SHA1)
|
||
+ OPENSSL_armcap_P |= ARMV8_SHA1;
|
||
+
|
||
+ if (hwcap & HWCAP_CE_SHA256)
|
||
+ OPENSSL_armcap_P |= ARMV8_SHA256;
|
||
+ }
|
||
+ }
|
||
+ else if (sigsetjmp(ill_jmp,1) == 0)
|
||
{
|
||
_armv7_neon_probe();
|
||
OPENSSL_armcap_P |= ARMV7_NEON;
|
||
+ if (sigsetjmp(ill_jmp,1) == 0)
|
||
+ {
|
||
+ _armv8_pmull_probe();
|
||
+ OPENSSL_armcap_P |= ARMV8_PMULL|ARMV8_AES;
|
||
+ }
|
||
+ else if (sigsetjmp(ill_jmp,1) == 0)
|
||
+ {
|
||
+ _armv8_aes_probe();
|
||
+ OPENSSL_armcap_P |= ARMV8_AES;
|
||
+ }
|
||
+ if (sigsetjmp(ill_jmp,1) == 0)
|
||
+ {
|
||
+ _armv8_sha1_probe();
|
||
+ OPENSSL_armcap_P |= ARMV8_SHA1;
|
||
+ }
|
||
+ if (sigsetjmp(ill_jmp,1) == 0)
|
||
+ {
|
||
+ _armv8_sha256_probe();
|
||
+ OPENSSL_armcap_P |= ARMV8_SHA256;
|
||
+ }
|
||
}
|
||
if (sigsetjmp(ill_jmp,1) == 0)
|
||
{
|
||
diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S
|
||
index 2d618de..add11d4 100644
|
||
--- a/crypto/armv4cpuid.S
|
||
+++ b/crypto/armv4cpuid.S
|
||
@@ -7,17 +7,49 @@
|
||
.global _armv7_neon_probe
|
||
.type _armv7_neon_probe,%function
|
||
_armv7_neon_probe:
|
||
- .word 0xf26ee1fe @ vorr q15,q15,q15
|
||
- .word 0xe12fff1e @ bx lr
|
||
+ .byte 0xf0,0x01,0x60,0xf2 @ vorr q8,q8,q8
|
||
+ .byte 0x1e,0xff,0x2f,0xe1 @ bx lr
|
||
.size _armv7_neon_probe,.-_armv7_neon_probe
|
||
|
||
.global _armv7_tick
|
||
.type _armv7_tick,%function
|
||
_armv7_tick:
|
||
- mrc p15,0,r0,c9,c13,0
|
||
- .word 0xe12fff1e @ bx lr
|
||
+ mrrc p15,1,r0,r1,c14 @ CNTVCT
|
||
+#if __ARM_ARCH__>=5
|
||
+ bx lr
|
||
+#else
|
||
+ .word 0xe12fff1e @ bx lr
|
||
+#endif
|
||
.size _armv7_tick,.-_armv7_tick
|
||
|
||
+.global _armv8_aes_probe
|
||
+.type _armv8_aes_probe,%function
|
||
+_armv8_aes_probe:
|
||
+ .byte 0x00,0x03,0xb0,0xf3 @ aese.8 q0,q0
|
||
+ .byte 0x1e,0xff,0x2f,0xe1 @ bx lr
|
||
+.size _armv8_aes_probe,.-_armv8_aes_probe
|
||
+
|
||
+.global _armv8_sha1_probe
|
||
+.type _armv8_sha1_probe,%function
|
||
+_armv8_sha1_probe:
|
||
+ .byte 0x40,0x0c,0x00,0xf2 @ sha1c.32 q0,q0,q0
|
||
+ .byte 0x1e,0xff,0x2f,0xe1 @ bx lr
|
||
+.size _armv8_sha1_probe,.-_armv8_sha1_probe
|
||
+
|
||
+.global _armv8_sha256_probe
|
||
+.type _armv8_sha256_probe,%function
|
||
+_armv8_sha256_probe:
|
||
+ .byte 0x40,0x0c,0x00,0xf3 @ sha256h.32 q0,q0,q0
|
||
+ .byte 0x1e,0xff,0x2f,0xe1 @ bx lr
|
||
+.size _armv8_sha256_probe,.-_armv8_sha256_probe
|
||
+.global _armv8_pmull_probe
|
||
+.type _armv8_pmull_probe,%function
|
||
+_armv8_pmull_probe:
|
||
+ .byte 0x00,0x0e,0xa0,0xf2 @ vmull.p64 q0,d0,d0
|
||
+ .byte 0x1e,0xff,0x2f,0xe1 @ bx lr
|
||
+.size _armv8_pmull_probe,.-_armv8_pmull_probe
|
||
+
|
||
+.align 5
|
||
.global OPENSSL_atomic_add
|
||
.type OPENSSL_atomic_add,%function
|
||
OPENSSL_atomic_add:
|
||
@@ -28,7 +60,7 @@ OPENSSL_atomic_add:
|
||
cmp r2,#0
|
||
bne .Ladd
|
||
mov r0,r3
|
||
- .word 0xe12fff1e @ bx lr
|
||
+ bx lr
|
||
#else
|
||
stmdb sp!,{r4-r6,lr}
|
||
ldr r2,.Lspinlock
|
||
@@ -81,9 +113,13 @@ OPENSSL_cleanse:
|
||
adds r1,r1,#4
|
||
bne .Little
|
||
.Lcleanse_done:
|
||
+#if __ARM_ARCH__>=5
|
||
+ bx lr
|
||
+#else
|
||
tst lr,#1
|
||
moveq pc,lr
|
||
.word 0xe12fff1e @ bx lr
|
||
+#endif
|
||
.size OPENSSL_cleanse,.-OPENSSL_cleanse
|
||
|
||
.global OPENSSL_wipe_cpu
|
||
@@ -97,41 +133,53 @@ OPENSSL_wipe_cpu:
|
||
eor ip,ip,ip
|
||
tst r0,#1
|
||
beq .Lwipe_done
|
||
- .word 0xf3000150 @ veor q0, q0, q0
|
||
- .word 0xf3022152 @ veor q1, q1, q1
|
||
- .word 0xf3044154 @ veor q2, q2, q2
|
||
- .word 0xf3066156 @ veor q3, q3, q3
|
||
- .word 0xf34001f0 @ veor q8, q8, q8
|
||
- .word 0xf34221f2 @ veor q9, q9, q9
|
||
- .word 0xf34441f4 @ veor q10, q10, q10
|
||
- .word 0xf34661f6 @ veor q11, q11, q11
|
||
- .word 0xf34881f8 @ veor q12, q12, q12
|
||
- .word 0xf34aa1fa @ veor q13, q13, q13
|
||
- .word 0xf34cc1fc @ veor q14, q14, q14
|
||
- .word 0xf34ee1fe @ veor q15, q15, q15
|
||
+ .byte 0x50,0x01,0x00,0xf3 @ veor q0, q0, q0
|
||
+ .byte 0x52,0x21,0x02,0xf3 @ veor q1, q1, q1
|
||
+ .byte 0x54,0x41,0x04,0xf3 @ veor q2, q2, q2
|
||
+ .byte 0x56,0x61,0x06,0xf3 @ veor q3, q3, q3
|
||
+ .byte 0xf0,0x01,0x40,0xf3 @ veor q8, q8, q8
|
||
+ .byte 0xf2,0x21,0x42,0xf3 @ veor q9, q9, q9
|
||
+ .byte 0xf4,0x41,0x44,0xf3 @ veor q10, q10, q10
|
||
+ .byte 0xf6,0x61,0x46,0xf3 @ veor q11, q11, q11
|
||
+ .byte 0xf8,0x81,0x48,0xf3 @ veor q12, q12, q12
|
||
+ .byte 0xfa,0xa1,0x4a,0xf3 @ veor q13, q13, q13
|
||
+ .byte 0xfc,0xc1,0x4c,0xf3 @ veor q14, q14, q14
|
||
+ .byte 0xfe,0xe1,0x4e,0xf3 @ veor q15, q15, q15
|
||
.Lwipe_done:
|
||
mov r0,sp
|
||
+#if __ARM_ARCH__>=5
|
||
+ bx lr
|
||
+#else
|
||
tst lr,#1
|
||
moveq pc,lr
|
||
.word 0xe12fff1e @ bx lr
|
||
+#endif
|
||
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
|
||
|
||
.global OPENSSL_instrument_bus
|
||
.type OPENSSL_instrument_bus,%function
|
||
OPENSSL_instrument_bus:
|
||
eor r0,r0,r0
|
||
+#if __ARM_ARCH__>=5
|
||
+ bx lr
|
||
+#else
|
||
tst lr,#1
|
||
moveq pc,lr
|
||
.word 0xe12fff1e @ bx lr
|
||
+#endif
|
||
.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
|
||
|
||
.global OPENSSL_instrument_bus2
|
||
.type OPENSSL_instrument_bus2,%function
|
||
OPENSSL_instrument_bus2:
|
||
eor r0,r0,r0
|
||
+#if __ARM_ARCH__>=5
|
||
+ bx lr
|
||
+#else
|
||
tst lr,#1
|
||
moveq pc,lr
|
||
.word 0xe12fff1e @ bx lr
|
||
+#endif
|
||
.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
|
||
|
||
.align 5
|
||
diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile
|
||
index 6dd136b..effc409 100644
|
||
--- a/crypto/bn/Makefile
|
||
+++ b/crypto/bn/Makefile
|
||
@@ -130,9 +130,10 @@ alpha-mont.s: asm/alpha-mont.pl
|
||
$(CC) -E $$preproc > $@ && rm $$preproc)
|
||
|
||
# GNU make "catch all"
|
||
-%-mont.s: asm/%-mont.pl; $(PERL) $< $(PERLASM_SCHEME) $@
|
||
+%-mont.S: asm/%-mont.pl; $(PERL) $< $(PERLASM_SCHEME) $@
|
||
%-gf2m.S: asm/%-gf2m.pl; $(PERL) $< $(PERLASM_SCHEME) $@
|
||
|
||
+armv4-mont.o: armv4-mont.S
|
||
armv4-gf2m.o: armv4-gf2m.S
|
||
|
||
files:
|
||
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
|
||
index c52e0b7..b781afb 100644
|
||
--- a/crypto/bn/asm/armv4-gf2m.pl
|
||
+++ b/crypto/bn/asm/armv4-gf2m.pl
|
||
@@ -20,14 +20,21 @@
|
||
# length, more for longer keys. Even though NEON 1x1 multiplication
|
||
# runs in even less cycles, ~30, improvement is measurable only on
|
||
# longer keys. One has to optimize code elsewhere to get NEON glow...
|
||
+#
|
||
+# April 2014
|
||
+#
|
||
+# Double bn_GF2m_mul_2x2 performance by using algorithm from paper
|
||
+# referred below, which improves ECDH and ECDSA verify benchmarks
|
||
+# by 18-40%.
|
||
+#
|
||
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
|
||
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
|
||
+#
|
||
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
|
||
|
||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||
open STDOUT,">$output";
|
||
|
||
-sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
|
||
-sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
|
||
-sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
|
||
-
|
||
$code=<<___;
|
||
#include "arm_arch.h"
|
||
|
||
@@ -36,31 +43,6 @@ $code=<<___;
|
||
|
||
#if __ARM_ARCH__>=7
|
||
.fpu neon
|
||
-
|
||
-.type mul_1x1_neon,%function
|
||
-.align 5
|
||
-mul_1x1_neon:
|
||
- vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a
|
||
- vmull.p8 `&Q("d0")`,d16,d17 @ a<>bb
|
||
- vshl.u64 `&Dlo("q2")`,d16,#16
|
||
- vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8<>bb
|
||
- vshl.u64 `&Dlo("q3")`,d16,#24
|
||
- vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16<31>bb
|
||
- vshr.u64 `&Dlo("q1")`,#8
|
||
- vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24<32>bb
|
||
- vshl.u64 `&Dhi("q1")`,#24
|
||
- veor d0,`&Dlo("q1")`
|
||
- vshr.u64 `&Dlo("q2")`,#16
|
||
- veor d0,`&Dhi("q1")`
|
||
- vshl.u64 `&Dhi("q2")`,#16
|
||
- veor d0,`&Dlo("q2")`
|
||
- vshr.u64 `&Dlo("q3")`,#24
|
||
- veor d0,`&Dhi("q2")`
|
||
- vshl.u64 `&Dhi("q3")`,#8
|
||
- veor d0,`&Dlo("q3")`
|
||
- veor d0,`&Dhi("q3")`
|
||
- bx lr
|
||
-.size mul_1x1_neon,.-mul_1x1_neon
|
||
#endif
|
||
___
|
||
################
|
||
@@ -159,8 +141,9 @@ ___
|
||
# void bn_GF2m_mul_2x2(BN_ULONG *r,
|
||
# BN_ULONG a1,BN_ULONG a0,
|
||
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
|
||
-
|
||
-($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
|
||
+{
|
||
+my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
|
||
+my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
|
||
|
||
$code.=<<___;
|
||
.global bn_GF2m_mul_2x2
|
||
@@ -173,44 +156,58 @@ bn_GF2m_mul_2x2:
|
||
tst r12,#1
|
||
beq .Lialu
|
||
|
||
- veor $A1,$A1
|
||
- vmov.32 $B1,r3,r3 @ two copies of b1
|
||
- vmov.32 ${A1}[0],r1 @ a1
|
||
-
|
||
- veor $A0,$A0
|
||
- vld1.32 ${B0}[],[sp,:32] @ two copies of b0
|
||
- vmov.32 ${A0}[0],r2 @ a0
|
||
- mov r12,lr
|
||
-
|
||
- vmov d16,$A1
|
||
- vmov d17,$B1
|
||
- bl mul_1x1_neon @ a1<61>b1
|
||
- vmov $A1B1,d0
|
||
-
|
||
- vmov d16,$A0
|
||
- vmov d17,$B0
|
||
- bl mul_1x1_neon @ a0<61>b0
|
||
- vmov $A0B0,d0
|
||
-
|
||
- veor d16,$A0,$A1
|
||
- veor d17,$B0,$B1
|
||
- veor $A0,$A0B0,$A1B1
|
||
- bl mul_1x1_neon @ (a0+a1)<29>(b0+b1)
|
||
-
|
||
- veor d0,$A0 @ (a0+a1)<29>(b0+b1)-a0<61>b0-a1<61>b1
|
||
- vshl.u64 d1,d0,#32
|
||
- vshr.u64 d0,d0,#32
|
||
- veor $A0B0,d1
|
||
- veor $A1B1,d0
|
||
- vst1.32 {${A0B0}[0]},[r0,:32]!
|
||
- vst1.32 {${A0B0}[1]},[r0,:32]!
|
||
- vst1.32 {${A1B1}[0]},[r0,:32]!
|
||
- vst1.32 {${A1B1}[1]},[r0,:32]
|
||
- bx r12
|
||
+ ldr r12, [sp] @ 5th argument
|
||
+ vmov.32 $a, r2, r1
|
||
+ vmov.32 $b, r12, r3
|
||
+ vmov.i64 $k48, #0x0000ffffffffffff
|
||
+ vmov.i64 $k32, #0x00000000ffffffff
|
||
+ vmov.i64 $k16, #0x000000000000ffff
|
||
+
|
||
+ vext.8 $t0#lo, $a, $a, #1 @ A1
|
||
+ vmull.p8 $t0, $t0#lo, $b @ F = A1*B
|
||
+ vext.8 $r#lo, $b, $b, #1 @ B1
|
||
+ vmull.p8 $r, $a, $r#lo @ E = A*B1
|
||
+ vext.8 $t1#lo, $a, $a, #2 @ A2
|
||
+ vmull.p8 $t1, $t1#lo, $b @ H = A2*B
|
||
+ vext.8 $t3#lo, $b, $b, #2 @ B2
|
||
+ vmull.p8 $t3, $a, $t3#lo @ G = A*B2
|
||
+ vext.8 $t2#lo, $a, $a, #3 @ A3
|
||
+ veor $t0, $t0, $r @ L = E + F
|
||
+ vmull.p8 $t2, $t2#lo, $b @ J = A3*B
|
||
+ vext.8 $r#lo, $b, $b, #3 @ B3
|
||
+ veor $t1, $t1, $t3 @ M = G + H
|
||
+ vmull.p8 $r, $a, $r#lo @ I = A*B3
|
||
+ veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
|
||
+ vand $t0#hi, $t0#hi, $k48
|
||
+ vext.8 $t3#lo, $b, $b, #4 @ B4
|
||
+ veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
|
||
+ vand $t1#hi, $t1#hi, $k32
|
||
+ vmull.p8 $t3, $a, $t3#lo @ K = A*B4
|
||
+ veor $t2, $t2, $r @ N = I + J
|
||
+ veor $t0#lo, $t0#lo, $t0#hi
|
||
+ veor $t1#lo, $t1#lo, $t1#hi
|
||
+ veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
|
||
+ vand $t2#hi, $t2#hi, $k16
|
||
+ vext.8 $t0, $t0, $t0, #15
|
||
+ veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
|
||
+ vmov.i64 $t3#hi, #0
|
||
+ vext.8 $t1, $t1, $t1, #14
|
||
+ veor $t2#lo, $t2#lo, $t2#hi
|
||
+ vmull.p8 $r, $a, $b @ D = A*B
|
||
+ vext.8 $t3, $t3, $t3, #12
|
||
+ vext.8 $t2, $t2, $t2, #13
|
||
+ veor $t0, $t0, $t1
|
||
+ veor $t2, $t2, $t3
|
||
+ veor $r, $r, $t0
|
||
+ veor $r, $r, $t2
|
||
+
|
||
+ vst1.32 {$r}, [r0]
|
||
+ ret @ bx lr
|
||
.align 4
|
||
.Lialu:
|
||
#endif
|
||
___
|
||
+}
|
||
$ret="r10"; # reassigned 1st argument
|
||
$code.=<<___;
|
||
stmdb sp!,{r4-r10,lr}
|
||
@@ -272,7 +269,13 @@ $code.=<<___;
|
||
.comm OPENSSL_armcap_P,4,4
|
||
___
|
||
|
||
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
|
||
-print $code;
|
||
+foreach (split("\n",$code)) {
|
||
+ s/\`([^\`]*)\`/eval $1/geo;
|
||
+
|
||
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
|
||
+ s/\bret\b/bx lr/go or
|
||
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
|
||
+
|
||
+ print $_,"\n";
|
||
+}
|
||
close STDOUT; # enforce flush
|
||
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
|
||
index f78a8b5..72bad8e 100644
|
||
--- a/crypto/bn/asm/armv4-mont.pl
|
||
+++ b/crypto/bn/asm/armv4-mont.pl
|
||
@@ -1,7 +1,7 @@
|
||
#!/usr/bin/env perl
|
||
|
||
# ====================================================================
|
||
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||
# project. The module is, however, dual licensed under OpenSSL and
|
||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||
@@ -23,6 +23,21 @@
|
||
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
|
||
# about decorations, ABI and instruction syntax are identical.
|
||
|
||
+# November 2013
|
||
+#
|
||
+# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
|
||
+# performance improvement on Cortex-A8 is ~45-100% depending on key
|
||
+# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
|
||
+# On Snapdragon S4 improvement was measured to vary from ~70% to
|
||
+# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
|
||
+# rather because original integer-only code seems to perform
|
||
+# suboptimally on S4. Situation on Cortex-A9 is unfortunately
|
||
+# different. It's being looked into, but the trouble is that
|
||
+# performance for vectors longer than 256 bits is actually couple
|
||
+# of percent worse than for integer-only code. The code is chosen
|
||
+# for execution on all NEON-capable processors, because gain on
|
||
+# others outweighs the marginal loss on Cortex-A9.
|
||
+
|
||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||
open STDOUT,">$output";
|
||
|
||
@@ -52,16 +67,40 @@ $_n0="$num,#14*4";
|
||
$_num="$num,#15*4"; $_bpend=$_num;
|
||
|
||
$code=<<___;
|
||
+#include "arm_arch.h"
|
||
+
|
||
.text
|
||
+.code 32
|
||
+
|
||
+#if __ARM_ARCH__>=7
|
||
+.align 5
|
||
+.LOPENSSL_armcap:
|
||
+.word OPENSSL_armcap_P-bn_mul_mont
|
||
+#endif
|
||
|
||
.global bn_mul_mont
|
||
.type bn_mul_mont,%function
|
||
|
||
-.align 2
|
||
+.align 5
|
||
bn_mul_mont:
|
||
+ ldr ip,[sp,#4] @ load num
|
||
stmdb sp!,{r0,r2} @ sp points at argument block
|
||
- ldr $num,[sp,#3*4] @ load num
|
||
- cmp $num,#2
|
||
+#if __ARM_ARCH__>=7
|
||
+ tst ip,#7
|
||
+ bne .Lialu
|
||
+ adr r0,bn_mul_mont
|
||
+ ldr r2,.LOPENSSL_armcap
|
||
+ ldr r0,[r0,r2]
|
||
+ tst r0,#1 @ NEON available?
|
||
+ ldmia sp, {r0,r2}
|
||
+ beq .Lialu
|
||
+ add sp,sp,#8
|
||
+ b bn_mul8x_mont_neon
|
||
+.align 4
|
||
+.Lialu:
|
||
+#endif
|
||
+ cmp ip,#2
|
||
+ mov $num,ip @ load num
|
||
movlt r0,#0
|
||
addlt sp,sp,#2*4
|
||
blt .Labrt
|
||
@@ -191,14 +230,446 @@ bn_mul_mont:
|
||
ldmia sp!,{r4-r12,lr} @ restore registers
|
||
add sp,sp,#2*4 @ skip over {r0,r2}
|
||
mov r0,#1
|
||
-.Labrt: tst lr,#1
|
||
+.Labrt:
|
||
+#if __ARM_ARCH__>=5
|
||
+ ret @ bx lr
|
||
+#else
|
||
+ tst lr,#1
|
||
moveq pc,lr @ be binary compatible with V4, yet
|
||
bx lr @ interoperable with Thumb ISA:-)
|
||
+#endif
|
||
.size bn_mul_mont,.-bn_mul_mont
|
||
-.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
|
||
+___
|
||
+{
|
||
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
|
||
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
|
||
+
|
||
+my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
|
||
+my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
|
||
+my ($Z,$Temp)=("q4","q5");
|
||
+my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
|
||
+my ($Bi,$Ni,$M0)=map("d$_",(28..31));
|
||
+my $zero=&Dlo($Z);
|
||
+my $temp=&Dlo($Temp);
|
||
+
|
||
+my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
|
||
+my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
|
||
+
|
||
+$code.=<<___;
|
||
+#if __ARM_ARCH__>=7
|
||
+.fpu neon
|
||
+
|
||
+.type bn_mul8x_mont_neon,%function
|
||
+.align 5
|
||
+bn_mul8x_mont_neon:
|
||
+ mov ip,sp
|
||
+ stmdb sp!,{r4-r11}
|
||
+ vstmdb sp!,{d8-d15} @ ABI specification says so
|
||
+ ldmia ip,{r4-r5} @ load rest of parameter block
|
||
+
|
||
+ sub $toutptr,sp,#16
|
||
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
|
||
+ sub $toutptr,$toutptr,$num,lsl#4
|
||
+ vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
|
||
+ and $toutptr,$toutptr,#-64
|
||
+ vld1.32 {${M0}[0]}, [$n0,:32]
|
||
+ mov sp,$toutptr @ alloca
|
||
+ veor $zero,$zero,$zero
|
||
+ subs $inner,$num,#8
|
||
+ vzip.16 $Bi,$zero
|
||
+
|
||
+ vmull.u32 $A0xB,$Bi,${A0}[0]
|
||
+ vmull.u32 $A1xB,$Bi,${A0}[1]
|
||
+ vmull.u32 $A2xB,$Bi,${A1}[0]
|
||
+ vshl.i64 $temp,`&Dhi("$A0xB")`,#16
|
||
+ vmull.u32 $A3xB,$Bi,${A1}[1]
|
||
+
|
||
+ vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
|
||
+ veor $zero,$zero,$zero
|
||
+ vmul.u32 $Ni,$temp,$M0
|
||
+
|
||
+ vmull.u32 $A4xB,$Bi,${A2}[0]
|
||
+ vld1.32 {$N0-$N3}, [$nptr]!
|
||
+ vmull.u32 $A5xB,$Bi,${A2}[1]
|
||
+ vmull.u32 $A6xB,$Bi,${A3}[0]
|
||
+ vzip.16 $Ni,$zero
|
||
+ vmull.u32 $A7xB,$Bi,${A3}[1]
|
||
+
|
||
+ bne .LNEON_1st
|
||
+
|
||
+ @ special case for num=8, everything is in register bank...
|
||
+
|
||
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
|
||
+ sub $outer,$num,#1
|
||
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
|
||
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
|
||
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
|
||
+
|
||
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
|
||
+ vmov $Temp,$A0xB
|
||
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
|
||
+ vmov $A0xB,$A1xB
|
||
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
|
||
+ vmov $A1xB,$A2xB
|
||
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
|
||
+ vmov $A2xB,$A3xB
|
||
+ vmov $A3xB,$A4xB
|
||
+ vshr.u64 $temp,$temp,#16
|
||
+ vmov $A4xB,$A5xB
|
||
+ vmov $A5xB,$A6xB
|
||
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
|
||
+ vmov $A6xB,$A7xB
|
||
+ veor $A7xB,$A7xB
|
||
+ vshr.u64 $temp,$temp,#16
|
||
+
|
||
+ b .LNEON_outer8
|
||
+
|
||
+.align 4
|
||
+.LNEON_outer8:
|
||
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
|
||
+ veor $zero,$zero,$zero
|
||
+ vzip.16 $Bi,$zero
|
||
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
|
||
+
|
||
+ vmlal.u32 $A0xB,$Bi,${A0}[0]
|
||
+ vmlal.u32 $A1xB,$Bi,${A0}[1]
|
||
+ vmlal.u32 $A2xB,$Bi,${A1}[0]
|
||
+ vshl.i64 $temp,`&Dhi("$A0xB")`,#16
|
||
+ vmlal.u32 $A3xB,$Bi,${A1}[1]
|
||
+
|
||
+ vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
|
||
+ veor $zero,$zero,$zero
|
||
+ subs $outer,$outer,#1
|
||
+ vmul.u32 $Ni,$temp,$M0
|
||
+
|
||
+ vmlal.u32 $A4xB,$Bi,${A2}[0]
|
||
+ vmlal.u32 $A5xB,$Bi,${A2}[1]
|
||
+ vmlal.u32 $A6xB,$Bi,${A3}[0]
|
||
+ vzip.16 $Ni,$zero
|
||
+ vmlal.u32 $A7xB,$Bi,${A3}[1]
|
||
+
|
||
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
|
||
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
|
||
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
|
||
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
|
||
+
|
||
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
|
||
+ vmov $Temp,$A0xB
|
||
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
|
||
+ vmov $A0xB,$A1xB
|
||
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
|
||
+ vmov $A1xB,$A2xB
|
||
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
|
||
+ vmov $A2xB,$A3xB
|
||
+ vmov $A3xB,$A4xB
|
||
+ vshr.u64 $temp,$temp,#16
|
||
+ vmov $A4xB,$A5xB
|
||
+ vmov $A5xB,$A6xB
|
||
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
|
||
+ vmov $A6xB,$A7xB
|
||
+ veor $A7xB,$A7xB
|
||
+ vshr.u64 $temp,$temp,#16
|
||
+
|
||
+ bne .LNEON_outer8
|
||
+
|
||
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
|
||
+ mov $toutptr,sp
|
||
+ vshr.u64 $temp,`&Dlo("$A0xB")`,#16
|
||
+ mov $inner,$num
|
||
+ vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
|
||
+ add $tinptr,sp,#16
|
||
+ vshr.u64 $temp,`&Dhi("$A0xB")`,#16
|
||
+ vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
|
||
+
|
||
+ b .LNEON_tail2
|
||
+
|
||
+.align 4
|
||
+.LNEON_1st:
|
||
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
|
||
+ vld1.32 {$A0-$A3}, [$aptr]!
|
||
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
|
||
+ subs $inner,$inner,#8
|
||
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
|
||
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
|
||
+
|
||
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
|
||
+ vld1.32 {$N0-$N1}, [$nptr]!
|
||
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
|
||
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
|
||
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
|
||
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
|
||
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
|
||
+
|
||
+ vmull.u32 $A0xB,$Bi,${A0}[0]
|
||
+ vld1.32 {$N2-$N3}, [$nptr]!
|
||
+ vmull.u32 $A1xB,$Bi,${A0}[1]
|
||
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
|
||
+ vmull.u32 $A2xB,$Bi,${A1}[0]
|
||
+ vmull.u32 $A3xB,$Bi,${A1}[1]
|
||
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
|
||
+
|
||
+ vmull.u32 $A4xB,$Bi,${A2}[0]
|
||
+ vmull.u32 $A5xB,$Bi,${A2}[1]
|
||
+ vmull.u32 $A6xB,$Bi,${A3}[0]
|
||
+ vmull.u32 $A7xB,$Bi,${A3}[1]
|
||
+
|
||
+ bne .LNEON_1st
|
||
+
|
||
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
|
||
+ add $tinptr,sp,#16
|
||
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
|
||
+ sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
|
||
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
|
||
+ vld1.64 {$Temp}, [sp,:128]
|
||
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
|
||
+ sub $outer,$num,#1
|
||
+
|
||
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
|
||
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
|
||
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
|
||
+ vshr.u64 $temp,$temp,#16
|
||
+ vld1.64 {$A0xB}, [$tinptr, :128]!
|
||
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
|
||
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
|
||
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
|
||
+
|
||
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
|
||
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
|
||
+ veor $Z,$Z,$Z
|
||
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
|
||
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
|
||
+ vst1.64 {$Z}, [$toutptr,:128]
|
||
+ vshr.u64 $temp,$temp,#16
|
||
+
|
||
+ b .LNEON_outer
|
||
+
|
||
+.align 4
|
||
+.LNEON_outer:
|
||
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
|
||
+ sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
|
||
+ vld1.32 {$A0-$A3}, [$aptr]!
|
||
+ veor $zero,$zero,$zero
|
||
+ mov $toutptr,sp
|
||
+ vzip.16 $Bi,$zero
|
||
+ sub $inner,$num,#8
|
||
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
|
||
+
|
||
+ vmlal.u32 $A0xB,$Bi,${A0}[0]
|
||
+ vld1.64 {$A3xB-$A4xB},[$tinptr,:256]!
|
||
+ vmlal.u32 $A1xB,$Bi,${A0}[1]
|
||
+ vmlal.u32 $A2xB,$Bi,${A1}[0]
|
||
+ vld1.64 {$A5xB-$A6xB},[$tinptr,:256]!
|
||
+ vmlal.u32 $A3xB,$Bi,${A1}[1]
|
||
+
|
||
+ vshl.i64 $temp,`&Dhi("$A0xB")`,#16
|
||
+ veor $zero,$zero,$zero
|
||
+ vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
|
||
+ vld1.64 {$A7xB},[$tinptr,:128]!
|
||
+ vmul.u32 $Ni,$temp,$M0
|
||
+
|
||
+ vmlal.u32 $A4xB,$Bi,${A2}[0]
|
||
+ vld1.32 {$N0-$N3}, [$nptr]!
|
||
+ vmlal.u32 $A5xB,$Bi,${A2}[1]
|
||
+ vmlal.u32 $A6xB,$Bi,${A3}[0]
|
||
+ vzip.16 $Ni,$zero
|
||
+ vmlal.u32 $A7xB,$Bi,${A3}[1]
|
||
+
|
||
+.LNEON_inner:
|
||
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
|
||
+ vld1.32 {$A0-$A3}, [$aptr]!
|
||
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
|
||
+ subs $inner,$inner,#8
|
||
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
|
||
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
|
||
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
|
||
+
|
||
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
|
||
+ vld1.64 {$A0xB}, [$tinptr, :128]!
|
||
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
|
||
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
|
||
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
|
||
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
|
||
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
|
||
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
|
||
+
|
||
+ vmlal.u32 $A0xB,$Bi,${A0}[0]
|
||
+ vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
|
||
+ vmlal.u32 $A1xB,$Bi,${A0}[1]
|
||
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
|
||
+ vmlal.u32 $A2xB,$Bi,${A1}[0]
|
||
+ vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
|
||
+ vmlal.u32 $A3xB,$Bi,${A1}[1]
|
||
+ vld1.32 {$N0-$N3}, [$nptr]!
|
||
+
|
||
+ vmlal.u32 $A4xB,$Bi,${A2}[0]
|
||
+ vld1.64 {$A7xB}, [$tinptr, :128]!
|
||
+ vmlal.u32 $A5xB,$Bi,${A2}[1]
|
||
+ vmlal.u32 $A6xB,$Bi,${A3}[0]
|
||
+ vmlal.u32 $A7xB,$Bi,${A3}[1]
|
||
+
|
||
+ bne .LNEON_inner
|
||
+
|
||
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
|
||
+ add $tinptr,sp,#16
|
||
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
|
||
+ sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
|
||
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
|
||
+ vld1.64 {$Temp}, [sp,:128]
|
||
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
|
||
+ subs $outer,$outer,#1
|
||
+
|
||
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
|
||
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
|
||
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
|
||
+ vld1.64 {$A0xB}, [$tinptr, :128]!
|
||
+ vshr.u64 $temp,$temp,#16
|
||
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
|
||
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
|
||
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
|
||
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
|
||
+
|
||
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
|
||
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
|
||
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
|
||
+ vshr.u64 $temp,$temp,#16
|
||
+
|
||
+ bne .LNEON_outer
|
||
+
|
||
+ mov $toutptr,sp
|
||
+ mov $inner,$num
|
||
+
|
||
+.LNEON_tail:
|
||
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
|
||
+ vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
|
||
+ vshr.u64 $temp,`&Dlo("$A0xB")`,#16
|
||
+ vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
|
||
+ vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
|
||
+ vshr.u64 $temp,`&Dhi("$A0xB")`,#16
|
||
+ vld1.64 {$A7xB}, [$tinptr, :128]!
|
||
+ vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
|
||
+
|
||
+.LNEON_tail2:
|
||
+ vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
|
||
+ vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
|
||
+ vshr.u64 $temp,`&Dlo("$A1xB")`,#16
|
||
+ vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
|
||
+ vshr.u64 $temp,`&Dhi("$A1xB")`,#16
|
||
+ vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")`
|
||
+
|
||
+ vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
|
||
+ vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
|
||
+ vshr.u64 $temp,`&Dlo("$A2xB")`,#16
|
||
+ vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
|
||
+ vshr.u64 $temp,`&Dhi("$A2xB")`,#16
|
||
+ vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")`
|
||
+
|
||
+ vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
|
||
+ vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
|
||
+ vshr.u64 $temp,`&Dlo("$A3xB")`,#16
|
||
+ vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
|
||
+ vshr.u64 $temp,`&Dhi("$A3xB")`,#16
|
||
+ vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")`
|
||
+
|
||
+ vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
|
||
+ vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
|
||
+ vshr.u64 $temp,`&Dlo("$A4xB")`,#16
|
||
+ vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
|
||
+ vshr.u64 $temp,`&Dhi("$A4xB")`,#16
|
||
+ vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")`
|
||
+
|
||
+ vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
|
||
+ vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
|
||
+ vshr.u64 $temp,`&Dlo("$A5xB")`,#16
|
||
+ vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
|
||
+ vshr.u64 $temp,`&Dhi("$A5xB")`,#16
|
||
+ vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")`
|
||
+
|
||
+ vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
|
||
+ vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
|
||
+ vshr.u64 $temp,`&Dlo("$A6xB")`,#16
|
||
+ vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
|
||
+ vld1.64 {$A0xB}, [$tinptr, :128]!
|
||
+ vshr.u64 $temp,`&Dhi("$A6xB")`,#16
|
||
+ vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")`
|
||
+
|
||
+ vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
|
||
+ vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
|
||
+ vshr.u64 $temp,`&Dlo("$A7xB")`,#16
|
||
+ vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
|
||
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
|
||
+ vshr.u64 $temp,`&Dhi("$A7xB")`,#16
|
||
+ vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
|
||
+ subs $inner,$inner,#8
|
||
+ vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
|
||
+
|
||
+ bne .LNEON_tail
|
||
+
|
||
+ vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
|
||
+ sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
|
||
+ subs $aptr,sp,#0 @ clear carry flag
|
||
+ add $bptr,sp,$num,lsl#2
|
||
+
|
||
+.LNEON_sub:
|
||
+ ldmia $aptr!, {r4-r7}
|
||
+ ldmia $nptr!, {r8-r11}
|
||
+ sbcs r8, r4,r8
|
||
+ sbcs r9, r5,r9
|
||
+ sbcs r10,r6,r10
|
||
+ sbcs r11,r7,r11
|
||
+ teq $aptr,$bptr @ preserves carry
|
||
+ stmia $rptr!, {r8-r11}
|
||
+ bne .LNEON_sub
|
||
+
|
||
+ ldr r10, [$aptr] @ load top-most bit
|
||
+ veor q0,q0,q0
|
||
+ sub r11,$bptr,sp @ this is num*4
|
||
+ veor q1,q1,q1
|
||
+ mov $aptr,sp
|
||
+ sub $rptr,$rptr,r11 @ rewind $rptr
|
||
+ mov $nptr,$bptr @ second 3/4th of frame
|
||
+ sbcs r10,r10,#0 @ result is carry flag
|
||
+
|
||
+.LNEON_copy_n_zap:
|
||
+ ldmia $aptr!, {r4-r7}
|
||
+ ldmia $rptr, {r8-r11}
|
||
+ movcc r8, r4
|
||
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
|
||
+ movcc r9, r5
|
||
+ movcc r10,r6
|
||
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
|
||
+ movcc r11,r7
|
||
+ ldmia $aptr, {r4-r7}
|
||
+ stmia $rptr!, {r8-r11}
|
||
+ sub $aptr,$aptr,#16
|
||
+ ldmia $rptr, {r8-r11}
|
||
+ movcc r8, r4
|
||
+ vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
|
||
+ movcc r9, r5
|
||
+ movcc r10,r6
|
||
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
|
||
+ movcc r11,r7
|
||
+ teq $aptr,$bptr @ preserves carry
|
||
+ stmia $rptr!, {r8-r11}
|
||
+ bne .LNEON_copy_n_zap
|
||
+
|
||
+ sub sp,ip,#96
|
||
+ vldmia sp!,{d8-d15}
|
||
+ ldmia sp!,{r4-r11}
|
||
+ ret @ bx lr
|
||
+.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
|
||
+#endif
|
||
+___
|
||
+}
|
||
+$code.=<<___;
|
||
+.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
|
||
.align 2
|
||
+#if __ARM_ARCH__>=7
|
||
+.comm OPENSSL_armcap_P,4,4
|
||
+#endif
|
||
___
|
||
|
||
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
|
||
+$code =~ s/\bret\b/bx lr/gm;
|
||
print $code;
|
||
close STDOUT;
|
||
diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c
|
||
index c7869b6..ad0f7a4 100644
|
||
--- a/crypto/evp/e_aes.c
|
||
+++ b/crypto/evp/e_aes.c
|
||
@@ -62,7 +62,7 @@
|
||
|
||
typedef struct
|
||
{
|
||
- AES_KEY ks;
|
||
+ union { double align; AES_KEY ks; } ks;
|
||
block128_f block;
|
||
union {
|
||
cbc128_f cbc;
|
||
@@ -72,7 +72,7 @@ typedef struct
|
||
|
||
typedef struct
|
||
{
|
||
- AES_KEY ks; /* AES key schedule to use */
|
||
+ union { double align; AES_KEY ks; } ks; /* AES key schedule to use */
|
||
int key_set; /* Set if key initialised */
|
||
int iv_set; /* Set if an iv is set */
|
||
GCM128_CONTEXT gcm;
|
||
@@ -86,7 +86,7 @@ typedef struct
|
||
|
||
typedef struct
|
||
{
|
||
- AES_KEY ks1, ks2; /* AES key schedules to use */
|
||
+ union { double align; AES_KEY ks; } ks1, ks2; /* AES key schedules to use */
|
||
XTS128_CONTEXT xts;
|
||
void (*stream)(const unsigned char *in,
|
||
unsigned char *out, size_t length,
|
||
@@ -96,7 +96,7 @@ typedef struct
|
||
|
||
typedef struct
|
||
{
|
||
- AES_KEY ks; /* AES key schedule to use */
|
||
+ union { double align; AES_KEY ks; } ks; /* AES key schedule to use */
|
||
int key_set; /* Set if key initialised */
|
||
int iv_set; /* Set if an iv is set */
|
||
int tag_set; /* Set if tag is valid */
|
||
@@ -160,7 +160,7 @@ void AES_xts_decrypt(const char *inp,char *out,size_t len,
|
||
defined(_M_AMD64) || defined(_M_X64) || \
|
||
defined(__INTEL__) )
|
||
|
||
-extern unsigned int OPENSSL_ia32cap_P[2];
|
||
+extern unsigned int OPENSSL_ia32cap_P[];
|
||
|
||
#ifdef VPAES_ASM
|
||
#define VPAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
|
||
@@ -310,7 +310,7 @@ static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
return 1;
|
||
if (key)
|
||
{
|
||
- aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
|
||
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
|
||
CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
|
||
(block128_f)aesni_encrypt);
|
||
gctx->ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
|
||
@@ -355,19 +355,19 @@ static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
/* key_len is two AES keys */
|
||
if (enc)
|
||
{
|
||
- aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
|
||
+ aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
|
||
xctx->xts.block1 = (block128_f)aesni_encrypt;
|
||
xctx->stream = aesni_xts_encrypt;
|
||
}
|
||
else
|
||
{
|
||
- aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
|
||
+ aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
|
||
xctx->xts.block1 = (block128_f)aesni_decrypt;
|
||
xctx->stream = aesni_xts_decrypt;
|
||
}
|
||
|
||
aesni_set_encrypt_key(key + ctx->key_len/2,
|
||
- ctx->key_len * 4, &xctx->ks2);
|
||
+ ctx->key_len * 4, &xctx->ks2.ks);
|
||
xctx->xts.block2 = (block128_f)aesni_encrypt;
|
||
|
||
xctx->xts.key1 = &xctx->ks1;
|
||
@@ -394,7 +394,7 @@ static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
return 1;
|
||
if (key)
|
||
{
|
||
- aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
|
||
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
|
||
CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
|
||
&cctx->ks, (block128_f)aesni_encrypt);
|
||
cctx->str = enc?(ccm128_f)aesni_ccm64_encrypt_blocks :
|
||
@@ -484,6 +484,38 @@ const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
|
||
{ return &aes_##keylen##_##mode; }
|
||
#endif
|
||
|
||
+#if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__))
|
||
+#include "arm_arch.h"
|
||
+#if __ARM_ARCH__>=7
|
||
+# if defined(BSAES_ASM)
|
||
+# define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
|
||
+# endif
|
||
+# define HWAES_CAPABLE (OPENSSL_armcap_P & ARMV8_AES)
|
||
+# define HWAES_set_encrypt_key aes_v8_set_encrypt_key
|
||
+# define HWAES_set_decrypt_key aes_v8_set_decrypt_key
|
||
+# define HWAES_encrypt aes_v8_encrypt
|
||
+# define HWAES_decrypt aes_v8_decrypt
|
||
+# define HWAES_cbc_encrypt aes_v8_cbc_encrypt
|
||
+# define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks
|
||
+#endif
|
||
+#endif
|
||
+
|
||
+#if defined(HWAES_CAPABLE)
|
||
+int HWAES_set_encrypt_key(const unsigned char *userKey, const int bits,
|
||
+ AES_KEY *key);
|
||
+int HWAES_set_decrypt_key(const unsigned char *userKey, const int bits,
|
||
+ AES_KEY *key);
|
||
+void HWAES_encrypt(const unsigned char *in, unsigned char *out,
|
||
+ const AES_KEY *key);
|
||
+void HWAES_decrypt(const unsigned char *in, unsigned char *out,
|
||
+ const AES_KEY *key);
|
||
+void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out,
|
||
+ size_t length, const AES_KEY *key,
|
||
+ unsigned char *ivec, const int enc);
|
||
+void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
|
||
+ size_t len, const AES_KEY *key, const unsigned char ivec[16]);
|
||
+#endif
|
||
+
|
||
#define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \
|
||
BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
|
||
BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
|
||
@@ -502,10 +534,23 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
mode = ctx->cipher->flags & EVP_CIPH_MODE;
|
||
if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
|
||
&& !enc)
|
||
+#ifdef HWAES_CAPABLE
|
||
+ if (HWAES_CAPABLE)
|
||
+ {
|
||
+ ret = HWAES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);
|
||
+ dat->block = (block128_f)HWAES_decrypt;
|
||
+ dat->stream.cbc = NULL;
|
||
+#ifdef HWAES_cbc_encrypt
|
||
+ if (mode==EVP_CIPH_CBC_MODE)
|
||
+ dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
|
||
+#endif
|
||
+ }
|
||
+ else
|
||
+#endif
|
||
#ifdef BSAES_CAPABLE
|
||
if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE)
|
||
{
|
||
- ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
|
||
+ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);
|
||
dat->block = (block128_f)AES_decrypt;
|
||
dat->stream.cbc = (cbc128_f)bsaes_cbc_encrypt;
|
||
}
|
||
@@ -514,7 +559,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
#ifdef VPAES_CAPABLE
|
||
if (VPAES_CAPABLE)
|
||
{
|
||
- ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
|
||
+ ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);
|
||
dat->block = (block128_f)vpaes_decrypt;
|
||
dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
|
||
(cbc128_f)vpaes_cbc_encrypt :
|
||
@@ -523,17 +568,37 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
else
|
||
#endif
|
||
{
|
||
- ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
|
||
+ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);
|
||
dat->block = (block128_f)AES_decrypt;
|
||
dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
|
||
(cbc128_f)AES_cbc_encrypt :
|
||
NULL;
|
||
}
|
||
else
|
||
+#ifdef HWAES_CAPABLE
|
||
+ if (HWAES_CAPABLE)
|
||
+ {
|
||
+ ret = HWAES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);
|
||
+ dat->block = (block128_f)HWAES_encrypt;
|
||
+ dat->stream.cbc = NULL;
|
||
+#ifdef HWAES_cbc_encrypt
|
||
+ if (mode==EVP_CIPH_CBC_MODE)
|
||
+ dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
|
||
+ else
|
||
+#endif
|
||
+#ifdef HWAES_ctr32_encrypt_blocks
|
||
+ if (mode==EVP_CIPH_CTR_MODE)
|
||
+ dat->stream.ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks;
|
||
+ else
|
||
+#endif
|
||
+ (void)0; /* terminate potentially open 'else' */
|
||
+ }
|
||
+ else
|
||
+#endif
|
||
#ifdef BSAES_CAPABLE
|
||
if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE)
|
||
{
|
||
- ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
|
||
+ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);
|
||
dat->block = (block128_f)AES_encrypt;
|
||
dat->stream.ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
|
||
}
|
||
@@ -542,7 +607,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
#ifdef VPAES_CAPABLE
|
||
if (VPAES_CAPABLE)
|
||
{
|
||
- ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
|
||
+ ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);
|
||
dat->block = (block128_f)vpaes_encrypt;
|
||
dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
|
||
(cbc128_f)vpaes_cbc_encrypt :
|
||
@@ -551,7 +616,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
else
|
||
#endif
|
||
{
|
||
- ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
|
||
+ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);
|
||
dat->block = (block128_f)AES_encrypt;
|
||
dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
|
||
(cbc128_f)AES_cbc_encrypt :
|
||
@@ -822,10 +887,25 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
return 1;
|
||
if (key)
|
||
{ do {
|
||
+#ifdef HWAES_CAPABLE
|
||
+ if (HWAES_CAPABLE)
|
||
+ {
|
||
+ HWAES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks);
|
||
+ CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
|
||
+ (block128_f)HWAES_encrypt);
|
||
+#ifdef HWAES_ctr32_encrypt_blocks
|
||
+ gctx->ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks;
|
||
+#else
|
||
+ gctx->ctr = NULL;
|
||
+#endif
|
||
+ break;
|
||
+ }
|
||
+ else
|
||
+#endif
|
||
#ifdef BSAES_CAPABLE
|
||
if (BSAES_CAPABLE)
|
||
{
|
||
- AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
|
||
+ AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks);
|
||
CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
|
||
(block128_f)AES_encrypt);
|
||
gctx->ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
|
||
@@ -836,7 +916,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
#ifdef VPAES_CAPABLE
|
||
if (VPAES_CAPABLE)
|
||
{
|
||
- vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
|
||
+ vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks);
|
||
CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
|
||
(block128_f)vpaes_encrypt);
|
||
gctx->ctr = NULL;
|
||
@@ -846,7 +926,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
#endif
|
||
(void)0; /* terminate potentially open 'else' */
|
||
|
||
- AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
|
||
+ AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
|
||
CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt);
|
||
#ifdef AES_CTR_ASM
|
||
gctx->ctr = (ctr128_f)AES_ctr32_encrypt;
|
||
@@ -1067,6 +1147,29 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
xctx->stream = NULL;
|
||
#endif
|
||
/* key_len is two AES keys */
|
||
+#ifdef HWAES_CAPABLE
|
||
+ if (HWAES_CAPABLE)
|
||
+ {
|
||
+ if (enc)
|
||
+ {
|
||
+ HWAES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
|
||
+ xctx->xts.block1 = (block128_f)HWAES_encrypt;
|
||
+ }
|
||
+ else
|
||
+ {
|
||
+ HWAES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
|
||
+ xctx->xts.block1 = (block128_f)HWAES_decrypt;
|
||
+ }
|
||
+
|
||
+ HWAES_set_encrypt_key(key + ctx->key_len/2,
|
||
+ ctx->key_len * 4, &xctx->ks2.ks);
|
||
+ xctx->xts.block2 = (block128_f)HWAES_encrypt;
|
||
+
|
||
+ xctx->xts.key1 = &xctx->ks1;
|
||
+ break;
|
||
+ }
|
||
+ else
|
||
+#endif
|
||
#ifdef BSAES_CAPABLE
|
||
if (BSAES_CAPABLE)
|
||
xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
|
||
@@ -1077,17 +1180,17 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
{
|
||
if (enc)
|
||
{
|
||
- vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
|
||
+ vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
|
||
xctx->xts.block1 = (block128_f)vpaes_encrypt;
|
||
}
|
||
else
|
||
{
|
||
- vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
|
||
+ vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
|
||
xctx->xts.block1 = (block128_f)vpaes_decrypt;
|
||
}
|
||
|
||
vpaes_set_encrypt_key(key + ctx->key_len/2,
|
||
- ctx->key_len * 4, &xctx->ks2);
|
||
+ ctx->key_len * 4, &xctx->ks2.ks);
|
||
xctx->xts.block2 = (block128_f)vpaes_encrypt;
|
||
|
||
xctx->xts.key1 = &xctx->ks1;
|
||
@@ -1099,17 +1202,17 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
|
||
if (enc)
|
||
{
|
||
- AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
|
||
+ AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
|
||
xctx->xts.block1 = (block128_f)AES_encrypt;
|
||
}
|
||
else
|
||
{
|
||
- AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
|
||
+ AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
|
||
xctx->xts.block1 = (block128_f)AES_decrypt;
|
||
}
|
||
|
||
AES_set_encrypt_key(key + ctx->key_len/2,
|
||
- ctx->key_len * 4, &xctx->ks2);
|
||
+ ctx->key_len * 4, &xctx->ks2.ks);
|
||
xctx->xts.block2 = (block128_f)AES_encrypt;
|
||
|
||
xctx->xts.key1 = &xctx->ks1;
|
||
@@ -1217,10 +1320,23 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
return 1;
|
||
if (key) do
|
||
{
|
||
+#ifdef HWAES_CAPABLE
|
||
+ if (HWAES_CAPABLE)
|
||
+ {
|
||
+ HWAES_set_encrypt_key(key,ctx->key_len*8,&cctx->ks.ks);
|
||
+
|
||
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
|
||
+ &cctx->ks, (block128_f)HWAES_encrypt);
|
||
+ cctx->str = NULL;
|
||
+ cctx->key_set = 1;
|
||
+ break;
|
||
+ }
|
||
+ else
|
||
+#endif
|
||
#ifdef VPAES_CAPABLE
|
||
if (VPAES_CAPABLE)
|
||
{
|
||
- vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks);
|
||
+ vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks.ks);
|
||
CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
|
||
&cctx->ks, (block128_f)vpaes_encrypt);
|
||
cctx->str = NULL;
|
||
@@ -1228,7 +1344,7 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
|
||
break;
|
||
}
|
||
#endif
|
||
- AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
|
||
+ AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
|
||
CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
|
||
&cctx->ks, (block128_f)AES_encrypt);
|
||
cctx->str = NULL;
|
||
diff --git a/crypto/modes/Makefile b/crypto/modes/Makefile
|
||
index 3d8bafd..9bcfa0e 100644
|
||
--- a/crypto/modes/Makefile
|
||
+++ b/crypto/modes/Makefile
|
||
@@ -56,14 +56,16 @@ ghash-alpha.s: asm/ghash-alpha.pl
|
||
(preproc=/tmp/$$$$.$@; trap "rm $$preproc" INT; \
|
||
$(PERL) asm/ghash-alpha.pl > $$preproc && \
|
||
$(CC) -E $$preproc > $@ && rm $$preproc)
|
||
-
|
||
ghash-parisc.s: asm/ghash-parisc.pl
|
||
$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
|
||
+ghashv8-armx.S: asm/ghashv8-armx.pl
|
||
+ $(PERL) asm/ghashv8-armx.pl $(PERLASM_SCHEME) $@
|
||
|
||
# GNU make "catch all"
|
||
ghash-%.S: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
|
||
|
||
ghash-armv4.o: ghash-armv4.S
|
||
+ghashv8-armx.o: ghashv8-armx.S
|
||
|
||
files:
|
||
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
|
||
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
|
||
index d91586e..0023bf9 100644
|
||
--- a/crypto/modes/asm/ghash-armv4.pl
|
||
+++ b/crypto/modes/asm/ghash-armv4.pl
|
||
@@ -35,6 +35,20 @@
|
||
# Add NEON implementation featuring polynomial multiplication, i.e. no
|
||
# lookup tables involved. On Cortex A8 it was measured to process one
|
||
# byte in 15 cycles or 55% faster than integer-only code.
|
||
+#
|
||
+# April 2014
|
||
+#
|
||
+# Switch to multiplication algorithm suggested in paper referred
|
||
+# below and combine it with reduction algorithm from x86 module.
|
||
+# Performance improvement over previous version varies from 65% on
|
||
+# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
|
||
+# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
|
||
+# in 9.33.
|
||
+#
|
||
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
|
||
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
|
||
+#
|
||
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
|
||
|
||
# ====================================================================
|
||
# Note about "528B" variant. In ARM case it makes lesser sense to
|
||
@@ -303,117 +317,160 @@ $code.=<<___;
|
||
.size gcm_gmult_4bit,.-gcm_gmult_4bit
|
||
___
|
||
{
|
||
-my $cnt=$Htbl; # $Htbl is used once in the very beginning
|
||
-
|
||
-my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
|
||
-my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
|
||
-
|
||
-# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
|
||
-# in Zo. Or should I say "top bit", because GHASH is specified in
|
||
-# reverse bit order? Otherwise straightforward 128-bt H by one input
|
||
-# byte multiplication and modulo-reduction, times 16.
|
||
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
|
||
+my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
|
||
+my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
|
||
|
||
-sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
|
||
-sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
|
||
-sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
|
||
+sub clmul64x64 {
|
||
+my ($r,$a,$b)=@_;
|
||
+$code.=<<___;
|
||
+ vext.8 $t0#lo, $a, $a, #1 @ A1
|
||
+ vmull.p8 $t0, $t0#lo, $b @ F = A1*B
|
||
+ vext.8 $r#lo, $b, $b, #1 @ B1
|
||
+ vmull.p8 $r, $a, $r#lo @ E = A*B1
|
||
+ vext.8 $t1#lo, $a, $a, #2 @ A2
|
||
+ vmull.p8 $t1, $t1#lo, $b @ H = A2*B
|
||
+ vext.8 $t3#lo, $b, $b, #2 @ B2
|
||
+ vmull.p8 $t3, $a, $t3#lo @ G = A*B2
|
||
+ vext.8 $t2#lo, $a, $a, #3 @ A3
|
||
+ veor $t0, $t0, $r @ L = E + F
|
||
+ vmull.p8 $t2, $t2#lo, $b @ J = A3*B
|
||
+ vext.8 $r#lo, $b, $b, #3 @ B3
|
||
+ veor $t1, $t1, $t3 @ M = G + H
|
||
+ vmull.p8 $r, $a, $r#lo @ I = A*B3
|
||
+ veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
|
||
+ vand $t0#hi, $t0#hi, $k48
|
||
+ vext.8 $t3#lo, $b, $b, #4 @ B4
|
||
+ veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
|
||
+ vand $t1#hi, $t1#hi, $k32
|
||
+ vmull.p8 $t3, $a, $t3#lo @ K = A*B4
|
||
+ veor $t2, $t2, $r @ N = I + J
|
||
+ veor $t0#lo, $t0#lo, $t0#hi
|
||
+ veor $t1#lo, $t1#lo, $t1#hi
|
||
+ veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
|
||
+ vand $t2#hi, $t2#hi, $k16
|
||
+ vext.8 $t0, $t0, $t0, #15
|
||
+ veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
|
||
+ vmov.i64 $t3#hi, #0
|
||
+ vext.8 $t1, $t1, $t1, #14
|
||
+ veor $t2#lo, $t2#lo, $t2#hi
|
||
+ vmull.p8 $r, $a, $b @ D = A*B
|
||
+ vext.8 $t3, $t3, $t3, #12
|
||
+ vext.8 $t2, $t2, $t2, #13
|
||
+ veor $t0, $t0, $t1
|
||
+ veor $t2, $t2, $t3
|
||
+ veor $r, $r, $t0
|
||
+ veor $r, $r, $t2
|
||
+___
|
||
+}
|
||
|
||
$code.=<<___;
|
||
#if __ARM_ARCH__>=7
|
||
.fpu neon
|
||
|
||
+.global gcm_init_neon
|
||
+.type gcm_init_neon,%function
|
||
+.align 4
|
||
+gcm_init_neon:
|
||
+ vld1.64 $IN#hi,[r1,:64]! @ load H
|
||
+ vmov.i8 $t0,#0xe1
|
||
+ vld1.64 $IN#lo,[r1,:64]
|
||
+ vshl.i64 $t0#hi,#57
|
||
+ vshr.u64 $t0#lo,#63 @ t0=0xc2....01
|
||
+ vdup.8 $t1,$IN#hi[7]
|
||
+ vshr.u64 $Hlo,$IN#lo,#63
|
||
+ vshr.s8 $t1,#7 @ broadcast carry bit
|
||
+ vshl.i64 $IN,$IN,#1
|
||
+ vand $t0,$t0,$t1
|
||
+ vorr $IN#hi,$Hlo @ H<<<=1
|
||
+ veor $IN,$IN,$t0 @ twisted H
|
||
+ vstmia r0,{$IN}
|
||
+
|
||
+ ret @ bx lr
|
||
+.size gcm_init_neon,.-gcm_init_neon
|
||
+
|
||
.global gcm_gmult_neon
|
||
.type gcm_gmult_neon,%function
|
||
.align 4
|
||
gcm_gmult_neon:
|
||
- sub $Htbl,#16 @ point at H in GCM128_CTX
|
||
- vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
|
||
- vmov.i32 $mod,#0xe1 @ our irreducible polynomial
|
||
- vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
|
||
- vshr.u64 $mod,#32
|
||
- vldmia $Htbl,{$Hhi-$Hlo} @ load H
|
||
- veor $zero,$zero
|
||
+ vld1.64 $IN#hi,[$Xi,:64]! @ load Xi
|
||
+ vld1.64 $IN#lo,[$Xi,:64]!
|
||
+ vmov.i64 $k48,#0x0000ffffffffffff
|
||
+ vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
|
||
+ vmov.i64 $k32,#0x00000000ffffffff
|
||
#ifdef __ARMEL__
|
||
vrev64.8 $IN,$IN
|
||
#endif
|
||
- veor $Qpost,$Qpost
|
||
- veor $R,$R
|
||
- mov $cnt,#16
|
||
- veor $Z,$Z
|
||
+ vmov.i64 $k16,#0x000000000000ffff
|
||
+ veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
|
||
mov $len,#16
|
||
- veor $Zo,$Zo
|
||
- vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
|
||
- b .Linner_neon
|
||
+ b .Lgmult_neon
|
||
.size gcm_gmult_neon,.-gcm_gmult_neon
|
||
|
||
.global gcm_ghash_neon
|
||
.type gcm_ghash_neon,%function
|
||
.align 4
|
||
gcm_ghash_neon:
|
||
- vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
|
||
- vmov.i32 $mod,#0xe1 @ our irreducible polynomial
|
||
- vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
|
||
- vshr.u64 $mod,#32
|
||
- vldmia $Xi,{$Hhi-$Hlo} @ load H
|
||
- veor $zero,$zero
|
||
- nop
|
||
+ vld1.64 $Xl#hi,[$Xi,:64]! @ load Xi
|
||
+ vld1.64 $Xl#lo,[$Xi,:64]!
|
||
+ vmov.i64 $k48,#0x0000ffffffffffff
|
||
+ vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
|
||
+ vmov.i64 $k32,#0x00000000ffffffff
|
||
#ifdef __ARMEL__
|
||
- vrev64.8 $Z,$Z
|
||
+ vrev64.8 $Xl,$Xl
|
||
#endif
|
||
-.Louter_neon:
|
||
- vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
|
||
- veor $Qpost,$Qpost
|
||
- vld1.64 `&Dlo($IN)`,[$inp]!
|
||
- veor $R,$R
|
||
- mov $cnt,#16
|
||
+ vmov.i64 $k16,#0x000000000000ffff
|
||
+ veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
|
||
+
|
||
+.Loop_neon:
|
||
+ vld1.64 $IN#hi,[$inp]! @ load inp
|
||
+ vld1.64 $IN#lo,[$inp]!
|
||
#ifdef __ARMEL__
|
||
vrev64.8 $IN,$IN
|
||
#endif
|
||
- veor $Zo,$Zo
|
||
- veor $IN,$Z @ inp^=Xi
|
||
- veor $Z,$Z
|
||
- vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
|
||
-.Linner_neon:
|
||
- subs $cnt,$cnt,#1
|
||
- vmull.p8 $Qlo,$Hlo,$xi @ H.lo<6C>Xi[i]
|
||
- vmull.p8 $Qhi,$Hhi,$xi @ H.hi<68>Xi[i]
|
||
- vext.8 $IN,$zero,#1 @ IN>>=8
|
||
-
|
||
- veor $Z,$Qpost @ modulo-scheduled part
|
||
- vshl.i64 `&Dlo("$R")`,#48
|
||
- vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
|
||
- veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
|
||
-
|
||
- veor `&Dhi("$Z")`,`&Dlo("$R")`
|
||
- vuzp.8 $Qlo,$Qhi
|
||
- vsli.8 $Zo,$T,#1 @ compose the "carry" byte
|
||
- vext.8 $Z,$zero,#1 @ Z>>=8
|
||
-
|
||
- vmull.p8 $R,$Zo,$mod @ "carry"<22>0xe1
|
||
- vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
|
||
- vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
|
||
- veor $Z,$Qhi
|
||
- bne .Linner_neon
|
||
-
|
||
- veor $Z,$Qpost @ modulo-scheduled artefact
|
||
- vshl.i64 `&Dlo("$R")`,#48
|
||
- veor `&Dhi("$Z")`,`&Dlo("$R")`
|
||
-
|
||
- @ finalization, normalize Z:Zo
|
||
- vand $Zo,$mod @ suffices to mask the bit
|
||
- vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
|
||
- vshl.i64 $Z,#1
|
||
+ veor $IN,$Xl @ inp^=Xi
|
||
+.Lgmult_neon:
|
||
+___
|
||
+ &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
|
||
+$code.=<<___;
|
||
+ veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
|
||
+___
|
||
+ &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
|
||
+ &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
|
||
+$code.=<<___;
|
||
+ veor $Xm,$Xm,$Xl @ Karatsuba post-processing
|
||
+ veor $Xm,$Xm,$Xh
|
||
+ veor $Xl#hi,$Xl#hi,$Xm#lo
|
||
+ veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
|
||
+
|
||
+ @ equivalent of reduction_avx from ghash-x86_64.pl
|
||
+ vshl.i64 $t1,$Xl,#57 @ 1st phase
|
||
+ vshl.i64 $t2,$Xl,#62
|
||
+ veor $t2,$t2,$t1 @
|
||
+ vshl.i64 $t1,$Xl,#63
|
||
+ veor $t2, $t2, $t1 @
|
||
+ veor $Xl#hi,$Xl#hi,$t2#lo @
|
||
+ veor $Xh#lo,$Xh#lo,$t2#hi
|
||
+
|
||
+ vshr.u64 $t2,$Xl,#1 @ 2nd phase
|
||
+ veor $Xh,$Xh,$Xl
|
||
+ veor $Xl,$Xl,$t2 @
|
||
+ vshr.u64 $t2,$t2,#6
|
||
+ vshr.u64 $Xl,$Xl,#1 @
|
||
+ veor $Xl,$Xl,$Xh @
|
||
+ veor $Xl,$Xl,$t2 @
|
||
+
|
||
subs $len,#16
|
||
- vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
|
||
- bne .Louter_neon
|
||
+ bne .Loop_neon
|
||
|
||
#ifdef __ARMEL__
|
||
- vrev64.8 $Z,$Z
|
||
+ vrev64.8 $Xl,$Xl
|
||
#endif
|
||
sub $Xi,#16
|
||
- vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
|
||
- vst1.64 `&Dlo("$Z")`,[$Xi,:64]
|
||
+ vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi
|
||
+ vst1.64 $Xl#lo,[$Xi,:64]
|
||
|
||
- bx lr
|
||
+ ret @ bx lr
|
||
.size gcm_ghash_neon,.-gcm_ghash_neon
|
||
#endif
|
||
___
|
||
@@ -423,7 +480,13 @@ $code.=<<___;
|
||
.align 2
|
||
___
|
||
|
||
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
|
||
-print $code;
|
||
+foreach (split("\n",$code)) {
|
||
+ s/\`([^\`]*)\`/eval $1/geo;
|
||
+
|
||
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
|
||
+ s/\bret\b/bx lr/go or
|
||
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
|
||
+
|
||
+ print $_,"\n";
|
||
+}
|
||
close STDOUT; # enforce flush
|
||
diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl
|
||
new file mode 100644
|
||
index 0000000..b24f3d7
|
||
--- /dev/null
|
||
+++ b/crypto/modes/asm/ghashv8-armx.pl
|
||
@@ -0,0 +1,240 @@
|
||
+#!/usr/bin/env perl
|
||
+#
|
||
+# ====================================================================
|
||
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||
+# project. The module is, however, dual licensed under OpenSSL and
|
||
+# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||
+# details see http://www.openssl.org/~appro/cryptogams/.
|
||
+# ====================================================================
|
||
+#
|
||
+# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
|
||
+#
|
||
+# June 2014
|
||
+#
|
||
+# Initial version was developed in tight cooperation with Ard
|
||
+# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
|
||
+# other assembly modules. Just like aesv8-armx.pl this module
|
||
+# supports both AArch32 and AArch64 execution modes.
|
||
+#
|
||
+# Current performance in cycles per processed byte:
|
||
+#
|
||
+# PMULL[2] 32-bit NEON(*)
|
||
+# Apple A7 1.76 5.62
|
||
+# Cortex-A5x n/a n/a
|
||
+#
|
||
+# (*) presented for reference/comparison purposes;
|
||
+
|
||
+$flavour = shift;
|
||
+open STDOUT,">".shift;
|
||
+
|
||
+$Xi="x0"; # argument block
|
||
+$Htbl="x1";
|
||
+$inp="x2";
|
||
+$len="x3";
|
||
+
|
||
+$inc="x12";
|
||
+
|
||
+{
|
||
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
|
||
+my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14));
|
||
+
|
||
+$code=<<___;
|
||
+#include "arm_arch.h"
|
||
+
|
||
+.text
|
||
+___
|
||
+$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
|
||
+$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
|
||
+
|
||
+$code.=<<___;
|
||
+.global gcm_init_v8
|
||
+.type gcm_init_v8,%function
|
||
+.align 4
|
||
+gcm_init_v8:
|
||
+ vld1.64 {$t1},[x1] @ load H
|
||
+ vmov.i8 $t0,#0xe1
|
||
+ vext.8 $IN,$t1,$t1,#8
|
||
+ vshl.i64 $t0,$t0,#57
|
||
+ vshr.u64 $t2,$t0,#63
|
||
+ vext.8 $t0,$t2,$t0,#8 @ t0=0xc2....01
|
||
+ vdup.32 $t1,${t1}[1]
|
||
+ vshr.u64 $t3,$IN,#63
|
||
+ vshr.s32 $t1,$t1,#31 @ broadcast carry bit
|
||
+ vand $t3,$t3,$t0
|
||
+ vshl.i64 $IN,$IN,#1
|
||
+ vext.8 $t3,$t3,$t3,#8
|
||
+ vand $t0,$t0,$t1
|
||
+ vorr $IN,$IN,$t3 @ H<<<=1
|
||
+ veor $IN,$IN,$t0 @ twisted H
|
||
+ vst1.64 {$IN},[x0]
|
||
+
|
||
+ ret
|
||
+.size gcm_init_v8,.-gcm_init_v8
|
||
+
|
||
+.global gcm_gmult_v8
|
||
+.type gcm_gmult_v8,%function
|
||
+.align 4
|
||
+gcm_gmult_v8:
|
||
+ vld1.64 {$t1},[$Xi] @ load Xi
|
||
+ vmov.i8 $t3,#0xe1
|
||
+ vld1.64 {$H},[$Htbl] @ load twisted H
|
||
+ vshl.u64 $t3,$t3,#57
|
||
+#ifndef __ARMEB__
|
||
+ vrev64.8 $t1,$t1
|
||
+#endif
|
||
+ vext.8 $Hhl,$H,$H,#8
|
||
+ mov $len,#0
|
||
+ vext.8 $IN,$t1,$t1,#8
|
||
+ mov $inc,#0
|
||
+ veor $Hhl,$Hhl,$H @ Karatsuba pre-processing
|
||
+ mov $inp,$Xi
|
||
+ b .Lgmult_v8
|
||
+.size gcm_gmult_v8,.-gcm_gmult_v8
|
||
+
|
||
+.global gcm_ghash_v8
|
||
+.type gcm_ghash_v8,%function
|
||
+.align 4
|
||
+gcm_ghash_v8:
|
||
+ vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
|
||
+ subs $len,$len,#16
|
||
+ vmov.i8 $t3,#0xe1
|
||
+ mov $inc,#16
|
||
+ vld1.64 {$H},[$Htbl] @ load twisted H
|
||
+ cclr $inc,eq
|
||
+ vext.8 $Xl,$Xl,$Xl,#8
|
||
+ vshl.u64 $t3,$t3,#57
|
||
+ vld1.64 {$t1},[$inp],$inc @ load [rotated] inp
|
||
+ vext.8 $Hhl,$H,$H,#8
|
||
+#ifndef __ARMEB__
|
||
+ vrev64.8 $Xl,$Xl
|
||
+ vrev64.8 $t1,$t1
|
||
+#endif
|
||
+ veor $Hhl,$Hhl,$H @ Karatsuba pre-processing
|
||
+ vext.8 $IN,$t1,$t1,#8
|
||
+ b .Loop_v8
|
||
+
|
||
+.align 4
|
||
+.Loop_v8:
|
||
+ vext.8 $t2,$Xl,$Xl,#8
|
||
+ veor $IN,$IN,$Xl @ inp^=Xi
|
||
+ veor $t1,$t1,$t2 @ $t1 is rotated inp^Xi
|
||
+
|
||
+.Lgmult_v8:
|
||
+ vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
|
||
+ veor $t1,$t1,$IN @ Karatsuba pre-processing
|
||
+ vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
|
||
+ subs $len,$len,#16
|
||
+ vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
|
||
+ cclr $inc,eq
|
||
+
|
||
+ vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||
+ veor $t2,$Xl,$Xh
|
||
+ veor $Xm,$Xm,$t1
|
||
+ vld1.64 {$t1},[$inp],$inc @ load [rotated] inp
|
||
+ veor $Xm,$Xm,$t2
|
||
+ vpmull.p64 $t2,$Xl,$t3 @ 1st phase
|
||
+
|
||
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||
+#ifndef __ARMEB__
|
||
+ vrev64.8 $t1,$t1
|
||
+#endif
|
||
+ veor $Xl,$Xm,$t2
|
||
+ vext.8 $IN,$t1,$t1,#8
|
||
+
|
||
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
|
||
+ vpmull.p64 $Xl,$Xl,$t3
|
||
+ veor $t2,$t2,$Xh
|
||
+ veor $Xl,$Xl,$t2
|
||
+ b.hs .Loop_v8
|
||
+
|
||
+#ifndef __ARMEB__
|
||
+ vrev64.8 $Xl,$Xl
|
||
+#endif
|
||
+ vext.8 $Xl,$Xl,$Xl,#8
|
||
+ vst1.64 {$Xl},[$Xi] @ write out Xi
|
||
+
|
||
+ ret
|
||
+.size gcm_ghash_v8,.-gcm_ghash_v8
|
||
+___
|
||
+}
|
||
+$code.=<<___;
|
||
+.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||
+.align 2
|
||
+___
|
||
+
|
||
+if ($flavour =~ /64/) { ######## 64-bit code
|
||
+ sub unvmov {
|
||
+ my $arg=shift;
|
||
+
|
||
+ $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
|
||
+ sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
|
||
+ }
|
||
+ foreach(split("\n",$code)) {
|
||
+ s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
|
||
+ s/vmov\.i8/movi/o or # fix up legacy mnemonics
|
||
+ s/vmov\s+(.*)/unvmov($1)/geo or
|
||
+ s/vext\.8/ext/o or
|
||
+ s/vshr\.s/sshr\.s/o or
|
||
+ s/vshr/ushr/o or
|
||
+ s/^(\s+)v/$1/o or # strip off v prefix
|
||
+ s/\bbx\s+lr\b/ret/o;
|
||
+
|
||
+ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
|
||
+ s/@\s/\/\//o; # old->new style commentary
|
||
+
|
||
+ # fix up remaining legacy suffixes
|
||
+ s/\.[ui]?8(\s)/$1/o;
|
||
+ s/\.[uis]?32//o and s/\.16b/\.4s/go;
|
||
+ m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
|
||
+ m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
|
||
+ s/\.[uisp]?64//o and s/\.16b/\.2d/go;
|
||
+ s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
|
||
+
|
||
+ print $_,"\n";
|
||
+ }
|
||
+} else { ######## 32-bit code
|
||
+ sub unvdup32 {
|
||
+ my $arg=shift;
|
||
+
|
||
+ $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
|
||
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
|
||
+ }
|
||
+ sub unvpmullp64 {
|
||
+ my ($mnemonic,$arg)=@_;
|
||
+
|
||
+ if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
|
||
+ my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
|
||
+ |(($2&7)<<17)|(($2&8)<<4)
|
||
+ |(($3&7)<<1) |(($3&8)<<2);
|
||
+ $word |= 0x00010001 if ($mnemonic =~ "2");
|
||
+ # since ARMv7 instructions are always encoded little-endian.
|
||
+ # correct solution is to use .inst directive, but older
|
||
+ # assemblers don't implement it:-(
|
||
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
|
||
+ $word&0xff,($word>>8)&0xff,
|
||
+ ($word>>16)&0xff,($word>>24)&0xff,
|
||
+ $mnemonic,$arg;
|
||
+ }
|
||
+ }
|
||
+
|
||
+ foreach(split("\n",$code)) {
|
||
+ s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
|
||
+ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
|
||
+ s/\/\/\s?/@ /o; # new->old style commentary
|
||
+
|
||
+ # fix up remaining new-style suffixes
|
||
+ s/\],#[0-9]+/]!/o;
|
||
+
|
||
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
|
||
+ s/vdup\.32\s+(.*)/unvdup32($1)/geo or
|
||
+ s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
|
||
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
|
||
+ s/^(\s+)b\./$1b/o or
|
||
+ s/^(\s+)ret/$1bx\tlr/o;
|
||
+
|
||
+ print $_,"\n";
|
||
+ }
|
||
+}
|
||
+
|
||
+close STDOUT; # enforce flush
|
||
diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c
|
||
index e1dc2b0..79ebb66 100644
|
||
--- a/crypto/modes/gcm128.c
|
||
+++ b/crypto/modes/gcm128.c
|
||
@@ -642,7 +642,7 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
|
||
|
||
#endif
|
||
|
||
-#if TABLE_BITS==4 && defined(GHASH_ASM)
|
||
+#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
|
||
# if !defined(I386_ONLY) && \
|
||
(defined(__i386) || defined(__i386__) || \
|
||
defined(__x86_64) || defined(__x86_64__) || \
|
||
@@ -663,13 +663,21 @@ void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len
|
||
void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
|
||
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
|
||
# endif
|
||
-# elif defined(__arm__) || defined(__arm)
|
||
+# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
|
||
# include "arm_arch.h"
|
||
# if __ARM_ARCH__>=7
|
||
# define GHASH_ASM_ARM
|
||
# define GCM_FUNCREF_4BIT
|
||
+# define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL)
|
||
+# if defined(__arm__) || defined(__arm)
|
||
+# define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
|
||
+# endif
|
||
+void gcm_init_neon(u128 Htable[16],const u64 Xi[2]);
|
||
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
|
||
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
|
||
+void gcm_init_v8(u128 Htable[16],const u64 Xi[2]);
|
||
+void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
|
||
+void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
|
||
# endif
|
||
# endif
|
||
#endif
|
||
@@ -739,10 +747,21 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
|
||
ctx->ghash = gcm_ghash_4bit;
|
||
# endif
|
||
# elif defined(GHASH_ASM_ARM)
|
||
- if (OPENSSL_armcap_P & ARMV7_NEON) {
|
||
+# ifdef PMULL_CAPABLE
|
||
+ if (PMULL_CAPABLE) {
|
||
+ gcm_init_v8(ctx->Htable,ctx->H.u);
|
||
+ ctx->gmult = gcm_gmult_v8;
|
||
+ ctx->ghash = gcm_ghash_v8;
|
||
+ } else
|
||
+# endif
|
||
+# ifdef NEON_CAPABLE
|
||
+ if (NEON_CAPABLE) {
|
||
+ gcm_init_neon(ctx->Htable,ctx->H.u);
|
||
ctx->gmult = gcm_gmult_neon;
|
||
ctx->ghash = gcm_ghash_neon;
|
||
- } else {
|
||
+ } else
|
||
+# endif
|
||
+ {
|
||
gcm_init_4bit(ctx->Htable,ctx->H.u);
|
||
ctx->gmult = gcm_gmult_4bit;
|
||
ctx->ghash = gcm_ghash_4bit;
|
||
diff --git a/crypto/sha/Makefile b/crypto/sha/Makefile
|
||
index 2eb2b7a..6ef027d 100644
|
||
--- a/crypto/sha/Makefile
|
||
+++ b/crypto/sha/Makefile
|
||
@@ -92,6 +92,9 @@ sha512-%.S: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
|
||
sha1-armv4-large.o: sha1-armv4-large.S
|
||
sha256-armv4.o: sha256-armv4.S
|
||
sha512-armv4.o: sha512-armv4.S
|
||
+sha1-armv8.o: sha1-armv8.S
|
||
+sha256-armv8.o: sha256-armv8.S
|
||
+sha512-armv8.o: sha512-armv8.S
|
||
|
||
files:
|
||
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
|
||
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
|
||
index 33da3e0..50bd07b 100644
|
||
--- a/crypto/sha/asm/sha1-armv4-large.pl
|
||
+++ b/crypto/sha/asm/sha1-armv4-large.pl
|
||
@@ -1,7 +1,7 @@
|
||
#!/usr/bin/env perl
|
||
|
||
# ====================================================================
|
||
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||
# project. The module is, however, dual licensed under OpenSSL and
|
||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||
@@ -52,6 +52,20 @@
|
||
# Profiler-assisted and platform-specific optimization resulted in 10%
|
||
# improvement on Cortex A8 core and 12.2 cycles per byte.
|
||
|
||
+# September 2013.
|
||
+#
|
||
+# Add NEON implementation (see sha1-586.pl for background info). On
|
||
+# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
|
||
+# faster than integer-only code. Because [fully unrolled] NEON code
|
||
+# is ~2.5x larger and there are some redundant instructions executed
|
||
+# when processing last block, improvement is not as big for smallest
|
||
+# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
|
||
+# byte, which is also >80% faster than integer-only code.
|
||
+
|
||
+# May 2014.
|
||
+#
|
||
+# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
|
||
+
|
||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||
open STDOUT,">$output";
|
||
|
||
@@ -153,12 +167,22 @@ $code=<<___;
|
||
#include "arm_arch.h"
|
||
|
||
.text
|
||
+.code 32
|
||
|
||
.global sha1_block_data_order
|
||
.type sha1_block_data_order,%function
|
||
|
||
-.align 2
|
||
+.align 5
|
||
sha1_block_data_order:
|
||
+#if __ARM_ARCH__>=7
|
||
+ sub r3,pc,#8 @ sha1_block_data_order
|
||
+ ldr r12,.LOPENSSL_armcap
|
||
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
|
||
+ tst r12,#ARMV8_SHA1
|
||
+ bne .LARMv8
|
||
+ tst r12,#ARMV7_NEON
|
||
+ bne .LNEON
|
||
+#endif
|
||
stmdb sp!,{r4-r12,lr}
|
||
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
|
||
ldmia $ctx,{$a,$b,$c,$d,$e}
|
||
@@ -233,16 +257,422 @@ $code.=<<___;
|
||
moveq pc,lr @ be binary compatible with V4, yet
|
||
bx lr @ interoperable with Thumb ISA:-)
|
||
#endif
|
||
-.align 2
|
||
+.size sha1_block_data_order,.-sha1_block_data_order
|
||
+
|
||
+.align 5
|
||
.LK_00_19: .word 0x5a827999
|
||
.LK_20_39: .word 0x6ed9eba1
|
||
.LK_40_59: .word 0x8f1bbcdc
|
||
.LK_60_79: .word 0xca62c1d6
|
||
-.size sha1_block_data_order,.-sha1_block_data_order
|
||
-.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
|
||
-.align 2
|
||
+.LOPENSSL_armcap:
|
||
+.word OPENSSL_armcap_P-sha1_block_data_order
|
||
+.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||
+.align 5
|
||
+___
|
||
+#####################################################################
|
||
+# NEON stuff
|
||
+#
|
||
+{{{
|
||
+my @V=($a,$b,$c,$d,$e);
|
||
+my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
|
||
+my $Xi=4;
|
||
+my @X=map("q$_",(8..11,0..3));
|
||
+my @Tx=("q12","q13");
|
||
+my ($K,$zero)=("q14","q15");
|
||
+my $j=0;
|
||
+
|
||
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
|
||
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
|
||
+ my $arg = pop;
|
||
+ $arg = "#$arg" if ($arg*1 eq $arg);
|
||
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
|
||
+}
|
||
+
|
||
+sub body_00_19 () {
|
||
+ (
|
||
+ '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
|
||
+ '&bic ($t0,$d,$b)',
|
||
+ '&add ($e,$e,$Ki)', # e+=X[i]+K
|
||
+ '&and ($t1,$c,$b)',
|
||
+ '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
|
||
+ '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
|
||
+ '&eor ($t1,$t1,$t0)', # F_00_19
|
||
+ '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
|
||
+ '&add ($e,$e,$t1);'. # e+=F_00_19
|
||
+ '$j++; unshift(@V,pop(@V));'
|
||
+ )
|
||
+}
|
||
+sub body_20_39 () {
|
||
+ (
|
||
+ '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
|
||
+ '&eor ($t0,$b,$d)',
|
||
+ '&add ($e,$e,$Ki)', # e+=X[i]+K
|
||
+ '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
|
||
+ '&eor ($t1,$t0,$c)', # F_20_39
|
||
+ '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
|
||
+ '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
|
||
+ '&add ($e,$e,$t1);'. # e+=F_20_39
|
||
+ '$j++; unshift(@V,pop(@V));'
|
||
+ )
|
||
+}
|
||
+sub body_40_59 () {
|
||
+ (
|
||
+ '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
|
||
+ '&add ($e,$e,$Ki)', # e+=X[i]+K
|
||
+ '&and ($t0,$c,$d)',
|
||
+ '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
|
||
+ '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
|
||
+ '&eor ($t1,$c,$d)',
|
||
+ '&add ($e,$e,$t0)',
|
||
+ '&and ($t1,$t1,$b)',
|
||
+ '&mov ($b,$b,"ror#2")', # b=ROR(b,2)
|
||
+ '&add ($e,$e,$t1);'. # e+=F_40_59
|
||
+ '$j++; unshift(@V,pop(@V));'
|
||
+ )
|
||
+}
|
||
+
|
||
+sub Xupdate_16_31 ()
|
||
+{ use integer;
|
||
+ my $body = shift;
|
||
+ my @insns = (&$body,&$body,&$body,&$body);
|
||
+ my ($a,$b,$c,$d,$e);
|
||
+
|
||
+ &vext_8 (@X[0],@X[-4&7],@X[-3&7],8); # compose "X[-14]" in "X[0]"
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vadd_i32 (@Tx[1],@X[-1&7],$K);
|
||
+ eval(shift(@insns));
|
||
+ &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
|
||
+ eval(shift(@insns));
|
||
+ &vext_8 (@Tx[0],@X[-1&7],$zero,4); # "X[-3]", 3 words
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &veor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &veor (@Tx[0],@Tx[0],@X[0]); # "X[0]"^="X[-3]"^"X[-8]"
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
|
||
+ &sub ($Xfer,$Xfer,64) if ($Xi%4==0);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vext_8 (@Tx[1],$zero,@Tx[0],4); # "X[0]"<<96, extract one dword
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vadd_i32 (@X[0],@Tx[0],@Tx[0]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vsri_32 (@X[0],@Tx[0],31); # "X[0]"<<<=1
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vshr_u32 (@Tx[0],@Tx[1],30);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vshl_u32 (@Tx[1],@Tx[1],2);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &veor (@X[0],@X[0],@Tx[0]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &veor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
|
||
+
|
||
+ foreach (@insns) { eval; } # remaining instructions [if any]
|
||
+
|
||
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
|
||
+}
|
||
+
|
||
+sub Xupdate_32_79 ()
|
||
+{ use integer;
|
||
+ my $body = shift;
|
||
+ my @insns = (&$body,&$body,&$body,&$body);
|
||
+ my ($a,$b,$c,$d,$e);
|
||
+
|
||
+ &vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8); # compose "X[-6]"
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &veor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vadd_i32 (@Tx[1],@X[-1&7],$K);
|
||
+ eval(shift(@insns));
|
||
+ &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
|
||
+ eval(shift(@insns));
|
||
+ &veor (@Tx[0],@Tx[0],@X[0]); # "X[-6]"^="X[0]"
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vshr_u32 (@X[0],@Tx[0],30);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
|
||
+ &sub ($Xfer,$Xfer,64) if ($Xi%4==0);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vsli_32 (@X[0],@Tx[0],2); # "X[0]"="X[-6]"<<<2
|
||
+
|
||
+ foreach (@insns) { eval; } # remaining instructions [if any]
|
||
+
|
||
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
|
||
+}
|
||
+
|
||
+sub Xuplast_80 ()
|
||
+{ use integer;
|
||
+ my $body = shift;
|
||
+ my @insns = (&$body,&$body,&$body,&$body);
|
||
+ my ($a,$b,$c,$d,$e);
|
||
+
|
||
+ &vadd_i32 (@Tx[1],@X[-1&7],$K);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!");
|
||
+ &sub ($Xfer,$Xfer,64);
|
||
+
|
||
+ &teq ($inp,$len);
|
||
+ &sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
|
||
+ &subeq ($inp,$inp,64); # reload last block to avoid SEGV
|
||
+ &vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!");
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!"); # load K_00_19
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vrev32_8 (@X[-4&7],@X[-4&7]);
|
||
+
|
||
+ foreach (@insns) { eval; } # remaining instructions
|
||
+
|
||
+ $Xi=0;
|
||
+}
|
||
+
|
||
+sub Xloop()
|
||
+{ use integer;
|
||
+ my $body = shift;
|
||
+ my @insns = (&$body,&$body,&$body,&$body);
|
||
+ my ($a,$b,$c,$d,$e);
|
||
+
|
||
+ &vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU
|
||
+
|
||
+ foreach (@insns) { eval; }
|
||
+
|
||
+ $Xi++;
|
||
+}
|
||
+
|
||
+$code.=<<___;
|
||
+#if __ARM_ARCH__>=7
|
||
+.fpu neon
|
||
+
|
||
+.type sha1_block_data_order_neon,%function
|
||
+.align 4
|
||
+sha1_block_data_order_neon:
|
||
+.LNEON:
|
||
+ stmdb sp!,{r4-r12,lr}
|
||
+ add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
|
||
+ @ dmb @ errata #451034 on early Cortex A8
|
||
+ @ vstmdb sp!,{d8-d15} @ ABI specification says so
|
||
+ mov $saved_sp,sp
|
||
+ sub sp,sp,#64 @ alloca
|
||
+ adr $K_XX_XX,.LK_00_19
|
||
+ bic sp,sp,#15 @ align for 128-bit stores
|
||
+
|
||
+ ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
|
||
+ mov $Xfer,sp
|
||
+
|
||
+ vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
|
||
+ veor $zero,$zero,$zero
|
||
+ vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]!
|
||
+ vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19
|
||
+ vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on
|
||
+ vrev32.8 @X[-3&7],@X[-3&7] @ big-endian...
|
||
+ vrev32.8 @X[-2&7],@X[-2&7]
|
||
+ vadd.i32 @X[0],@X[-4&7],$K
|
||
+ vrev32.8 @X[-1&7],@X[-1&7]
|
||
+ vadd.i32 @X[1],@X[-3&7],$K
|
||
+ vst1.32 {@X[0]},[$Xfer,:128]!
|
||
+ vadd.i32 @X[2],@X[-2&7],$K
|
||
+ vst1.32 {@X[1]},[$Xfer,:128]!
|
||
+ vst1.32 {@X[2]},[$Xfer,:128]!
|
||
+ ldr $Ki,[sp] @ big RAW stall
|
||
+
|
||
+.Loop_neon:
|
||
+___
|
||
+ &Xupdate_16_31(\&body_00_19);
|
||
+ &Xupdate_16_31(\&body_00_19);
|
||
+ &Xupdate_16_31(\&body_00_19);
|
||
+ &Xupdate_16_31(\&body_00_19);
|
||
+ &Xupdate_32_79(\&body_00_19);
|
||
+ &Xupdate_32_79(\&body_20_39);
|
||
+ &Xupdate_32_79(\&body_20_39);
|
||
+ &Xupdate_32_79(\&body_20_39);
|
||
+ &Xupdate_32_79(\&body_20_39);
|
||
+ &Xupdate_32_79(\&body_20_39);
|
||
+ &Xupdate_32_79(\&body_40_59);
|
||
+ &Xupdate_32_79(\&body_40_59);
|
||
+ &Xupdate_32_79(\&body_40_59);
|
||
+ &Xupdate_32_79(\&body_40_59);
|
||
+ &Xupdate_32_79(\&body_40_59);
|
||
+ &Xupdate_32_79(\&body_20_39);
|
||
+ &Xuplast_80(\&body_20_39);
|
||
+ &Xloop(\&body_20_39);
|
||
+ &Xloop(\&body_20_39);
|
||
+ &Xloop(\&body_20_39);
|
||
+$code.=<<___;
|
||
+ ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context
|
||
+ add $a,$a,$Ki
|
||
+ ldr $Ki,[$ctx,#16]
|
||
+ add $b,$b,$t0
|
||
+ add $c,$c,$t1
|
||
+ add $d,$d,$Xfer
|
||
+ moveq sp,$saved_sp
|
||
+ add $e,$e,$Ki
|
||
+ ldrne $Ki,[sp]
|
||
+ stmia $ctx,{$a,$b,$c,$d,$e}
|
||
+ addne $Xfer,sp,#3*16
|
||
+ bne .Loop_neon
|
||
+
|
||
+ @ vldmia sp!,{d8-d15}
|
||
+ ldmia sp!,{r4-r12,pc}
|
||
+.size sha1_block_data_order_neon,.-sha1_block_data_order_neon
|
||
+#endif
|
||
+___
|
||
+}}}
|
||
+#####################################################################
|
||
+# ARMv8 stuff
|
||
+#
|
||
+{{{
|
||
+my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
|
||
+my @MSG=map("q$_",(4..7));
|
||
+my @Kxx=map("q$_",(8..11));
|
||
+my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
|
||
+
|
||
+$code.=<<___;
|
||
+#if __ARM_ARCH__>=7
|
||
+.type sha1_block_data_order_armv8,%function
|
||
+.align 5
|
||
+sha1_block_data_order_armv8:
|
||
+.LARMv8:
|
||
+ vstmdb sp!,{d8-d15} @ ABI specification says so
|
||
+
|
||
+ veor $E,$E,$E
|
||
+ adr r3,.LK_00_19
|
||
+ vld1.32 {$ABCD},[$ctx]!
|
||
+ vld1.32 {$E\[0]},[$ctx]
|
||
+ sub $ctx,$ctx,#16
|
||
+ vld1.32 {@Kxx[0]\[]},[r3,:32]!
|
||
+ vld1.32 {@Kxx[1]\[]},[r3,:32]!
|
||
+ vld1.32 {@Kxx[2]\[]},[r3,:32]!
|
||
+ vld1.32 {@Kxx[3]\[]},[r3,:32]
|
||
+
|
||
+.Loop_v8:
|
||
+ vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
|
||
+ vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
|
||
+ vrev32.8 @MSG[0],@MSG[0]
|
||
+ vrev32.8 @MSG[1],@MSG[1]
|
||
+
|
||
+ vadd.i32 $W0,@Kxx[0],@MSG[0]
|
||
+ vrev32.8 @MSG[2],@MSG[2]
|
||
+ vmov $ABCD_SAVE,$ABCD @ offload
|
||
+ subs $len,$len,#1
|
||
+
|
||
+ vadd.i32 $W1,@Kxx[0],@MSG[1]
|
||
+ vrev32.8 @MSG[3],@MSG[3]
|
||
+ sha1h $E1,$ABCD @ 0
|
||
+ sha1c $ABCD,$E,$W0
|
||
+ vadd.i32 $W0,@Kxx[$j],@MSG[2]
|
||
+ sha1su0 @MSG[0],@MSG[1],@MSG[2]
|
||
+___
|
||
+for ($j=0,$i=1;$i<20-3;$i++) {
|
||
+my $f=("c","p","m","p")[$i/5];
|
||
+$code.=<<___;
|
||
+ sha1h $E0,$ABCD @ $i
|
||
+ sha1$f $ABCD,$E1,$W1
|
||
+ vadd.i32 $W1,@Kxx[$j],@MSG[3]
|
||
+ sha1su1 @MSG[0],@MSG[3]
|
||
+___
|
||
+$code.=<<___ if ($i<20-4);
|
||
+ sha1su0 @MSG[1],@MSG[2],@MSG[3]
|
||
___
|
||
+ ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
|
||
+ push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
|
||
+}
|
||
+$code.=<<___;
|
||
+ sha1h $E0,$ABCD @ $i
|
||
+ sha1p $ABCD,$E1,$W1
|
||
+ vadd.i32 $W1,@Kxx[$j],@MSG[3]
|
||
+
|
||
+ sha1h $E1,$ABCD @ 18
|
||
+ sha1p $ABCD,$E0,$W0
|
||
+
|
||
+ sha1h $E0,$ABCD @ 19
|
||
+ sha1p $ABCD,$E1,$W1
|
||
+
|
||
+ vadd.i32 $E,$E,$E0
|
||
+ vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
|
||
+ bne .Loop_v8
|
||
+
|
||
+ vst1.32 {$ABCD},[$ctx]!
|
||
+ vst1.32 {$E\[0]},[$ctx]
|
||
+
|
||
+ vldmia sp!,{d8-d15}
|
||
+ ret @ bx lr
|
||
+.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
|
||
+#endif
|
||
+___
|
||
+}}}
|
||
+$code.=<<___;
|
||
+.comm OPENSSL_armcap_P,4,4
|
||
+___
|
||
+
|
||
+{ my %opcode = (
|
||
+ "sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
|
||
+ "sha1m" => 0xf2200c40, "sha1su0" => 0xf2300c40,
|
||
+ "sha1h" => 0xf3b902c0, "sha1su1" => 0xf3ba0380 );
|
||
+
|
||
+ sub unsha1 {
|
||
+ my ($mnemonic,$arg)=@_;
|
||
+
|
||
+ if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
|
||
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|
||
+ |(($2&7)<<17)|(($2&8)<<4)
|
||
+ |(($3&7)<<1) |(($3&8)<<2);
|
||
+ # Emit bytes low-first, since ARMv7 instructions are always encoded little-endian.
|
||
+ # The correct solution is to use the .inst directive, but older
|
||
+ # assemblers don't implement it:-(
|
||
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
|
||
+ $word&0xff,($word>>8)&0xff,
|
||
+ ($word>>16)&0xff,($word>>24)&0xff,
|
||
+ $mnemonic,$arg;
|
||
+ }
|
||
+ }
|
||
+}
|
||
+
|
||
+foreach (split($/,$code)) {
|
||
+ s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo or
|
||
+ s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;
|
||
+
|
||
+ s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
|
||
+
|
||
+ s/\bret\b/bx lr/o or
|
||
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4
|
||
+
|
||
+ print $_,$/;
|
||
+}
|
||
|
||
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
|
||
-print $code;
|
||
close STDOUT; # enforce flush
|
||
diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl
|
||
new file mode 100644
|
||
index 0000000..c1f552b
|
||
--- /dev/null
|
||
+++ b/crypto/sha/asm/sha1-armv8.pl
|
||
@@ -0,0 +1,333 @@
|
||
+#!/usr/bin/env perl
|
||
+#
|
||
+# ====================================================================
|
||
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||
+# project. The module is, however, dual licensed under OpenSSL and
|
||
+# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||
+# details see http://www.openssl.org/~appro/cryptogams/.
|
||
+# ====================================================================
|
||
+#
|
||
+# SHA1 for ARMv8.
|
||
+#
|
||
+# Performance in cycles per processed byte and improvement coefficient
|
||
+# over code generated with "default" compiler:
|
||
+#
|
||
+# hardware-assisted software(*)
|
||
+# Apple A7 2.31 4.13 (+14%)
|
||
+# Cortex-A5x n/a n/a
|
||
+#
|
||
+# (*) Software results are presented mostly for reference purposes.
|
||
+
|
||
+$flavour = shift;
|
||
+open STDOUT,">".shift;
|
||
+
|
||
+($ctx,$inp,$num)=("x0","x1","x2");
|
||
+@Xw=map("w$_",(3..17,19));
|
||
+@Xx=map("x$_",(3..17,19));
|
||
+@V=($A,$B,$C,$D,$E)=map("w$_",(20..24));
|
||
+($t0,$t1,$t2,$K)=map("w$_",(25..28));
|
||
+
|
||
+
|
||
+sub BODY_00_19 {
|
||
+my ($i,$a,$b,$c,$d,$e)=@_;
|
||
+my $j=($i+2)&15;
|
||
+
|
||
+$code.=<<___ if ($i<15 && !($i&1));
|
||
+ lsr @Xx[$i+1],@Xx[$i],#32
|
||
+___
|
||
+$code.=<<___ if ($i<14 && !($i&1));
|
||
+ ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`]
|
||
+___
|
||
+$code.=<<___ if ($i<14 && ($i&1));
|
||
+#ifdef __ARMEB__
|
||
+ ror @Xx[$i+1],@Xx[$i+1],#32
|
||
+#else
|
||
+ rev32 @Xx[$i+1],@Xx[$i+1]
|
||
+#endif
|
||
+___
|
||
+$code.=<<___ if ($i<14);
|
||
+ bic $t0,$d,$b
|
||
+ and $t1,$c,$b
|
||
+ ror $t2,$a,#27
|
||
+ add $d,$d,$K // future e+=K
|
||
+ orr $t0,$t0,$t1
|
||
+ add $e,$e,$t2 // e+=rot(a,5)
|
||
+ ror $b,$b,#2
|
||
+ add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
|
||
+ add $e,$e,$t0 // e+=F(b,c,d)
|
||
+___
|
||
+$code.=<<___ if ($i==19);
|
||
+ movz $K,#0xeba1
|
||
+ movk $K,#0x6ed9,lsl#16
|
||
+___
|
||
+$code.=<<___ if ($i>=14);
|
||
+ eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
|
||
+ bic $t0,$d,$b
|
||
+ and $t1,$c,$b
|
||
+ ror $t2,$a,#27
|
||
+ eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
|
||
+ add $d,$d,$K // future e+=K
|
||
+ orr $t0,$t0,$t1
|
||
+ add $e,$e,$t2 // e+=rot(a,5)
|
||
+ eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
|
||
+ ror $b,$b,#2
|
||
+ add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
|
||
+ add $e,$e,$t0 // e+=F(b,c,d)
|
||
+ ror @Xw[$j],@Xw[$j],#31
|
||
+___
|
||
+}
|
||
+
|
||
+sub BODY_40_59 {
|
||
+my ($i,$a,$b,$c,$d,$e)=@_;
|
||
+my $j=($i+2)&15;
|
||
+
|
||
+$code.=<<___ if ($i==59);
|
||
+ movz $K,#0xc1d6
|
||
+ movk $K,#0xca62,lsl#16
|
||
+___
|
||
+$code.=<<___;
|
||
+ orr $t0,$b,$c
|
||
+ and $t1,$b,$c
|
||
+ eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
|
||
+ ror $t2,$a,#27
|
||
+ and $t0,$t0,$d
|
||
+ add $d,$d,$K // future e+=K
|
||
+ eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
|
||
+ add $e,$e,$t2 // e+=rot(a,5)
|
||
+ orr $t0,$t0,$t1
|
||
+ ror $b,$b,#2
|
||
+ eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
|
||
+ add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
|
||
+ add $e,$e,$t0 // e+=F(b,c,d)
|
||
+ ror @Xw[$j],@Xw[$j],#31
|
||
+___
|
||
+}
|
||
+
|
||
+sub BODY_20_39 {
|
||
+my ($i,$a,$b,$c,$d,$e)=@_;
|
||
+my $j=($i+2)&15;
|
||
+
|
||
+$code.=<<___ if ($i==39);
|
||
+ movz $K,#0xbcdc
|
||
+ movk $K,#0x8f1b,lsl#16
|
||
+___
|
||
+$code.=<<___ if ($i<78);
|
||
+ eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
|
||
+ eor $t0,$d,$b
|
||
+ ror $t2,$a,#27
|
||
+ add $d,$d,$K // future e+=K
|
||
+ eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
|
||
+ eor $t0,$t0,$c
|
||
+ add $e,$e,$t2 // e+=rot(a,5)
|
||
+ ror $b,$b,#2
|
||
+ eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
|
||
+ add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
|
||
+ add $e,$e,$t0 // e+=F(b,c,d)
|
||
+ ror @Xw[$j],@Xw[$j],#31
|
||
+___
|
||
+$code.=<<___ if ($i==78);
|
||
+ ldp @Xw[1],@Xw[2],[$ctx]
|
||
+ eor $t0,$d,$b
|
||
+ ror $t2,$a,#27
|
||
+ add $d,$d,$K // future e+=K
|
||
+ eor $t0,$t0,$c
|
||
+ add $e,$e,$t2 // e+=rot(a,5)
|
||
+ ror $b,$b,#2
|
||
+ add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
|
||
+ add $e,$e,$t0 // e+=F(b,c,d)
|
||
+___
|
||
+$code.=<<___ if ($i==79);
|
||
+ ldp @Xw[3],@Xw[4],[$ctx,#8]
|
||
+ eor $t0,$d,$b
|
||
+ ror $t2,$a,#27
|
||
+ eor $t0,$t0,$c
|
||
+ add $e,$e,$t2 // e+=rot(a,5)
|
||
+ ror $b,$b,#2
|
||
+ ldr @Xw[5],[$ctx,#16]
|
||
+ add $e,$e,$t0 // e+=F(b,c,d)
|
||
+___
|
||
+}
|
||
+
|
||
+$code.=<<___;
|
||
+#include "arm_arch.h"
|
||
+
|
||
+.text
|
||
+
|
||
+.globl sha1_block_data_order
|
||
+.type sha1_block_data_order,%function
|
||
+.align 6
|
||
+sha1_block_data_order:
|
||
+ ldr x16,.LOPENSSL_armcap_P
|
||
+ adr x17,.LOPENSSL_armcap_P
|
||
+ add x16,x16,x17
|
||
+ ldr w16,[x16]
|
||
+ tst w16,#ARMV8_SHA1
|
||
+ b.ne .Lv8_entry
|
||
+
|
||
+ stp x29,x30,[sp,#-96]!
|
||
+ add x29,sp,#0
|
||
+ stp x19,x20,[sp,#16]
|
||
+ stp x21,x22,[sp,#32]
|
||
+ stp x23,x24,[sp,#48]
|
||
+ stp x25,x26,[sp,#64]
|
||
+ stp x27,x28,[sp,#80]
|
||
+
|
||
+ ldp $A,$B,[$ctx]
|
||
+ ldp $C,$D,[$ctx,#8]
|
||
+ ldr $E,[$ctx,#16]
|
||
+
|
||
+.Loop:
|
||
+ ldr @Xx[0],[$inp],#64
|
||
+ movz $K,#0x7999
|
||
+ sub $num,$num,#1
|
||
+ movk $K,#0x5a82,lsl#16
|
||
+#ifdef __ARMEB__
|
||
+ ror $Xx[0],@Xx[0],#32
|
||
+#else
|
||
+ rev32 @Xx[0],@Xx[0]
|
||
+#endif
|
||
+ add $E,$E,$K // warm it up
|
||
+ add $E,$E,@Xw[0]
|
||
+___
|
||
+for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
|
||
+for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||
+for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
|
||
+for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||
+$code.=<<___;
|
||
+ add $B,$B,@Xw[2]
|
||
+ add $C,$C,@Xw[3]
|
||
+ add $A,$A,@Xw[1]
|
||
+ add $D,$D,@Xw[4]
|
||
+ add $E,$E,@Xw[5]
|
||
+ stp $A,$B,[$ctx]
|
||
+ stp $C,$D,[$ctx,#8]
|
||
+ str $E,[$ctx,#16]
|
||
+ cbnz $num,.Loop
|
||
+
|
||
+ ldp x19,x20,[sp,#16]
|
||
+ ldp x21,x22,[sp,#32]
|
||
+ ldp x23,x24,[sp,#48]
|
||
+ ldp x25,x26,[sp,#64]
|
||
+ ldp x27,x28,[sp,#80]
|
||
+ ldr x29,[sp],#96
|
||
+ ret
|
||
+.size sha1_block_data_order,.-sha1_block_data_order
|
||
+___
|
||
+{{{
|
||
+my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
|
||
+my @MSG=map("v$_.16b",(4..7));
|
||
+my @Kxx=map("v$_.4s",(16..19));
|
||
+my ($W0,$W1)=("v20.4s","v21.4s");
|
||
+my $ABCD_SAVE="v22.16b";
|
||
+
|
||
+$code.=<<___;
|
||
+.type sha1_block_armv8,%function
|
||
+.align 6
|
||
+sha1_block_armv8:
|
||
+.Lv8_entry:
|
||
+ stp x29,x30,[sp,#-16]!
|
||
+ add x29,sp,#0
|
||
+
|
||
+ adr x4,.Lconst
|
||
+ eor $E,$E,$E
|
||
+ ld1.32 {$ABCD},[$ctx],#16
|
||
+ ld1.32 {$E}[0],[$ctx]
|
||
+ sub $ctx,$ctx,#16
|
||
+ ld1.32 {@Kxx[0]-@Kxx[3]},[x4]
|
||
+
|
||
+.Loop_hw:
|
||
+ ld1 {@MSG[0]-@MSG[3]},[$inp],#64
|
||
+ sub $num,$num,#1
|
||
+ rev32 @MSG[0],@MSG[0]
|
||
+ rev32 @MSG[1],@MSG[1]
|
||
+
|
||
+ add.i32 $W0,@Kxx[0],@MSG[0]
|
||
+ rev32 @MSG[2],@MSG[2]
|
||
+ orr $ABCD_SAVE,$ABCD,$ABCD // offload
|
||
+
|
||
+ add.i32 $W1,@Kxx[0],@MSG[1]
|
||
+ rev32 @MSG[3],@MSG[3]
|
||
+ sha1h $E1,$ABCD
|
||
+ sha1c $ABCD,$E,$W0 // 0
|
||
+ add.i32 $W0,@Kxx[$j],@MSG[2]
|
||
+ sha1su0 @MSG[0],@MSG[1],@MSG[2]
|
||
+___
|
||
+for ($j=0,$i=1;$i<20-3;$i++) {
|
||
+my $f=("c","p","m","p")[$i/5];
|
||
+$code.=<<___;
|
||
+ sha1h $E0,$ABCD // $i
|
||
+ sha1$f $ABCD,$E1,$W1
|
||
+ add.i32 $W1,@Kxx[$j],@MSG[3]
|
||
+ sha1su1 @MSG[0],@MSG[3]
|
||
+___
|
||
+$code.=<<___ if ($i<20-4);
|
||
+ sha1su0 @MSG[1],@MSG[2],@MSG[3]
|
||
+___
|
||
+ ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
|
||
+ push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
|
||
+}
|
||
+$code.=<<___;
|
||
+ sha1h $E0,$ABCD // $i
|
||
+ sha1p $ABCD,$E1,$W1
|
||
+ add.i32 $W1,@Kxx[$j],@MSG[3]
|
||
+
|
||
+ sha1h $E1,$ABCD // 18
|
||
+ sha1p $ABCD,$E0,$W0
|
||
+
|
||
+ sha1h $E0,$ABCD // 19
|
||
+ sha1p $ABCD,$E1,$W1
|
||
+
|
||
+ add.i32 $E,$E,$E0
|
||
+ add.i32 $ABCD,$ABCD,$ABCD_SAVE
|
||
+
|
||
+ cbnz $num,.Loop_hw
|
||
+
|
||
+ st1.32 {$ABCD},[$ctx],#16
|
||
+ st1.32 {$E}[0],[$ctx]
|
||
+
|
||
+ ldr x29,[sp],#16
|
||
+ ret
|
||
+.size sha1_block_armv8,.-sha1_block_armv8
|
||
+.align 6
|
||
+.Lconst:
|
||
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
|
||
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
|
||
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
|
||
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
|
||
+.LOPENSSL_armcap_P:
|
||
+.quad OPENSSL_armcap_P-.
|
||
+.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||
+.align 2
|
||
+.comm OPENSSL_armcap_P,4,4
|
||
+___
|
||
+}}}
|
||
+
|
||
+{ my %opcode = (
|
||
+ "sha1c" => 0x5e000000, "sha1p" => 0x5e001000,
|
||
+ "sha1m" => 0x5e002000, "sha1su0" => 0x5e003000,
|
||
+ "sha1h" => 0x5e280800, "sha1su1" => 0x5e281800 );
|
||
+
|
||
+ sub unsha1 {
|
||
+ my ($mnemonic,$arg)=@_;
|
||
+
|
||
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
|
||
+ &&
|
||
+ sprintf ".inst\t0x%08x\t//%s %s",
|
||
+ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
|
||
+ $mnemonic,$arg;
|
||
+ }
|
||
+}
|
||
+
|
||
+foreach(split("\n",$code)) {
|
||
+
|
||
+ s/\`([^\`]*)\`/eval($1)/geo;
|
||
+
|
||
+ s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo;
|
||
+
|
||
+ s/\.\w?32\b//o and s/\.16b/\.4s/go;
|
||
+ m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
|
||
+
|
||
+ print $_,"\n";
|
||
+}
|
||
+
|
||
+close STDOUT;
|
||
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
|
||
index 9c84e8d..505ca8f 100644
|
||
--- a/crypto/sha/asm/sha256-armv4.pl
|
||
+++ b/crypto/sha/asm/sha256-armv4.pl
|
||
@@ -1,7 +1,7 @@
|
||
#!/usr/bin/env perl
|
||
|
||
# ====================================================================
|
||
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||
# project. The module is, however, dual licensed under OpenSSL and
|
||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||
@@ -21,15 +21,27 @@
|
||
# February 2011.
|
||
#
|
||
# Profiler-assisted and platform-specific optimization resulted in 16%
|
||
-# improvement on Cortex A8 core and ~17 cycles per processed byte.
|
||
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
|
||
+
|
||
+# September 2013.
|
||
+#
|
||
+# Add NEON implementation. On Cortex A8 it was measured to process one
|
||
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
|
||
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
|
||
+# code (meaning that the latter performs sub-optimally; nothing was done
|
||
+# about it).
|
||
+
|
||
+# May 2014.
|
||
+#
|
||
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
|
||
|
||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||
open STDOUT,">$output";
|
||
|
||
$ctx="r0"; $t0="r0";
|
||
-$inp="r1"; $t3="r1";
|
||
+$inp="r1"; $t4="r1";
|
||
$len="r2"; $t1="r2";
|
||
-$T1="r3";
|
||
+$T1="r3"; $t3="r3";
|
||
$A="r4";
|
||
$B="r5";
|
||
$C="r6";
|
||
@@ -52,71 +64,88 @@ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
|
||
|
||
$code.=<<___ if ($i<16);
|
||
#if __ARM_ARCH__>=7
|
||
- ldr $T1,[$inp],#4
|
||
+ @ ldr $t1,[$inp],#4 @ $i
|
||
+# if $i==15
|
||
+ str $inp,[sp,#17*4] @ make room for $t4
|
||
+# endif
|
||
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
|
||
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
|
||
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
|
||
+ rev $t1,$t1
|
||
#else
|
||
- ldrb $T1,[$inp,#3] @ $i
|
||
+ @ ldrb $t1,[$inp,#3] @ $i
|
||
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
|
||
ldrb $t2,[$inp,#2]
|
||
- ldrb $t1,[$inp,#1]
|
||
- ldrb $t0,[$inp],#4
|
||
- orr $T1,$T1,$t2,lsl#8
|
||
- orr $T1,$T1,$t1,lsl#16
|
||
- orr $T1,$T1,$t0,lsl#24
|
||
+ ldrb $t0,[$inp,#1]
|
||
+ orr $t1,$t1,$t2,lsl#8
|
||
+ ldrb $t2,[$inp],#4
|
||
+ orr $t1,$t1,$t0,lsl#16
|
||
+# if $i==15
|
||
+ str $inp,[sp,#17*4] @ make room for $t4
|
||
+# endif
|
||
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
|
||
+ orr $t1,$t1,$t2,lsl#24
|
||
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
|
||
#endif
|
||
___
|
||
$code.=<<___;
|
||
- mov $t0,$e,ror#$Sigma1[0]
|
||
ldr $t2,[$Ktbl],#4 @ *K256++
|
||
- eor $t0,$t0,$e,ror#$Sigma1[1]
|
||
+ add $h,$h,$t1 @ h+=X[i]
|
||
+ str $t1,[sp,#`$i%16`*4]
|
||
eor $t1,$f,$g
|
||
-#if $i>=16
|
||
- add $T1,$T1,$t3 @ from BODY_16_xx
|
||
-#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
|
||
- rev $T1,$T1
|
||
-#endif
|
||
-#if $i==15
|
||
- str $inp,[sp,#17*4] @ leave room for $t3
|
||
-#endif
|
||
- eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
|
||
+ add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
|
||
and $t1,$t1,$e
|
||
- str $T1,[sp,#`$i%16`*4]
|
||
- add $T1,$T1,$t0
|
||
+ add $h,$h,$t2 @ h+=K256[i]
|
||
eor $t1,$t1,$g @ Ch(e,f,g)
|
||
- add $T1,$T1,$h
|
||
- mov $h,$a,ror#$Sigma0[0]
|
||
- add $T1,$T1,$t1
|
||
- eor $h,$h,$a,ror#$Sigma0[1]
|
||
- add $T1,$T1,$t2
|
||
- eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
|
||
-#if $i>=15
|
||
- ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
|
||
+ eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
|
||
+ add $h,$h,$t1 @ h+=Ch(e,f,g)
|
||
+#if $i==31
|
||
+ and $t2,$t2,#0xff
|
||
+ cmp $t2,#0xf2 @ done?
|
||
#endif
|
||
- orr $t0,$a,$b
|
||
- and $t1,$a,$b
|
||
- and $t0,$t0,$c
|
||
- add $h,$h,$T1
|
||
- orr $t0,$t0,$t1 @ Maj(a,b,c)
|
||
- add $d,$d,$T1
|
||
- add $h,$h,$t0
|
||
+#if $i<15
|
||
+# if __ARM_ARCH__>=7
|
||
+ ldr $t1,[$inp],#4 @ prefetch
|
||
+# else
|
||
+ ldrb $t1,[$inp,#3]
|
||
+# endif
|
||
+ eor $t2,$a,$b @ a^b, b^c in next round
|
||
+#else
|
||
+ ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
|
||
+ eor $t2,$a,$b @ a^b, b^c in next round
|
||
+ ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
|
||
+#endif
|
||
+ eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
|
||
+ and $t3,$t3,$t2 @ (b^c)&=(a^b)
|
||
+ add $d,$d,$h @ d+=h
|
||
+ eor $t3,$t3,$b @ Maj(a,b,c)
|
||
+ add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
|
||
+ @ add $h,$h,$t3 @ h+=Maj(a,b,c)
|
||
___
|
||
+ ($t2,$t3)=($t3,$t2);
|
||
}
|
||
|
||
sub BODY_16_XX {
|
||
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
|
||
|
||
$code.=<<___;
|
||
- @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
|
||
- ldr $t2,[sp,#`($i+14)%16`*4]
|
||
- mov $t0,$t3,ror#$sigma0[0]
|
||
- ldr $T1,[sp,#`($i+0)%16`*4]
|
||
- eor $t0,$t0,$t3,ror#$sigma0[1]
|
||
- ldr $t1,[sp,#`($i+9)%16`*4]
|
||
- eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
|
||
- mov $t3,$t2,ror#$sigma1[0]
|
||
- add $T1,$T1,$t0
|
||
- eor $t3,$t3,$t2,ror#$sigma1[1]
|
||
- add $T1,$T1,$t1
|
||
- eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
|
||
- @ add $T1,$T1,$t3
|
||
+ @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
|
||
+ @ ldr $t4,[sp,#`($i+14)%16`*4]
|
||
+ mov $t0,$t1,ror#$sigma0[0]
|
||
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
|
||
+ mov $t2,$t4,ror#$sigma1[0]
|
||
+ eor $t0,$t0,$t1,ror#$sigma0[1]
|
||
+ eor $t2,$t2,$t4,ror#$sigma1[1]
|
||
+ eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
|
||
+ ldr $t1,[sp,#`($i+0)%16`*4]
|
||
+ eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
|
||
+ ldr $t4,[sp,#`($i+9)%16`*4]
|
||
+
|
||
+ add $t2,$t2,$t0
|
||
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
|
||
+ add $t1,$t1,$t2
|
||
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
|
||
+ add $t1,$t1,$t4 @ X[i]
|
||
___
|
||
&BODY_00_15(@_);
|
||
}
|
||
@@ -147,46 +176,64 @@ K256:
|
||
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||
.size K256,.-K256
|
||
+.word 0 @ terminator
|
||
+.LOPENSSL_armcap:
|
||
+.word OPENSSL_armcap_P-sha256_block_data_order
|
||
+.align 5
|
||
|
||
.global sha256_block_data_order
|
||
.type sha256_block_data_order,%function
|
||
sha256_block_data_order:
|
||
sub r3,pc,#8 @ sha256_block_data_order
|
||
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
|
||
+#if __ARM_ARCH__>=7
|
||
+ ldr r12,.LOPENSSL_armcap
|
||
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
|
||
+ tst r12,#ARMV8_SHA256
|
||
+ bne .LARMv8
|
||
+ tst r12,#ARMV7_NEON
|
||
+ bne .LNEON
|
||
+#endif
|
||
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
|
||
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
|
||
- sub $Ktbl,r3,#256 @ K256
|
||
+ sub $Ktbl,r3,#256+32 @ K256
|
||
sub sp,sp,#16*4 @ alloca(X[16])
|
||
.Loop:
|
||
+# if __ARM_ARCH__>=7
|
||
+ ldr $t1,[$inp],#4
|
||
+# else
|
||
+ ldrb $t1,[$inp,#3]
|
||
+# endif
|
||
+ eor $t3,$B,$C @ magic
|
||
+ eor $t2,$t2,$t2
|
||
___
|
||
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
|
||
$code.=".Lrounds_16_xx:\n";
|
||
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
|
||
$code.=<<___;
|
||
- and $t2,$t2,#0xff
|
||
- cmp $t2,#0xf2
|
||
+ ldreq $t3,[sp,#16*4] @ pull ctx
|
||
bne .Lrounds_16_xx
|
||
|
||
- ldr $T1,[sp,#16*4] @ pull ctx
|
||
- ldr $t0,[$T1,#0]
|
||
- ldr $t1,[$T1,#4]
|
||
- ldr $t2,[$T1,#8]
|
||
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
|
||
+ ldr $t0,[$t3,#0]
|
||
+ ldr $t1,[$t3,#4]
|
||
+ ldr $t2,[$t3,#8]
|
||
add $A,$A,$t0
|
||
- ldr $t0,[$T1,#12]
|
||
+ ldr $t0,[$t3,#12]
|
||
add $B,$B,$t1
|
||
- ldr $t1,[$T1,#16]
|
||
+ ldr $t1,[$t3,#16]
|
||
add $C,$C,$t2
|
||
- ldr $t2,[$T1,#20]
|
||
+ ldr $t2,[$t3,#20]
|
||
add $D,$D,$t0
|
||
- ldr $t0,[$T1,#24]
|
||
+ ldr $t0,[$t3,#24]
|
||
add $E,$E,$t1
|
||
- ldr $t1,[$T1,#28]
|
||
+ ldr $t1,[$t3,#28]
|
||
add $F,$F,$t2
|
||
ldr $inp,[sp,#17*4] @ pull inp
|
||
ldr $t2,[sp,#18*4] @ pull inp+len
|
||
add $G,$G,$t0
|
||
add $H,$H,$t1
|
||
- stmia $T1,{$A,$B,$C,$D,$E,$F,$G,$H}
|
||
+ stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
|
||
cmp $inp,$t2
|
||
sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
|
||
bne .Loop
|
||
@@ -200,12 +247,410 @@ $code.=<<___;
|
||
moveq pc,lr @ be binary compatible with V4, yet
|
||
bx lr @ interoperable with Thumb ISA:-)
|
||
#endif
|
||
-.size sha256_block_data_order,.-sha256_block_data_order
|
||
-.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
|
||
+.size sha256_block_data_order,.-sha256_block_data_order
|
||
+___
|
||
+######################################################################
|
||
+# NEON stuff
|
||
+#
|
||
+{{{
|
||
+my @X=map("q$_",(0..3));
|
||
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
|
||
+my $Xfer=$t4;
|
||
+my $j=0;
|
||
+
|
||
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
|
||
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
|
||
+
|
||
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
|
||
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
|
||
+ my $arg = pop;
|
||
+ $arg = "#$arg" if ($arg*1 eq $arg);
|
||
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
|
||
+}
|
||
+
|
||
+sub Xupdate()
|
||
+{ use integer;
|
||
+ my $body = shift;
|
||
+ my @insns = (&$body,&$body,&$body,&$body);
|
||
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
|
||
+
|
||
+ &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vshr_u32 ($T2,$T0,$sigma0[0]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vshr_u32 ($T1,$T0,$sigma0[2]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vsli_32 ($T2,$T0,32-$sigma0[0]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vshr_u32 ($T3,$T0,$sigma0[1]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &veor ($T1,$T1,$T2);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vsli_32 ($T3,$T0,32-$sigma0[1]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &veor ($T1,$T1,$T3); # sigma0(X[1..4])
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &veor ($T5,$T5,$T4);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &veor ($T5,$T5,$T4); # sigma1(X[14..15])
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &veor ($T5,$T5,$T4);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &veor ($T5,$T5,$T4); # sigma1(X[16..17])
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vadd_i32 ($T0,$T0,@X[0]);
|
||
+ while($#insns>=2) { eval(shift(@insns)); }
|
||
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+
|
||
+ push(@X,shift(@X)); # "rotate" X[]
|
||
+}
|
||
+
|
||
+sub Xpreload()
|
||
+{ use integer;
|
||
+ my $body = shift;
|
||
+ my @insns = (&$body,&$body,&$body,&$body);
|
||
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
|
||
+
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vrev32_8 (@X[0],@X[0]);
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ eval(shift(@insns));
|
||
+ &vadd_i32 ($T0,$T0,@X[0]);
|
||
+ foreach (@insns) { eval; } # remaining instructions
|
||
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
|
||
+
|
||
+ push(@X,shift(@X)); # "rotate" X[]
|
||
+}
|
||
+
|
||
+sub body_00_15 () {
|
||
+ (
|
||
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
|
||
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i]
|
||
+ '&eor ($t1,$f,$g)',
|
||
+ '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
|
||
+ '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
|
||
+ '&and ($t1,$t1,$e)',
|
||
+ '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
|
||
+ '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
|
||
+ '&eor ($t1,$t1,$g)', # Ch(e,f,g)
|
||
+ '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
|
||
+ '&eor ($t2,$a,$b)', # a^b, b^c in next round
|
||
+ '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
|
||
+ '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
|
||
+ '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
|
||
+ '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
|
||
+ '&ldr ($t1,"[sp,#64]") if ($j==31)',
|
||
+ '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
|
||
+ '&add ($d,$d,$h)', # d+=h
|
||
+ '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
|
||
+ '&eor ($t3,$t3,$b)', # Maj(a,b,c)
|
||
+ '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
|
||
+ )
|
||
+}
|
||
+
|
||
+$code.=<<___;
|
||
+#if __ARM_ARCH__>=7
|
||
+.fpu neon
|
||
+
|
||
+.type sha256_block_data_order_neon,%function
|
||
+.align 4
|
||
+sha256_block_data_order_neon:
|
||
+.LNEON:
|
||
+ stmdb sp!,{r4-r12,lr}
|
||
+
|
||
+ mov $t2,sp
|
||
+ sub sp,sp,#16*4+16 @ alloca
|
||
+ sub $Ktbl,r3,#256+32 @ K256
|
||
+ bic sp,sp,#15 @ align for 128-bit stores
|
||
+
|
||
+ vld1.8 {@X[0]},[$inp]!
|
||
+ vld1.8 {@X[1]},[$inp]!
|
||
+ vld1.8 {@X[2]},[$inp]!
|
||
+ vld1.8 {@X[3]},[$inp]!
|
||
+ vld1.32 {$T0},[$Ktbl,:128]!
|
||
+ vld1.32 {$T1},[$Ktbl,:128]!
|
||
+ vld1.32 {$T2},[$Ktbl,:128]!
|
||
+ vld1.32 {$T3},[$Ktbl,:128]!
|
||
+ vrev32.8 @X[0],@X[0] @ yes, even on
|
||
+ str $ctx,[sp,#64]
|
||
+ vrev32.8 @X[1],@X[1] @ big-endian
|
||
+ str $inp,[sp,#68]
|
||
+ mov $Xfer,sp
|
||
+ vrev32.8 @X[2],@X[2]
|
||
+ str $len,[sp,#72]
|
||
+ vrev32.8 @X[3],@X[3]
|
||
+ str $t2,[sp,#76] @ save original sp
|
||
+ vadd.i32 $T0,$T0,@X[0]
|
||
+ vadd.i32 $T1,$T1,@X[1]
|
||
+ vst1.32 {$T0},[$Xfer,:128]!
|
||
+ vadd.i32 $T2,$T2,@X[2]
|
||
+ vst1.32 {$T1},[$Xfer,:128]!
|
||
+ vadd.i32 $T3,$T3,@X[3]
|
||
+ vst1.32 {$T2},[$Xfer,:128]!
|
||
+ vst1.32 {$T3},[$Xfer,:128]!
|
||
+
|
||
+ ldmia $ctx,{$A-$H}
|
||
+ sub $Xfer,$Xfer,#64
|
||
+ ldr $t1,[sp,#0]
|
||
+ eor $t2,$t2,$t2
|
||
+ eor $t3,$B,$C
|
||
+ b .L_00_48
|
||
+
|
||
+.align 4
|
||
+.L_00_48:
|
||
+___
|
||
+ &Xupdate(\&body_00_15);
|
||
+ &Xupdate(\&body_00_15);
|
||
+ &Xupdate(\&body_00_15);
|
||
+ &Xupdate(\&body_00_15);
|
||
+$code.=<<___;
|
||
+ teq $t1,#0 @ check for K256 terminator
|
||
+ ldr $t1,[sp,#0]
|
||
+ sub $Xfer,$Xfer,#64
|
||
+ bne .L_00_48
|
||
+
|
||
+ ldr $inp,[sp,#68]
|
||
+ ldr $t0,[sp,#72]
|
||
+ sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
|
||
+ teq $inp,$t0
|
||
+ subeq $inp,$inp,#64 @ avoid SEGV
|
||
+ vld1.8 {@X[0]},[$inp]! @ load next input block
|
||
+ vld1.8 {@X[1]},[$inp]!
|
||
+ vld1.8 {@X[2]},[$inp]!
|
||
+ vld1.8 {@X[3]},[$inp]!
|
||
+ strne $inp,[sp,#68]
|
||
+ mov $Xfer,sp
|
||
+___
|
||
+ &Xpreload(\&body_00_15);
|
||
+ &Xpreload(\&body_00_15);
|
||
+ &Xpreload(\&body_00_15);
|
||
+ &Xpreload(\&body_00_15);
|
||
+$code.=<<___;
|
||
+ ldr $t0,[$t1,#0]
|
||
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
|
||
+ ldr $t2,[$t1,#4]
|
||
+ ldr $t3,[$t1,#8]
|
||
+ ldr $t4,[$t1,#12]
|
||
+ add $A,$A,$t0 @ accumulate
|
||
+ ldr $t0,[$t1,#16]
|
||
+ add $B,$B,$t2
|
||
+ ldr $t2,[$t1,#20]
|
||
+ add $C,$C,$t3
|
||
+ ldr $t3,[$t1,#24]
|
||
+ add $D,$D,$t4
|
||
+ ldr $t4,[$t1,#28]
|
||
+ add $E,$E,$t0
|
||
+ str $A,[$t1],#4
|
||
+ add $F,$F,$t2
|
||
+ str $B,[$t1],#4
|
||
+ add $G,$G,$t3
|
||
+ str $C,[$t1],#4
|
||
+ add $H,$H,$t4
|
||
+ str $D,[$t1],#4
|
||
+ stmia $t1,{$E-$H}
|
||
+
|
||
+ movne $Xfer,sp
|
||
+ ldrne $t1,[sp,#0]
|
||
+ eorne $t2,$t2,$t2
|
||
+ ldreq sp,[sp,#76] @ restore original sp
|
||
+ eorne $t3,$B,$C
|
||
+ bne .L_00_48
|
||
+
|
||
+ ldmia sp!,{r4-r12,pc}
|
||
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
|
||
+#endif
|
||
+___
|
||
+}}}
|
||
+######################################################################
|
||
+# ARMv8 stuff
|
||
+#
|
||
+{{{
|
||
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
|
||
+my @MSG=map("q$_",(8..11));
|
||
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
|
||
+my $Ktbl="r3";
|
||
+
|
||
+$code.=<<___;
|
||
+#if __ARM_ARCH__>=7
|
||
+.type sha256_block_data_order_armv8,%function
|
||
+.align 5
|
||
+sha256_block_data_order_armv8:
|
||
+.LARMv8:
|
||
+ vld1.32 {$ABCD,$EFGH},[$ctx]
|
||
+ sub $Ktbl,r3,#sha256_block_data_order-K256
|
||
+
|
||
+.Loop_v8:
|
||
+ vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
|
||
+ vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
|
||
+ vld1.32 {$W0},[$Ktbl]!
|
||
+ vrev32.8 @MSG[0],@MSG[0]
|
||
+ vrev32.8 @MSG[1],@MSG[1]
|
||
+ vrev32.8 @MSG[2],@MSG[2]
|
||
+ vrev32.8 @MSG[3],@MSG[3]
|
||
+ vmov $ABCD_SAVE,$ABCD @ offload
|
||
+ vmov $EFGH_SAVE,$EFGH
|
||
+ teq $inp,$len
|
||
+___
|
||
+for($i=0;$i<12;$i++) {
|
||
+$code.=<<___;
|
||
+ vld1.32 {$W1},[$Ktbl]!
|
||
+ vadd.i32 $W0,$W0,@MSG[0]
|
||
+ sha256su0 @MSG[0],@MSG[1]
|
||
+ vmov $abcd,$ABCD
|
||
+ sha256h $ABCD,$EFGH,$W0
|
||
+ sha256h2 $EFGH,$abcd,$W0
|
||
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
|
||
+___
|
||
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
|
||
+}
|
||
+$code.=<<___;
|
||
+ vld1.32 {$W1},[$Ktbl]!
|
||
+ vadd.i32 $W0,$W0,@MSG[0]
|
||
+ vmov $abcd,$ABCD
|
||
+ sha256h $ABCD,$EFGH,$W0
|
||
+ sha256h2 $EFGH,$abcd,$W0
|
||
+
|
||
+ vld1.32 {$W0},[$Ktbl]!
|
||
+ vadd.i32 $W1,$W1,@MSG[1]
|
||
+ vmov $abcd,$ABCD
|
||
+ sha256h $ABCD,$EFGH,$W1
|
||
+ sha256h2 $EFGH,$abcd,$W1
|
||
+
|
||
+ vld1.32 {$W1},[$Ktbl]
|
||
+ vadd.i32 $W0,$W0,@MSG[2]
|
||
+ sub $Ktbl,$Ktbl,#256-16 @ rewind
|
||
+ vmov $abcd,$ABCD
|
||
+ sha256h $ABCD,$EFGH,$W0
|
||
+ sha256h2 $EFGH,$abcd,$W0
|
||
+
|
||
+ vadd.i32 $W1,$W1,@MSG[3]
|
||
+ vmov $abcd,$ABCD
|
||
+ sha256h $ABCD,$EFGH,$W1
|
||
+ sha256h2 $EFGH,$abcd,$W1
|
||
+
|
||
+ vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
|
||
+ vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
|
||
+ bne .Loop_v8
|
||
+
|
||
+ vst1.32 {$ABCD,$EFGH},[$ctx]
|
||
+
|
||
+ ret @ bx lr
|
||
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
|
||
+#endif
|
||
+___
|
||
+}}}
|
||
+$code.=<<___;
|
||
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||
.align 2
|
||
+.comm OPENSSL_armcap_P,4,4
|
||
___
|
||
|
||
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
|
||
-print $code;
|
||
+{ my %opcode = (
|
||
+ "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
|
||
+ "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
|
||
+
|
||
+ sub unsha256 {
|
||
+ my ($mnemonic,$arg)=@_;
|
||
+
|
||
+ if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
|
||
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|
||
+ |(($2&7)<<17)|(($2&8)<<4)
|
||
+ |(($3&7)<<1) |(($3&8)<<2);
|
||
+ # since ARMv7 instructions are always encoded little-endian.
|
||
+ # correct solution is to use .inst directive, but older
|
||
+ # assemblers don't implement it:-(
|
||
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
|
||
+ $word&0xff,($word>>8)&0xff,
|
||
+ ($word>>16)&0xff,($word>>24)&0xff,
|
||
+ $mnemonic,$arg;
|
||
+ }
|
||
+ }
|
||
+}
|
||
+
|
||
+foreach (split($/,$code)) {
|
||
+
|
||
+ s/\`([^\`]*)\`/eval $1/geo;
|
||
+
|
||
+ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
|
||
+
|
||
+ s/\bret\b/bx lr/go or
|
||
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
|
||
+
|
||
+ print $_,"\n";
|
||
+}
|
||
+
|
||
close STDOUT; # enforce flush
|
||
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
|
||
index 7faf37b..71aa935 100644
|
||
--- a/crypto/sha/asm/sha512-armv4.pl
|
||
+++ b/crypto/sha/asm/sha512-armv4.pl
|
||
@@ -565,7 +565,7 @@ $code.=<<___;
|
||
bne .Loop_neon
|
||
|
||
vldmia sp!,{d8-d15} @ epilogue
|
||
- bx lr
|
||
+ ret @ bx lr
|
||
#endif
|
||
___
|
||
}
|
||
@@ -578,5 +578,6 @@ ___
|
||
|
||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
|
||
+$code =~ s/\bret\b/bx lr/gm;
|
||
print $code;
|
||
close STDOUT; # enforce flush
|
||
diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl
|
||
new file mode 100644
|
||
index 0000000..6935ed6
|
||
--- /dev/null
|
||
+++ b/crypto/sha/asm/sha512-armv8.pl
|
||
@@ -0,0 +1,414 @@
|
||
+#!/usr/bin/env perl
|
||
+#
|
||
+# ====================================================================
|
||
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||
+# project. The module is, however, dual licensed under OpenSSL and
|
||
+# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||
+# details see http://www.openssl.org/~appro/cryptogams/.
|
||
+# ====================================================================
|
||
+#
|
||
+# SHA256/512 for ARMv8.
|
||
+#
|
||
+# Performance in cycles per processed byte and improvement coefficient
|
||
+# over code generated with the "default" compiler:
|
||
+#
|
||
+# SHA256-hw SHA256(*) SHA512
|
||
+# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
|
||
+# Cortex-A5x n/a n/a n/a
|
||
+#
|
||
+# (*) Software SHA256 results are of lesser relevance, presented
|
||
+# mostly for informational purposes.
|
||
+# (**) The result is a trade-off: it's possible to improve it by
|
||
+# 10%, but at the cost of 20% loss on Cortex-A5x.
|
||
+
|
||
+$flavour=shift;
|
||
+$output=shift;
|
||
+open STDOUT,">$output";
|
||
+
|
||
+if ($output =~ /512/) {
|
||
+ $BITS=512;
|
||
+ $SZ=8;
|
||
+ @Sigma0=(28,34,39);
|
||
+ @Sigma1=(14,18,41);
|
||
+ @sigma0=(1, 8, 7);
|
||
+ @sigma1=(19,61, 6);
|
||
+ $rounds=80;
|
||
+ $reg_t="x";
|
||
+} else {
|
||
+ $BITS=256;
|
||
+ $SZ=4;
|
||
+ @Sigma0=( 2,13,22);
|
||
+ @Sigma1=( 6,11,25);
|
||
+ @sigma0=( 7,18, 3);
|
||
+ @sigma1=(17,19,10);
|
||
+ $rounds=64;
|
||
+ $reg_t="w";
|
||
+}
|
||
+
|
||
+$func="sha${BITS}_block_data_order";
|
||
+
|
||
+($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
|
||
+
|
||
+@X=map("$reg_t$_",(3..15,0..2));
|
||
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
|
||
+($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
|
||
+
|
||
+sub BODY_00_xx {
|
||
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
|
||
+my $j=($i+1)&15;
|
||
+my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
|
||
+ $T0=@X[$i+3] if ($i<11);
|
||
+
|
||
+$code.=<<___ if ($i<16);
|
||
+#ifndef __ARMEB__
|
||
+ rev @X[$i],@X[$i] // $i
|
||
+#endif
|
||
+___
|
||
+$code.=<<___ if ($i<13 && ($i&1));
|
||
+ ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
|
||
+___
|
||
+$code.=<<___ if ($i==13);
|
||
+ ldp @X[14],@X[15],[$inp]
|
||
+___
|
||
+$code.=<<___ if ($i>=14);
|
||
+ ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
|
||
+___
|
||
+$code.=<<___ if ($i>0 && $i<16);
|
||
+ add $a,$a,$t1 // h+=Sigma0(a)
|
||
+___
|
||
+$code.=<<___ if ($i>=11);
|
||
+ str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
|
||
+___
|
||
+# While ARMv8 specifies merged rotate-n-logical operations such as
|
||
+# 'eor x,y,z,ror#n', it was found to negatively affect performance
|
||
+# on Apple A7. The reason seems to be that it requires even 'y' to
|
||
+# be available earlier. This means that such a merged instruction is
|
||
+# not necessarily the best choice on the critical path... On the other hand,
|
||
+# Cortex-A5x handles merged instructions much better than disjoint
|
||
+# rotate and logical... See (**) footnote above.
|
||
+$code.=<<___ if ($i<15);
|
||
+ ror $t0,$e,#$Sigma1[0]
|
||
+ add $h,$h,$t2 // h+=K[i]
|
||
+ eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
|
||
+ and $t1,$f,$e
|
||
+ bic $t2,$g,$e
|
||
+ add $h,$h,@X[$i&15] // h+=X[i]
|
||
+ orr $t1,$t1,$t2 // Ch(e,f,g)
|
||
+ eor $t2,$a,$b // a^b, b^c in next round
|
||
+ eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
|
||
+ ror $T0,$a,#$Sigma0[0]
|
||
+ add $h,$h,$t1 // h+=Ch(e,f,g)
|
||
+ eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
|
||
+ add $h,$h,$t0 // h+=Sigma1(e)
|
||
+ and $t3,$t3,$t2 // (b^c)&=(a^b)
|
||
+ add $d,$d,$h // d+=h
|
||
+ eor $t3,$t3,$b // Maj(a,b,c)
|
||
+ eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
|
||
+ add $h,$h,$t3 // h+=Maj(a,b,c)
|
||
+ ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
|
||
+ //add $h,$h,$t1 // h+=Sigma0(a)
|
||
+___
|
||
+$code.=<<___ if ($i>=15);
|
||
+ ror $t0,$e,#$Sigma1[0]
|
||
+ add $h,$h,$t2 // h+=K[i]
|
||
+ ror $T1,@X[($j+1)&15],#$sigma0[0]
|
||
+ and $t1,$f,$e
|
||
+ ror $T2,@X[($j+14)&15],#$sigma1[0]
|
||
+ bic $t2,$g,$e
|
||
+ ror $T0,$a,#$Sigma0[0]
|
||
+ add $h,$h,@X[$i&15] // h+=X[i]
|
||
+ eor $t0,$t0,$e,ror#$Sigma1[1]
|
||
+ eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
|
||
+ orr $t1,$t1,$t2 // Ch(e,f,g)
|
||
+ eor $t2,$a,$b // a^b, b^c in next round
|
||
+ eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
|
||
+ eor $T0,$T0,$a,ror#$Sigma0[1]
|
||
+ add $h,$h,$t1 // h+=Ch(e,f,g)
|
||
+ and $t3,$t3,$t2 // (b^c)&=(a^b)
|
||
+ eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
|
||
+ eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
|
||
+ add $h,$h,$t0 // h+=Sigma1(e)
|
||
+ eor $t3,$t3,$b // Maj(a,b,c)
|
||
+ eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
|
||
+ eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
|
||
+ add @X[$j],@X[$j],@X[($j+9)&15]
|
||
+ add $d,$d,$h // d+=h
|
||
+ add $h,$h,$t3 // h+=Maj(a,b,c)
|
||
+ ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
|
||
+ add @X[$j],@X[$j],$T1
|
||
+ add $h,$h,$t1 // h+=Sigma0(a)
|
||
+ add @X[$j],@X[$j],$T2
|
||
+___
|
||
+ ($t2,$t3)=($t3,$t2);
|
||
+}
|
||
+
|
||
+$code.=<<___;
|
||
+#include "arm_arch.h"
|
||
+
|
||
+.text
|
||
+
|
||
+.globl $func
|
||
+.type $func,%function
|
||
+.align 6
|
||
+$func:
|
||
+___
|
||
+$code.=<<___ if ($SZ==4);
|
||
+ ldr x16,.LOPENSSL_armcap_P
|
||
+ adr x17,.LOPENSSL_armcap_P
|
||
+ add x16,x16,x17
|
||
+ ldr w16,[x16]
|
||
+ tst w16,#ARMV8_SHA256
|
||
+ b.ne .Lv8_entry
|
||
+___
|
||
+$code.=<<___;
|
||
+ stp x29,x30,[sp,#-128]!
|
||
+ add x29,sp,#0
|
||
+
|
||
+ stp x19,x20,[sp,#16]
|
||
+ stp x21,x22,[sp,#32]
|
||
+ stp x23,x24,[sp,#48]
|
||
+ stp x25,x26,[sp,#64]
|
||
+ stp x27,x28,[sp,#80]
|
||
+ sub sp,sp,#4*$SZ
|
||
+
|
||
+ ldp $A,$B,[$ctx] // load context
|
||
+ ldp $C,$D,[$ctx,#2*$SZ]
|
||
+ ldp $E,$F,[$ctx,#4*$SZ]
|
||
+ add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
|
||
+ ldp $G,$H,[$ctx,#6*$SZ]
|
||
+ adr $Ktbl,K$BITS
|
||
+ stp $ctx,$num,[x29,#96]
|
||
+
|
||
+.Loop:
|
||
+ ldp @X[0],@X[1],[$inp],#2*$SZ
|
||
+ ldr $t2,[$Ktbl],#$SZ // *K++
|
||
+ eor $t3,$B,$C // magic seed
|
||
+ str $inp,[x29,#112]
|
||
+___
|
||
+for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
|
||
+$code.=".Loop_16_xx:\n";
|
||
+for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
|
||
+$code.=<<___;
|
||
+ cbnz $t2,.Loop_16_xx
|
||
+
|
||
+ ldp $ctx,$num,[x29,#96]
|
||
+ ldr $inp,[x29,#112]
|
||
+ sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
|
||
+
|
||
+ ldp @X[0],@X[1],[$ctx]
|
||
+ ldp @X[2],@X[3],[$ctx,#2*$SZ]
|
||
+ add $inp,$inp,#14*$SZ // advance input pointer
|
||
+ ldp @X[4],@X[5],[$ctx,#4*$SZ]
|
||
+ add $A,$A,@X[0]
|
||
+ ldp @X[6],@X[7],[$ctx,#6*$SZ]
|
||
+ add $B,$B,@X[1]
|
||
+ add $C,$C,@X[2]
|
||
+ add $D,$D,@X[3]
|
||
+ stp $A,$B,[$ctx]
|
||
+ add $E,$E,@X[4]
|
||
+ add $F,$F,@X[5]
|
||
+ stp $C,$D,[$ctx,#2*$SZ]
|
||
+ add $G,$G,@X[6]
|
||
+ add $H,$H,@X[7]
|
||
+ cmp $inp,$num
|
||
+ stp $E,$F,[$ctx,#4*$SZ]
|
||
+ stp $G,$H,[$ctx,#6*$SZ]
|
||
+ b.ne .Loop
|
||
+
|
||
+ ldp x19,x20,[x29,#16]
|
||
+ add sp,sp,#4*$SZ
|
||
+ ldp x21,x22,[x29,#32]
|
||
+ ldp x23,x24,[x29,#48]
|
||
+ ldp x25,x26,[x29,#64]
|
||
+ ldp x27,x28,[x29,#80]
|
||
+ ldp x29,x30,[sp],#128
|
||
+ ret
|
||
+.size $func,.-$func
|
||
+
|
||
+.align 6
|
||
+.type K$BITS,%object
|
||
+K$BITS:
|
||
+___
|
||
+$code.=<<___ if ($SZ==8);
|
||
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
|
||
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
|
||
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
|
||
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
|
||
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
|
||
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
|
||
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
|
||
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
|
||
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
|
||
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
|
||
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
|
||
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
|
||
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
|
||
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
|
||
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
|
||
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
|
||
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
|
||
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
|
||
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
|
||
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
|
||
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
|
||
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
|
||
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
|
||
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
|
||
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
|
||
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
|
||
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
|
||
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
|
||
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
|
||
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
|
||
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
|
||
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
|
||
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
|
||
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
|
||
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
|
||
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
|
||
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
|
||
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
|
||
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
|
||
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
|
||
+ .quad 0 // terminator
|
||
+___
|
||
+$code.=<<___ if ($SZ==4);
|
||
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||
+ .long 0 //terminator
|
||
+___
|
||
+$code.=<<___;
|
||
+.size K$BITS,.-K$BITS
|
||
+.align 3
|
||
+.LOPENSSL_armcap_P:
|
||
+ .quad OPENSSL_armcap_P-.
|
||
+.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||
+.align 2
|
||
+___
|
||
+
|
||
+if ($SZ==4) {
|
||
+my $Ktbl="x3";
|
||
+
|
||
+my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
|
||
+my @MSG=map("v$_.16b",(4..7));
|
||
+my ($W0,$W1)=("v16.4s","v17.4s");
|
||
+my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
|
||
+
|
||
+$code.=<<___;
|
||
+.type sha256_block_armv8,%function
|
||
+.align 6
|
||
+sha256_block_armv8:
|
||
+.Lv8_entry:
|
||
+ stp x29,x30,[sp,#-16]!
|
||
+ add x29,sp,#0
|
||
+
|
||
+ ld1.32 {$ABCD,$EFGH},[$ctx]
|
||
+ adr $Ktbl,K256
|
||
+
|
||
+.Loop_hw:
|
||
+ ld1 {@MSG[0]-@MSG[3]},[$inp],#64
|
||
+ sub $num,$num,#1
|
||
+ ld1.32 {$W0},[$Ktbl],#16
|
||
+ rev32 @MSG[0],@MSG[0]
|
||
+ rev32 @MSG[1],@MSG[1]
|
||
+ rev32 @MSG[2],@MSG[2]
|
||
+ rev32 @MSG[3],@MSG[3]
|
||
+ orr $ABCD_SAVE,$ABCD,$ABCD // offload
|
||
+ orr $EFGH_SAVE,$EFGH,$EFGH
|
||
+___
|
||
+for($i=0;$i<12;$i++) {
|
||
+$code.=<<___;
|
||
+ ld1.32 {$W1},[$Ktbl],#16
|
||
+ add.i32 $W0,$W0,@MSG[0]
|
||
+ sha256su0 @MSG[0],@MSG[1]
|
||
+ orr $abcd,$ABCD,$ABCD
|
||
+ sha256h $ABCD,$EFGH,$W0
|
||
+ sha256h2 $EFGH,$abcd,$W0
|
||
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
|
||
+___
|
||
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
|
||
+}
|
||
+$code.=<<___;
|
||
+ ld1.32 {$W1},[$Ktbl],#16
|
||
+ add.i32 $W0,$W0,@MSG[0]
|
||
+ orr $abcd,$ABCD,$ABCD
|
||
+ sha256h $ABCD,$EFGH,$W0
|
||
+ sha256h2 $EFGH,$abcd,$W0
|
||
+
|
||
+ ld1.32 {$W0},[$Ktbl],#16
|
||
+ add.i32 $W1,$W1,@MSG[1]
|
||
+ orr $abcd,$ABCD,$ABCD
|
||
+ sha256h $ABCD,$EFGH,$W1
|
||
+ sha256h2 $EFGH,$abcd,$W1
|
||
+
|
||
+ ld1.32 {$W1},[$Ktbl]
|
||
+ add.i32 $W0,$W0,@MSG[2]
|
||
+ sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
|
||
+ orr $abcd,$ABCD,$ABCD
|
||
+ sha256h $ABCD,$EFGH,$W0
|
||
+ sha256h2 $EFGH,$abcd,$W0
|
||
+
|
||
+ add.i32 $W1,$W1,@MSG[3]
|
||
+ orr $abcd,$ABCD,$ABCD
|
||
+ sha256h $ABCD,$EFGH,$W1
|
||
+ sha256h2 $EFGH,$abcd,$W1
|
||
+
|
||
+ add.i32 $ABCD,$ABCD,$ABCD_SAVE
|
||
+ add.i32 $EFGH,$EFGH,$EFGH_SAVE
|
||
+
|
||
+ cbnz $num,.Loop_hw
|
||
+
|
||
+ st1.32 {$ABCD,$EFGH},[$ctx]
|
||
+
|
||
+ ldr x29,[sp],#16
|
||
+ ret
|
||
+.size sha256_block_armv8,.-sha256_block_armv8
|
||
+___
|
||
+}
|
||
+
|
||
+$code.=<<___;
|
||
+.comm OPENSSL_armcap_P,4,4
|
||
+___
|
||
+
|
||
+{ my %opcode = (
|
||
+ "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
|
||
+ "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
|
||
+
|
||
+ sub unsha256 {
|
||
+ my ($mnemonic,$arg)=@_;
|
||
+
|
||
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
|
||
+ &&
|
||
+ sprintf ".inst\t0x%08x\t//%s %s",
|
||
+ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
|
||
+ $mnemonic,$arg;
|
||
+ }
|
||
+}
|
||
+
|
||
+foreach(split("\n",$code)) {
|
||
+
|
||
+ s/\`([^\`]*)\`/eval($1)/geo;
|
||
+
|
||
+ s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
|
||
+
|
||
+ s/\.\w?32\b//o and s/\.16b/\.4s/go;
|
||
+ m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
|
||
+
|
||
+ print $_,"\n";
|
||
+}
|
||
+
|
||
+close STDOUT;
|