Support for Signal calls.

Merge in RedPhone

// FREEBIE
Moxie Marlinspike
2015-09-09 13:54:29 -07:00
parent 3d4ae60d81
commit d83a3d71bc
2585 changed files with 803492 additions and 45 deletions


@@ -0,0 +1 @@
C2.pl works

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -0,0 +1,322 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA1 block procedure for Alpha.
# On 21264 performance is 33% better than code generated by the vendor
# compiler, 75% better than GCC [3.4], and in absolute terms is
# 8.7 cycles per processed byte. The implementation features a
# vectorized byte swap, but not a vectorized Xupdate.
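# For auditing purposes, the srl/sll/zapnot sequence emitted below
# amounts to a plain 32-bit byte swap. A minimal pure-Perl reference
# (ours, for illustration only; bswap32 is not used by the generator):
sub bswap32 {
	my $x = shift;
	return (($x << 24) | (($x << 8) & 0x00ff0000) |
	        (($x >> 8) & 0x0000ff00) | (($x >> 24) & 0xff)) & 0xffffffff;
}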
@X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7",
"\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15");
$ctx="a0"; # $16
$inp="a1";
$num="a2";
$A="a3";
$B="a4"; # 20
$C="a5";
$D="t8";
$E="t9"; @V=($A,$B,$C,$D,$E);
$t0="t10"; # 24
$t1="t11";
$t2="ra";
$t3="t12";
$K="AT"; # 28
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
ldq_u @X[0],0+0($inp)
ldq_u @X[1],0+7($inp)
___
$code.=<<___ if (!($i&1) && $i<14);
ldq_u @X[$i+2],($i+2)*4+0($inp)
ldq_u @X[$i+3],($i+2)*4+7($inp)
___
$code.=<<___ if (!($i&1) && $i<15);
extql @X[$i],$inp,@X[$i]
extqh @X[$i+1],$inp,@X[$i+1]
or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched
srl @X[$i],24,$t0 # vectorized byte swap
srl @X[$i],8,$t2
sll @X[$i],8,$t3
sll @X[$i],24,@X[$i]
zapnot $t0,0x11,$t0
zapnot $t2,0x22,$t2
zapnot @X[$i],0x88,@X[$i]
or $t0,$t2,$t0
zapnot $t3,0x44,$t3
sll $a,5,$t1
or @X[$i],$t0,@X[$i]
addl $K,$e,$e
and $b,$c,$t2
zapnot $a,0xf,$a
or @X[$i],$t3,@X[$i]
srl $a,27,$t0
bic $d,$b,$t3
sll $b,30,$b
extll @X[$i],4,@X[$i+1] # extract upper half
or $t2,$t3,$t2
addl @X[$i],$e,$e
addl $t1,$e,$e
srl $b,32,$t3
zapnot @X[$i],0xf,@X[$i]
addl $t0,$e,$e
addl $t2,$e,$e
or $t3,$b,$b
___
$code.=<<___ if (($i&1) && $i<15);
sll $a,5,$t1
addl $K,$e,$e
and $b,$c,$t2
zapnot $a,0xf,$a
srl $a,27,$t0
addl @X[$i%16],$e,$e
bic $d,$b,$t3
sll $b,30,$b
or $t2,$t3,$t2
addl $t1,$e,$e
srl $b,32,$t3
zapnot @X[$i],0xf,@X[$i]
addl $t0,$e,$e
addl $t2,$e,$e
or $t3,$b,$b
___
$code.=<<___ if ($i>=15); # with forward Xupdate
sll $a,5,$t1
addl $K,$e,$e
and $b,$c,$t2
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
zapnot $a,0xf,$a
addl @X[$i%16],$e,$e
bic $d,$b,$t3
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
srl $a,27,$t0
addl $t1,$e,$e
or $t2,$t3,$t2
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
sll $b,30,$b
addl $t0,$e,$e
srl @X[$j%16],31,$t1
addl $t2,$e,$e
srl $b,32,$t3
addl @X[$j%16],@X[$j%16],@X[$j%16]
or $t3,$b,$b
zapnot @X[$i%16],0xf,@X[$i%16]
or $t1,@X[$j%16],@X[$j%16]
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79); # with forward Xupdate
sll $a,5,$t1
addl $K,$e,$e
zapnot $a,0xf,$a
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
sll $b,30,$t3
addl $t1,$e,$e
xor $b,$c,$t2
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
srl $b,2,$b
addl @X[$i%16],$e,$e
xor $d,$t2,$t2
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
srl @X[$j%16],31,$t1
addl $t2,$e,$e
srl $a,27,$t0
addl @X[$j%16],@X[$j%16],@X[$j%16]
or $t3,$b,$b
addl $t0,$e,$e
or $t1,@X[$j%16],@X[$j%16]
___
$code.=<<___ if ($i<77);
zapnot @X[$i%16],0xf,@X[$i%16]
___
$code.=<<___ if ($i==79); # with context fetch
sll $a,5,$t1
addl $K,$e,$e
zapnot $a,0xf,$a
ldl @X[0],0($ctx)
sll $b,30,$t3
addl $t1,$e,$e
xor $b,$c,$t2
ldl @X[1],4($ctx)
srl $b,2,$b
addl @X[$i%16],$e,$e
xor $d,$t2,$t2
ldl @X[2],8($ctx)
srl $a,27,$t0
addl $t2,$e,$e
ldl @X[3],12($ctx)
or $t3,$b,$b
addl $t0,$e,$e
ldl @X[4],16($ctx)
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___; # with forward Xupdate
sll $a,5,$t1
addl $K,$e,$e
zapnot $a,0xf,$a
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
srl $a,27,$t0
and $b,$c,$t2
and $b,$d,$t3
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
sll $b,30,$b
addl $t1,$e,$e
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
srl @X[$j%16],31,$t1
addl $t0,$e,$e
or $t2,$t3,$t2
and $c,$d,$t3
or $t2,$t3,$t2
srl $b,32,$t3
addl @X[$i%16],$e,$e
addl @X[$j%16],@X[$j%16],@X[$j%16]
or $t3,$b,$b
addl $t2,$e,$e
or $t1,@X[$j%16],@X[$j%16]
zapnot @X[$i%16],0xf,@X[$i%16]
___
}
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif
.text
.set noat
.set noreorder
.globl sha1_block_data_order
.align 5
.ent sha1_block_data_order
sha1_block_data_order:
lda sp,-64(sp)
stq ra,0(sp)
stq s0,8(sp)
stq s1,16(sp)
stq s2,24(sp)
stq s3,32(sp)
stq s4,40(sp)
stq s5,48(sp)
stq fp,56(sp)
.mask 0x0400fe00,-64
.frame sp,64,ra
.prologue 0
ldl $A,0($ctx)
ldl $B,4($ctx)
sll $num,6,$num
ldl $C,8($ctx)
ldl $D,12($ctx)
ldl $E,16($ctx)
addq $inp,$num,$num
.Lloop:
.set noreorder
ldah $K,23170(zero)
zapnot $B,0xf,$B
lda $K,31129($K) # K_00_19
___
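# ldah/lda assemble each 32-bit round constant from two signed 16-bit
# immediates, e.g. 23170<<16 plus 31129 gives 0x5a827999 (K_00_19).
# A pure-Perl sketch of that split (ours, illustration only), with the
# sign extension both instructions apply:
sub ldah_lda_ref {
	my $k = shift;					# 32-bit constant
	my $lo = $k & 0xffff;  $lo -= 0x10000 if ($lo >= 0x8000);
	my $hi = (($k - $lo) >> 16) & 0xffff;  $hi -= 0x10000 if ($hi >= 0x8000);
	return ($hi, $lo);				# (ldah imm, lda imm)
}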
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldah $K,28378(zero)
lda $K,-5215($K) # K_20_39
___
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldah $K,-28900(zero)
lda $K,-17188($K) # K_40_59
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldah $K,-13725(zero)
lda $K,-15914($K) # K_60_79
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
addl @X[0],$A,$A
addl @X[1],$B,$B
addl @X[2],$C,$C
addl @X[3],$D,$D
addl @X[4],$E,$E
stl $A,0($ctx)
stl $B,4($ctx)
addq $inp,64,$inp
stl $C,8($ctx)
stl $D,12($ctx)
stl $E,16($ctx)
cmpult $inp,$num,$t1
bne $t1,.Lloop
.set noreorder
ldq ra,0(sp)
ldq s0,8(sp)
ldq s1,16(sp)
ldq s2,24(sp)
ldq s3,32(sp)
ldq s4,40(sp)
ldq s5,48(sp)
ldq fp,56(sp)
lda sp,64(sp)
ret (ra)
.end sha1_block_data_order
.ascii "SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;

File diff suppressed because it is too large.


@@ -0,0 +1,678 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# sha1_block procedure for ARMv4.
#
# January 2007.
# Size/performance trade-off
# ====================================================================
# impl          size in bytes   comp cycles[*]  measured performance
# ====================================================================
# thumb         304             3212            4420
# armv4-small   392/+29%        1958/+64%       2250/+96%
# armv4-compact 740/+89%        1552/+26%       1840/+22%
# armv4-large   1420/+92%       1307/+19%       1370/+34%[***]
# full unroll   ~5100/+260%     ~1260/+4%       ~1300/+5%
# ====================================================================
# thumb = same as 'small' but in Thumb instructions[**] and
# with recurring code in two private functions;
# small = detached Xload/update, loops are folded;
# compact = detached Xload/update, 5x unroll;
# large = interleaved Xload/update, 5x unroll;
# full unroll = interleaved Xload/update, full unroll, estimated[!];
#
# [*] Manually counted instructions in "grand" loop body. Measured
# performance is affected by prologue and epilogue overhead,
# i-cache availability, branch penalties, etc.
# [**] While each Thumb instruction is half the size, the set is not
# as diverse as ARM's: e.g., there are only two arithmetic
# instructions with 3 arguments, no [fixed] rotate, and addressing
# modes are limited. As a result it takes more instructions to do
# the same job in Thumb, so the code is never half the size and is
# always slower.
# [***] which is also ~35% better than compiler-generated code. A
# dual-issue Cortex A8 core was measured to process an input block
# in ~990 cycles.
# August 2010.
#
# Rescheduling for dual-issue pipeline resulted in 13% improvement on
# Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte].
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.
# September 2013.
#
# Add NEON implementation (see sha1-586.pl for background info). On
# Cortex A8 it was measured to process one byte in 6.7 cycles, or >80%
# faster than integer-only code. Because the [fully unrolled] NEON code
# is ~2.5x larger and some redundant instructions are executed when
# processing the last block, the improvement is not as big for the
# smallest blocks, only ~30%. Snapdragon S4 is a tad faster, at 6.4
# cycles per byte, which is also >80% faster than integer-only code.
# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0";
$inp="r1";
$len="r2";
$a="r3";
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
$Xi="r14";
@V=($a,$b,$c,$d,$e);
sub Xupdate {
my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
$code.=<<___;
ldr $t0,[$Xi,#15*4]
ldr $t1,[$Xi,#13*4]
ldr $t2,[$Xi,#7*4]
add $e,$K,$e,ror#2 @ E+=K_xx_xx
ldr $t3,[$Xi,#2*4]
eor $t0,$t0,$t1
eor $t2,$t2,$t3 @ 1 cycle stall
eor $t1,$c,$d @ F_xx_xx
mov $t0,$t0,ror#31
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
eor $t0,$t0,$t2,ror#31
str $t0,[$Xi,#-4]!
$opt1 @ F_xx_xx
$opt2 @ F_xx_xx
add $e,$e,$t0 @ E+=X[i]
___
}
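# The loads above index a 16-word circular buffer at $Xi; together with
# the eor/ror pair the subroutine implements the standard SHA-1 message
# schedule W[i] = ROL(W[i-3]^W[i-8]^W[i-14]^W[i-16],1). A pure-Perl
# reference of that recurrence (ours, for cross-checking only):
sub xupdate_ref {
	my @W = @_;				# W[0..15] as 32-bit words
	for my $i (16..79) {
		my $x = $W[$i-3] ^ $W[$i-8] ^ $W[$i-14] ^ $W[$i-16];
		$W[$i] = (($x << 1) | ($x >> 31)) & 0xffffffff;
	}
	return @W;
}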
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
#if __ARM_ARCH__<7
ldrb $t1,[$inp,#2]
ldrb $t0,[$inp,#3]
ldrb $t2,[$inp,#1]
add $e,$K,$e,ror#2 @ E+=K_00_19
ldrb $t3,[$inp],#4
orr $t0,$t0,$t1,lsl#8
eor $t1,$c,$d @ F_xx_xx
orr $t0,$t0,$t2,lsl#16
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
orr $t0,$t0,$t3,lsl#24
#else
ldr $t0,[$inp],#4 @ handles unaligned
add $e,$K,$e,ror#2 @ E+=K_00_19
eor $t1,$c,$d @ F_xx_xx
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev $t0,$t0 @ byte swap
#endif
#endif
and $t1,$b,$t1,ror#2
add $e,$e,$t0 @ E+=X[i]
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
str $t0,[$Xi,#-4]!
add $e,$e,$t1 @ E+=F_00_19(B,C,D)
___
}
sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"and $t1,$b,$t1,ror#2");
$code.=<<___;
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
add $e,$e,$t1 @ E+=F_00_19(B,C,D)
___
}
sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"eor $t1,$b,$t1,ror#2");
$code.=<<___;
add $e,$e,$t1 @ E+=F_20_39(B,C,D)
___
}
sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
$code.=<<___;
add $e,$e,$t1 @ E+=F_40_59(B,C,D)
add $e,$e,$t2,ror#2
___
}
$code=<<___;
#include "arm_arch.h"
.text
.code 32
.global sha1_block_data_order
.type sha1_block_data_order,%function
.align 5
sha1_block_data_order:
#if __ARM_ARCH__>=7
sub r3,pc,#8 @ sha1_block_data_order
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#ARMV8_SHA1
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
ldmia $ctx,{$a,$b,$c,$d,$e}
.Lloop:
ldr $K,.LK_00_19
mov $Xi,sp
sub sp,sp,#15*4
mov $c,$c,ror#30
mov $d,$d,ror#30
mov $e,$e,ror#30 @ [6]
.L_00_15:
___
for($i=0;$i<5;$i++) {
&BODY_00_15(@V); unshift(@V,pop(@V));
}
$code.=<<___;
teq $Xi,sp
bne .L_00_15 @ [((11+4)*5+2)*3]
sub sp,sp,#25*4
___
&BODY_00_15(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
$code.=<<___;
ldr $K,.LK_20_39 @ [+15+16*4]
cmn sp,#0 @ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
for($i=0;$i<5;$i++) {
&BODY_20_39(@V); unshift(@V,pop(@V));
}
$code.=<<___;
teq $Xi,sp @ preserve carry
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
ldr $K,.LK_40_59
sub sp,sp,#20*4 @ [+2]
.L_40_59:
___
for($i=0;$i<5;$i++) {
&BODY_40_59(@V); unshift(@V,pop(@V));
}
$code.=<<___;
teq $Xi,sp
bne .L_40_59 @ [+((12+5)*5+2)*4]
ldr $K,.LK_60_79
sub sp,sp,#20*4
cmp sp,#0 @ set carry to denote 60_79
b .L_20_39_or_60_79 @ [+4], spare 300 bytes
.L_done:
add sp,sp,#80*4 @ "deallocate" stack frame
ldmia $ctx,{$K,$t0,$t1,$t2,$t3}
add $a,$K,$a
add $b,$t0,$b
add $c,$t1,$c,ror#2
add $d,$t2,$d,ror#2
add $e,$t3,$e,ror#2
stmia $ctx,{$a,$b,$c,$d,$e}
teq $inp,$len
bne .Lloop @ [+18], total 1307
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha1_block_data_order,.-sha1_block_data_order
.align 5
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha1_block_data_order
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
#####################################################################
# NEON stuff
#
{{{
my @V=($a,$b,$c,$d,$e);
my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
my $Xi=4;
my @X=map("q$_",(8..11,0..3));
my @Tx=("q12","q13");
my ($K,$zero)=("q14","q15");
my $j=0;
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
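# A hedged expansion example (ours): a call such as &vshr_u32("q0","q1",30)
# runs through the thunk above, which turns the first underscore into a
# dot and prefixes the numeric argument with '#', appending
#	vshr.u32	q0,q1,#30
# to $code; register arguments pass through unchanged.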
sub body_00_19 () {
(
'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
'&bic ($t0,$d,$b)',
'&add ($e,$e,$Ki)', # e+=X[i]+K
'&and ($t1,$c,$b)',
'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
'&eor ($t1,$t1,$t0)', # F_00_19
'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
'&add ($e,$e,$t1);'. # e+=F_00_19
'$j++; unshift(@V,pop(@V));'
)
}
sub body_20_39 () {
(
'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
'&eor ($t0,$b,$d)',
'&add ($e,$e,$Ki)', # e+=X[i]+K
'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
'&eor ($t1,$t0,$c)', # F_20_39
'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
'&add ($e,$e,$t1);'. # e+=F_20_39
'$j++; unshift(@V,pop(@V));'
)
}
sub body_40_59 () {
(
'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
'&add ($e,$e,$Ki)', # e+=X[i]+K
'&and ($t0,$c,$d)',
'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
'&eor ($t1,$c,$d)',
'&add ($e,$e,$t0)',
'&and ($t1,$t1,$b)',
'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
'&add ($e,$e,$t1);'. # e+=F_40_59
'$j++; unshift(@V,pop(@V));'
)
}
sub Xupdate_16_31 ()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vext_8 (@X[0],@X[-4&7],@X[-3&7],8); # compose "X[-14]" in "X[0]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@Tx[1],@X[-1&7],$K);
eval(shift(@insns));
&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
eval(shift(@insns));
&vext_8 (@Tx[0],@X[-1&7],$zero,4); # "X[-3]", 3 words
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
eval(shift(@insns));
&veor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
&veor (@Tx[0],@Tx[0],@X[0]); # "X[0]"^="X[-3]"^"X[-8]
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
&sub ($Xfer,$Xfer,64) if ($Xi%4==0);
eval(shift(@insns));
eval(shift(@insns));
&vext_8 (@Tx[1],$zero,@Tx[0],4); # "X[0]"<<96, extract one dword
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@Tx[0],@Tx[0]);
eval(shift(@insns));
eval(shift(@insns));
&vsri_32 (@X[0],@Tx[0],31); # "X[0]"<<<=1
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 (@Tx[0],@Tx[1],30);
eval(shift(@insns));
eval(shift(@insns));
&vshl_u32 (@Tx[1],@Tx[1],2);
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@Tx[0]);
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
foreach (@insns) { eval; } # remaining instructions [if any]
$Xi++; push(@X,shift(@X)); # "rotate" X[]
}
sub Xupdate_32_79 ()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8); # compose "X[-6]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@Tx[1],@X[-1&7],$K);
eval(shift(@insns));
&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
eval(shift(@insns));
&veor (@Tx[0],@Tx[0],@X[0]); # "X[-6]"^="X[0]"
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 (@X[0],@Tx[0],30);
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
&sub ($Xfer,$Xfer,64) if ($Xi%4==0);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 (@X[0],@Tx[0],2); # "X[0]"="X[-6]"<<<2
foreach (@insns) { eval; } # remaining instructions [if any]
$Xi++; push(@X,shift(@X)); # "rotate" X[]
}
sub Xuplast_80 ()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vadd_i32 (@Tx[1],@X[-1&7],$K);
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!");
&sub ($Xfer,$Xfer,64);
&teq ($inp,$len);
&sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
&subeq ($inp,$inp,64); # reload last block to avoid SEGV
&vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
eval(shift(@insns));
eval(shift(@insns));
&vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!");
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!"); # load K_00_19
eval(shift(@insns));
eval(shift(@insns));
&vrev32_8 (@X[-4&7],@X[-4&7]);
foreach (@insns) { eval; } # remaining instructions
$Xi=0;
}
sub Xloop()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K);
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU
foreach (@insns) { eval; }
$Xi++;
}
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu neon
.type sha1_block_data_order_neon,%function
.align 4
sha1_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
@ dmb @ errata #451034 on early Cortex A8
@ vstmdb sp!,{d8-d15} @ ABI specification says so
mov $saved_sp,sp
sub sp,sp,#64 @ alloca
adr $K_XX_XX,.LK_00_19
bic sp,sp,#15 @ align for 128-bit stores
ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
mov $Xfer,sp
vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
veor $zero,$zero,$zero
vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]!
vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19
vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on
vrev32.8 @X[-3&7],@X[-3&7] @ big-endian...
vrev32.8 @X[-2&7],@X[-2&7]
vadd.i32 @X[0],@X[-4&7],$K
vrev32.8 @X[-1&7],@X[-1&7]
vadd.i32 @X[1],@X[-3&7],$K
vst1.32 {@X[0]},[$Xfer,:128]!
vadd.i32 @X[2],@X[-2&7],$K
vst1.32 {@X[1]},[$Xfer,:128]!
vst1.32 {@X[2]},[$Xfer,:128]!
ldr $Ki,[sp] @ big RAW stall
.Loop_neon:
___
&Xupdate_16_31(\&body_00_19);
&Xupdate_16_31(\&body_00_19);
&Xupdate_16_31(\&body_00_19);
&Xupdate_16_31(\&body_00_19);
&Xupdate_32_79(\&body_00_19);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_20_39);
&Xuplast_80(\&body_20_39);
&Xloop(\&body_20_39);
&Xloop(\&body_20_39);
&Xloop(\&body_20_39);
$code.=<<___;
ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context
add $a,$a,$Ki
ldr $Ki,[$ctx,#16]
add $b,$b,$t0
add $c,$c,$t1
add $d,$d,$Xfer
moveq sp,$saved_sp
add $e,$e,$Ki
ldrne $Ki,[sp]
stmia $ctx,{$a,$b,$c,$d,$e}
addne $Xfer,sp,#3*16
bne .Loop_neon
@ vldmia sp!,{d8-d15}
ldmia sp!,{r4-r12,pc}
.size sha1_block_data_order_neon,.-sha1_block_data_order_neon
#endif
___
}}}
#####################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
my @MSG=map("q$_",(4..7));
my @Kxx=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
$code.=<<___;
#if __ARM_ARCH__>=7
.type sha1_block_data_order_armv8,%function
.align 5
sha1_block_data_order_armv8:
.LARMv8:
vstmdb sp!,{d8-d15} @ ABI specification says so
veor $E,$E,$E
adr r3,.LK_00_19
vld1.32 {$ABCD},[$ctx]!
vld1.32 {$E\[0]},[$ctx]
sub $ctx,$ctx,#16
vld1.32 {@Kxx[0]\[]},[r3,:32]!
vld1.32 {@Kxx[1]\[]},[r3,:32]!
vld1.32 {@Kxx[2]\[]},[r3,:32]!
vld1.32 {@Kxx[3]\[]},[r3,:32]
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vadd.i32 $W0,@Kxx[0],@MSG[0]
vrev32.8 @MSG[2],@MSG[2]
vmov $ABCD_SAVE,$ABCD @ offload
subs $len,$len,#1
vadd.i32 $W1,@Kxx[0],@MSG[1]
vrev32.8 @MSG[3],@MSG[3]
sha1h $E1,$ABCD @ 0
sha1c $ABCD,$E,$W0
vadd.i32 $W0,@Kxx[$j],@MSG[2]
sha1su0 @MSG[0],@MSG[1],@MSG[2]
___
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
sha1h $E0,$ABCD @ $i
sha1$f $ABCD,$E1,$W1
vadd.i32 $W1,@Kxx[$j],@MSG[3]
sha1su1 @MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
}
$code.=<<___;
sha1h $E0,$ABCD @ $i
sha1p $ABCD,$E1,$W1
vadd.i32 $W1,@Kxx[$j],@MSG[3]
sha1h $E1,$ABCD @ 18
sha1p $ABCD,$E0,$W0
sha1h $E0,$ABCD @ 19
sha1p $ABCD,$E1,$W1
vadd.i32 $E,$E,$E0
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
bne .Loop_v8
vst1.32 {$ABCD},[$ctx]!
vst1.32 {$E\[0]},[$ctx]
vldmia sp!,{d8-d15}
ret @ bx lr
.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.comm OPENSSL_armcap_P,4,4
___
{ my %opcode = (
"sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
"sha1m" => 0xf2200c40, "sha1su0" => 0xf2300c40,
"sha1h" => 0xf3b902c0, "sha1su1" => 0xf3ba0380 );
sub unsha1 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
# ARMv7 instructions are always encoded little-endian, hence the
# .byte emission below. The correct solution is to use the .inst
# directive, but older assemblers don't implement it:-(
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
}
foreach (split($/,$code)) {
s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo or
s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;
s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
s/\bret\b/bx lr/o or
s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4
print $_,$/;
}
close STDOUT; # enforce flush

File diff suppressed because it is too large.


@@ -0,0 +1,333 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
#               hardware-assisted       software(*)
# Apple A7      2.31                    4.13 (+14%)
# Cortex-A5x    n/a                     n/a
#
# (*) Software results are presented mostly for reference purposes.
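# Round constants are materialized with movz/movk pairs below, e.g.
# K_20_39 = 0x6ed9eba1 becomes "movz $K,#0xeba1" followed by
# "movk $K,#0x6ed9,lsl#16". A pure-Perl sketch of the split (ours,
# illustration only):
sub movz_movk_ref {
	my $k = shift;					# 32-bit constant
	return (sprintf("#0x%04x", $k & 0xffff),	# movz immediate
	        sprintf("#0x%04x,lsl#16", ($k >> 16) & 0xffff));  # movk
}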
$flavour = shift;
open STDOUT,">".shift;
($ctx,$inp,$num)=("x0","x1","x2");
@Xw=map("w$_",(3..17,19));
@Xx=map("x$_",(3..17,19));
@V=($A,$B,$C,$D,$E)=map("w$_",(20..24));
($t0,$t1,$t2,$K)=map("w$_",(25..28));
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i<15 && !($i&1));
lsr @Xx[$i+1],@Xx[$i],#32
___
$code.=<<___ if ($i<14 && !($i&1));
ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`]
___
$code.=<<___ if ($i<14 && ($i&1));
#ifdef __ARMEB__
ror @Xx[$i+1],@Xx[$i+1],#32
#else
rev32 @Xx[$i+1],@Xx[$i+1]
#endif
___
$code.=<<___ if ($i<14);
bic $t0,$d,$b
and $t1,$c,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
orr $t0,$t0,$t1
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
___
$code.=<<___ if ($i==19);
movz $K,#0xeba1
movk $K,#0x6ed9,lsl#16
___
$code.=<<___ if ($i>=14);
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
bic $t0,$d,$b
and $t1,$c,$b
ror $t2,$a,#27
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
add $d,$d,$K // future e+=K
orr $t0,$t0,$t1
add $e,$e,$t2 // e+=rot(a,5)
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i==59);
movz $K,#0xc1d6
movk $K,#0xca62,lsl#16
___
$code.=<<___;
orr $t0,$b,$c
and $t1,$b,$c
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
ror $t2,$a,#27
and $t0,$t0,$d
add $d,$d,$K // future e+=K
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
add $e,$e,$t2 // e+=rot(a,5)
orr $t0,$t0,$t1
ror $b,$b,#2
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i==39);
movz $K,#0xbcdc
movk $K,#0x8f1b,lsl#16
___
$code.=<<___ if ($i<78);
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
eor $t0,$d,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
$code.=<<___ if ($i==78);
ldp @Xw[1],@Xw[2],[$ctx]
eor $t0,$d,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
___
$code.=<<___ if ($i==79);
ldp @Xw[3],@Xw[4],[$ctx,#8]
eor $t0,$d,$b
ror $t2,$a,#27
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
ldr @Xw[5],[$ctx,#16]
add $e,$e,$t0 // e+=F(b,c,d)
___
}
$code.=<<___;
#include "arm_arch.h"
.text
.globl sha1_block_data_order
.type sha1_block_data_order,%function
.align 6
sha1_block_data_order:
ldr x16,.LOPENSSL_armcap_P
adr x17,.LOPENSSL_armcap_P
add x16,x16,x17
ldr w16,[x16]
tst w16,#ARMV8_SHA1
b.ne .Lv8_entry
stp x29,x30,[sp,#-96]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
ldp $A,$B,[$ctx]
ldp $C,$D,[$ctx,#8]
ldr $E,[$ctx,#16]
.Loop:
ldr @Xx[0],[$inp],#64
movz $K,#0x7999
sub $num,$num,#1
movk $K,#0x5a82,lsl#16
#ifdef __ARMEB__
ror @Xx[0],@Xx[0],#32
#else
rev32 @Xx[0],@Xx[0]
#endif
add $E,$E,$K // warm it up
add $E,$E,@Xw[0]
___
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
add $B,$B,@Xw[2]
add $C,$C,@Xw[3]
add $A,$A,@Xw[1]
add $D,$D,@Xw[4]
add $E,$E,@Xw[5]
stp $A,$B,[$ctx]
stp $C,$D,[$ctx,#8]
str $E,[$ctx,#16]
cbnz $num,.Loop
ldp x19,x20,[sp,#16]
ldp x21,x22,[sp,#32]
ldp x23,x24,[sp,#48]
ldp x25,x26,[sp,#64]
ldp x27,x28,[sp,#80]
ldr x29,[sp],#96
ret
.size sha1_block_data_order,.-sha1_block_data_order
___
{{{
my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
my @MSG=map("v$_.16b",(4..7));
my @Kxx=map("v$_.4s",(16..19));
my ($W0,$W1)=("v20.4s","v21.4s");
my $ABCD_SAVE="v22.16b";
$code.=<<___;
.type sha1_block_armv8,%function
.align 6
sha1_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
adr x4,.Lconst
eor $E,$E,$E
ld1.32 {$ABCD},[$ctx],#16
ld1.32 {$E}[0],[$ctx]
sub $ctx,$ctx,#16
ld1.32 {@Kxx[0]-@Kxx[3]},[x4]
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
add.i32 $W0,@Kxx[0],@MSG[0]
rev32 @MSG[2],@MSG[2]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
add.i32 $W1,@Kxx[0],@MSG[1]
rev32 @MSG[3],@MSG[3]
sha1h $E1,$ABCD
sha1c $ABCD,$E,$W0 // 0
add.i32 $W0,@Kxx[$j],@MSG[2]
sha1su0 @MSG[0],@MSG[1],@MSG[2]
___
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
sha1h $E0,$ABCD // $i
sha1$f $ABCD,$E1,$W1
add.i32 $W1,@Kxx[$j],@MSG[3]
sha1su1 @MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
}
$code.=<<___;
sha1h $E0,$ABCD // $i
sha1p $ABCD,$E1,$W1
add.i32 $W1,@Kxx[$j],@MSG[3]
sha1h $E1,$ABCD // 18
sha1p $ABCD,$E0,$W0
sha1h $E0,$ABCD // 19
sha1p $ABCD,$E1,$W1
add.i32 $E,$E,$E0
add.i32 $ABCD,$ABCD,$ABCD_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD},[$ctx],#16
st1.32 {$E}[0],[$ctx]
ldr x29,[sp],#16
ret
.size sha1_block_armv8,.-sha1_block_armv8
.align 6
.Lconst:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
.LOPENSSL_armcap_P:
.quad OPENSSL_armcap_P-.
.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
.comm OPENSSL_armcap_P,4,4
___
}}}
{ my %opcode = (
"sha1c" => 0x5e000000, "sha1p" => 0x5e001000,
"sha1m" => 0x5e002000, "sha1su0" => 0x5e003000,
"sha1h" => 0x5e280800, "sha1su1" => 0x5e281800 );
sub unsha1 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo;
s/\.\w?32\b//o and s/\.16b/\.4s/go;
m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
print $_,"\n";
}
close STDOUT;


@@ -0,0 +1,305 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# The eternal question is: what's wrong with compiler-generated code?
# The trick is that it's possible to reduce the number of shifts
# required to perform rotations by maintaining a copy of the 32-bit
# value in the upper bits of a 64-bit register. Just follow the mux2
# and shrp instructions... Performance under a big-endian OS such as
# HP-UX is 179MBps*1GHz, which is >50% better than HP C and >2x better
# than gcc.
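# A pure-Perl model of that trick (ours, illustration only, assuming a
# 64-bit perl): mux2 with pattern 0x44 replicates the low 32 bits into
# the upper half of the register, after which a single shrp yields any
# 32-bit rotate:
sub ror32_via_copy_ref {
	my ($x, $n) = @_;			# rotate right by $n, 0<$n<32
	my $doubled = ($x << 32) | $x;		# effect of "mux2 r,0x44"
	return ($doubled >> $n) & 0xffffffff;	# effect of "shrp r,r,$n"
}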
$code=<<___;
.ident \"sha1-ia64.s, version 1.3\"
.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
.explicit
___
if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
#$human=1;
if ($human) { # useful for visual code auditing...
($A,$B,$C,$D,$E) = ("A","B","C","D","E");
($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
( "K_00_19","K_20_39","K_40_59","K_60_79" );
@X= ( "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7",
"X8", "X9","X10","X11","X12","X13","X14","X15" );
}
else {
($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4");
($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9");
($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
( "r14", "r15", "loc10", "loc11" );
@X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
"r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" );
}
sub BODY_00_15 {
local *code=shift;
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $Xn=@X[$j%16];
$code.=<<___ if ($i==0);
{ .mmi; ld1 $X[$i]=[inp],2 // MSB
ld1 tmp2=[tmp3],2 };;
{ .mmi; ld1 tmp0=[inp],2
ld1 tmp4=[tmp3],2 // LSB
dep $X[$i]=$X[$i],tmp2,8,8 };;
___
if ($i<15) {
$code.=<<___;
{ .mmi; ld1 $Xn=[inp],2 // forward Xload
nop.m 0x0
dep tmp1=tmp0,tmp4,8,8 };;
{ .mmi; ld1 tmp2=[tmp3],2 // forward Xload
and tmp4=$c,$b
dep $X[$i]=$X[$i],tmp1,16,16} //;;
{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
andcm tmp1=$d,$b
dep.z tmp5=$a,5,27 };; // a<<5
{ .mmi; add $e=$e,$X[$i] // e+=Xload
or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
extr.u tmp1=$a,27,5 };; // a>>27
{ .mmi; ld1 tmp0=[inp],2 // forward Xload
add $e=$e,tmp4 // e+=F_00_19(b,c,d)
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; ld1 tmp4=[tmp3],2 // forward Xload
or tmp5=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)
dep $Xn=$Xn,tmp2,8,8 // forward Xload
mux2 $X[$i]=$X[$i],0x44 } //;;
___
}
else {
$code.=<<___;
{ .mii; and tmp3=$c,$b
dep tmp1=tmp0,tmp4,8,8;;
dep $X[$i]=$X[$i],tmp1,16,16} //;;
{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
andcm tmp1=$d,$b
dep.z tmp5=$a,5,27 };; // a<<5
{ .mmi; add $e=$e,$X[$i] // e+=Xupdate
or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
extr.u tmp1=$a,27,5 } // a>>27
{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
nop.i 0 };;
{ .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d)
xor $Xn=$Xn,tmp3 // forward Xupdate
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
mux2 $X[$i]=$X[$i],0x44 };;
___
}
}
sub BODY_16_19 {
local *code=shift;
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $Xn=@X[$j%16];
$code.=<<___;
{ .mib; add $e=$e,$K_00_19 // e+=K_00_19
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; andcm tmp1=$d,$b
and tmp0=$c,$b };;
{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
extr.u tmp1=$a,27,5 } // a>>27
{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
nop.i 0 };;
{ .mmi; add $e=$e,tmp0 // f+=F_00_19(b,c,d)
xor $Xn=$Xn,tmp3 // forward Xupdate
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
nop.i 0 };;
___
}
sub BODY_20_39 {
local *code=shift;
my ($i,$a,$b,$c,$d,$e,$Konst)=@_;
$Konst = $K_20_39 if (!defined($Konst));
my $j=$i+1;
my $Xn=@X[$j%16];
if ($i<79) {
$code.=<<___;
{ .mib; add $e=$e,$Konst // e+=K_XX_XX
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; xor tmp0=$c,$b
xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate
{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
extr.u tmp1=$a,27,5 } // a>>27
{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate
{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
nop.i 0 };;
___
}
else {
$code.=<<___;
{ .mib; add $e=$e,$Konst // e+=K_60_79
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; xor tmp0=$c,$b
add $h1=$h1,$a };; // wrap up
{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
extr.u tmp1=$a,27,5 } // a>>27
{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
add $h3=$h3,$c };; // wrap up
{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
or tmp1=tmp1,tmp5 // ROTATE(a,5)
shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;?
{ .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5)
add tmp3=1,inp // used in unaligned codepath
add $h4=$h4,$d };; // wrap up
___
}
}
sub BODY_40_59 {
local *code=shift;
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $Xn=@X[$j%16];
$code.=<<___;
{ .mib; add $e=$e,$K_40_59 // e+=K_40_59
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; and tmp1=$c,$d
xor tmp0=$c,$d };;
{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
add tmp5=tmp5,tmp1 // a<<5+(c&d)
extr.u tmp1=$a,27,5 } // a>>27
{ .mmi; and tmp0=tmp0,$b
xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate
{ .mmi; add $e=$e,tmp0 // e+=b&(c^d)
add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d)
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; xor $Xn=$Xn,tmp3
mux2 tmp6=$a,0x44 };; // see b in next iteration
{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d)
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
nop.i 0x0 };;
___
}
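# Note the Maj identity exploited above: F_40_59(b,c,d)=(b&c)|(b&d)|(c&d)
# equals (c&d)+(b&(c^d)), and the two terms are bitwise disjoint, so
# plain addition is safe and each term folds into its own add slot.
# Pure-Perl cross-check (ours, illustration only):
sub maj_ref   { my ($b,$c,$d) = @_; ($b & $c) | ($b & $d) | ($c & $d) }
sub maj_split { my ($b,$c,$d) = @_; ($c & $d) + ($b & ($c ^ $d)) }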
sub BODY_60_79 { &BODY_20_39(@_,$K_60_79); }
$code.=<<___;
.text
tmp0=r8;
tmp1=r9;
tmp2=r10;
tmp3=r11;
ctx=r32; // in0
inp=r33; // in1
// void sha1_block_data_order(SHA_CTX *c,const void *p,size_t num);
.global sha1_block_data_order#
.proc sha1_block_data_order#
.align 32
sha1_block_data_order:
.prologue
{ .mmi; alloc tmp1=ar.pfs,3,14,0,0
$ADDP tmp0=4,ctx
.save ar.lc,r3
mov r3=ar.lc }
{ .mmi; $ADDP ctx=0,ctx
$ADDP inp=0,inp
mov r2=pr };;
tmp4=in2;
tmp5=loc12;
tmp6=loc13;
.body
{ .mlx; ld4 $h0=[ctx],8
movl $K_00_19=0x5a827999 }
{ .mlx; ld4 $h1=[tmp0],8
movl $K_20_39=0x6ed9eba1 };;
{ .mlx; ld4 $h2=[ctx],8
movl $K_40_59=0x8f1bbcdc }
{ .mlx; ld4 $h3=[tmp0]
movl $K_60_79=0xca62c1d6 };;
{ .mmi; ld4 $h4=[ctx],-16
add in2=-1,in2 // adjust num for ar.lc
mov ar.ec=1 };;
{ .mmi; nop.m 0
add tmp3=1,inp
mov ar.lc=in2 };; // brp.loop.imp: too far
.Ldtop:
{ .mmi; mov $A=$h0
mov $B=$h1
mux2 tmp6=$h1,0x44 }
{ .mmi; mov $C=$h2
mov $D=$h3
mov $E=$h4 };;
___
{ my $i;
my @V=($A,$B,$C,$D,$E);
for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++) { &BODY_20_39(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }
(($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check
}
$code.=<<___;
{ .mmb; add $h0=$h0,$A
add $h2=$h2,$C
br.ctop.dptk.many .Ldtop };;
.Ldend:
{ .mmi; add tmp0=4,ctx
mov ar.lc=r3 };;
{ .mmi; st4 [ctx]=$h0,8
st4 [tmp0]=$h1,8 };;
{ .mmi; st4 [ctx]=$h2,8
st4 [tmp0]=$h3 };;
{ .mib; st4 [ctx]=$h4,-16
mov pr=r2,0x1ffff
br.ret.sptk.many b0 };;
.endp sha1_block_data_order#
stringz "SHA1 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___
$output=shift and open STDOUT,">$output";
print $code;

File diff suppressed because it is too large.


@@ -0,0 +1,354 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA1 block procedure for MIPS.
# The performance improvement is 30% on unaligned input. The "secret"
# is to deploy the lwl/lwr pair to load unaligned input. One could have
# vectorized Xupdate on MIPS III/IV, but the goal was to code a
# MIPS32-compatible subroutine. There is room for minor optimization on
# little-endian platforms...
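# The net effect of the lwl/lwr pair, modeled in pure Perl (ours, a
# hedged sketch of the big-endian case): two partial loads merge into
# one unaligned 32-bit word without ever taking an alignment trap.
sub load_unaligned_ref {
	my ($buf, $off) = @_;			# byte string, any offset
	return unpack("N", substr($buf, $off, 4));
}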
######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
# old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
$PTR_SUB="dsub"; # incidentally works even on n32
$REG_S="sd";
$REG_L="ld";
$PTR_SLL="dsll"; # incidentally works even on n32
$SZREG=8;
} else {
$PTR_ADD="add";
$PTR_SUB="sub";
$REG_S="sw";
$REG_L="lw";
$PTR_SLL="sll";
$SZREG=4;
}
#
# <appro@openssl.org>
#
######################################################################
$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0 if ($ENV{CC});
for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";
if (!defined($big_endian))
{ $big_endian=(unpack('L',pack('N',1))==1); }
# offsets of the Most and Least Significant Bytes
$MSB=$big_endian?0:3;
$LSB=3&~$MSB;
@X=map("\$$_",(8..23)); # a4-a7,s0-s11
$ctx=$a0;
$inp=$a1;
$num=$a2;
$A="\$1";
$B="\$2";
$C="\$3";
$D="\$7";
$E="\$24"; @V=($A,$B,$C,$D,$E);
$t0="\$25";
$t1=$num; # $num is offloaded to stack
$t2="\$30"; # fp
$K="\$31"; # ra
sub BODY_00_14 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if (!$big_endian);
srl $t0,@X[$i],24 # byte swap($i)
srl $t1,@X[$i],8
andi $t2,@X[$i],0xFF00
sll @X[$i],@X[$i],24
andi $t1,0xFF00
sll $t2,$t2,8
or @X[$i],$t0
or $t1,$t2
or @X[$i],$t1
___
$code.=<<___;
lwl @X[$j],$j*4+$MSB($inp)
sll $t0,$a,5 # $i
addu $e,$K
lwr @X[$j],$j*4+$LSB($inp)
srl $t1,$a,27
addu $e,$t0
xor $t0,$c,$d
addu $e,$t1
sll $t2,$b,30
and $t0,$b
srl $b,$b,2
xor $t0,$d
addu $e,@X[$i]
or $b,$t2
addu $e,$t0
___
}
sub BODY_15_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if (!$big_endian && $i==15);
srl $t0,@X[$i],24 # byte swap($i)
srl $t1,@X[$i],8
andi $t2,@X[$i],0xFF00
sll @X[$i],@X[$i],24
andi $t1,0xFF00
sll $t2,$t2,8
or @X[$i],$t0
or @X[$i],$t1
or @X[$i],$t2
___
$code.=<<___;
xor @X[$j%16],@X[($j+2)%16]
sll $t0,$a,5 # $i
addu $e,$K
srl $t1,$a,27
addu $e,$t0
xor @X[$j%16],@X[($j+8)%16]
xor $t0,$c,$d
addu $e,$t1
xor @X[$j%16],@X[($j+13)%16]
sll $t2,$b,30
and $t0,$b
srl $t1,@X[$j%16],31
addu @X[$j%16],@X[$j%16]
srl $b,$b,2
xor $t0,$d
or @X[$j%16],$t1
addu $e,@X[$i%16]
or $b,$t2
addu $e,$t0
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
xor @X[$j%16],@X[($j+2)%16]
sll $t0,$a,5 # $i
addu $e,$K
srl $t1,$a,27
addu $e,$t0
xor @X[$j%16],@X[($j+8)%16]
xor $t0,$c,$d
addu $e,$t1
xor @X[$j%16],@X[($j+13)%16]
sll $t2,$b,30
xor $t0,$b
srl $t1,@X[$j%16],31
addu @X[$j%16],@X[$j%16]
srl $b,$b,2
addu $e,@X[$i%16]
or @X[$j%16],$t1
or $b,$t2
addu $e,$t0
___
$code.=<<___ if ($i==79);
lw @X[0],0($ctx)
sll $t0,$a,5 # $i
addu $e,$K
lw @X[1],4($ctx)
srl $t1,$a,27
addu $e,$t0
lw @X[2],8($ctx)
xor $t0,$c,$d
addu $e,$t1
lw @X[3],12($ctx)
sll $t2,$b,30
xor $t0,$b
lw @X[4],16($ctx)
srl $b,$b,2
addu $e,@X[$i%16]
or $b,$t2
addu $e,$t0
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
xor @X[$j%16],@X[($j+2)%16]
sll $t0,$a,5 # $i
addu $e,$K
srl $t1,$a,27
addu $e,$t0
xor @X[$j%16],@X[($j+8)%16]
and $t0,$c,$d
addu $e,$t1
xor @X[$j%16],@X[($j+13)%16]
sll $t2,$b,30
addu $e,$t0
srl $t1,@X[$j%16],31
xor $t0,$c,$d
addu @X[$j%16],@X[$j%16]
and $t0,$b
srl $b,$b,2
or @X[$j%16],$t1
addu $e,@X[$i%16]
or $b,$t2
addu $e,$t0
___
}
$FRAMESIZE=16; # large enough to accommodate NUBI saved registers
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
$code=<<___;
#ifdef OPENSSL_FIPSCANISTER
# include <openssl/fipssyms.h>
#endif
.text
.set noat
.set noreorder
.align 5
.globl sha1_block_data_order
.ent sha1_block_data_order
sha1_block_data_order:
.frame $sp,$FRAMESIZE*$SZREG,$ra
.mask $SAVED_REGS_MASK,-$SZREG
.set noreorder
$PTR_SUB $sp,$FRAMESIZE*$SZREG
$REG_S $ra,($FRAMESIZE-1)*$SZREG($sp)
$REG_S $fp,($FRAMESIZE-2)*$SZREG($sp)
$REG_S $s11,($FRAMESIZE-3)*$SZREG($sp)
$REG_S $s10,($FRAMESIZE-4)*$SZREG($sp)
$REG_S $s9,($FRAMESIZE-5)*$SZREG($sp)
$REG_S $s8,($FRAMESIZE-6)*$SZREG($sp)
$REG_S $s7,($FRAMESIZE-7)*$SZREG($sp)
$REG_S $s6,($FRAMESIZE-8)*$SZREG($sp)
$REG_S $s5,($FRAMESIZE-9)*$SZREG($sp)
$REG_S $s4,($FRAMESIZE-10)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
$REG_S $s3,($FRAMESIZE-11)*$SZREG($sp)
$REG_S $s2,($FRAMESIZE-12)*$SZREG($sp)
$REG_S $s1,($FRAMESIZE-13)*$SZREG($sp)
$REG_S $s0,($FRAMESIZE-14)*$SZREG($sp)
$REG_S $gp,($FRAMESIZE-15)*$SZREG($sp)
___
$code.=<<___;
$PTR_SLL $num,6
$PTR_ADD $num,$inp
$REG_S $num,0($sp)
lw $A,0($ctx)
lw $B,4($ctx)
lw $C,8($ctx)
lw $D,12($ctx)
b .Loop
lw $E,16($ctx)
.align 4
.Loop:
.set reorder
lwl @X[0],$MSB($inp)
lui $K,0x5a82
lwr @X[0],$LSB($inp)
ori $K,0x7999 # K_00_19
___
for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
lui $K,0x6ed9
ori $K,0xeba1 # K_20_39
___
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
lui $K,0x8f1b
ori $K,0xbcdc # K_40_59
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
lui $K,0xca62
ori $K,0xc1d6 # K_60_79
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
$PTR_ADD $inp,64
$REG_L $num,0($sp)
addu $A,$X[0]
addu $B,$X[1]
sw $A,0($ctx)
addu $C,$X[2]
addu $D,$X[3]
sw $B,4($ctx)
addu $E,$X[4]
sw $C,8($ctx)
sw $D,12($ctx)
sw $E,16($ctx)
.set noreorder
bne $inp,$num,.Loop
nop
.set noreorder
$REG_L $ra,($FRAMESIZE-1)*$SZREG($sp)
$REG_L $fp,($FRAMESIZE-2)*$SZREG($sp)
$REG_L $s11,($FRAMESIZE-3)*$SZREG($sp)
$REG_L $s10,($FRAMESIZE-4)*$SZREG($sp)
$REG_L $s9,($FRAMESIZE-5)*$SZREG($sp)
$REG_L $s8,($FRAMESIZE-6)*$SZREG($sp)
$REG_L $s7,($FRAMESIZE-7)*$SZREG($sp)
$REG_L $s6,($FRAMESIZE-8)*$SZREG($sp)
$REG_L $s5,($FRAMESIZE-9)*$SZREG($sp)
$REG_L $s4,($FRAMESIZE-10)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_L $s3,($FRAMESIZE-11)*$SZREG($sp)
$REG_L $s2,($FRAMESIZE-12)*$SZREG($sp)
$REG_L $s1,($FRAMESIZE-13)*$SZREG($sp)
$REG_L $s0,($FRAMESIZE-14)*$SZREG($sp)
$REG_L $gp,($FRAMESIZE-15)*$SZREG($sp)
___
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end sha1_block_data_order
.rdata
.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
print $code;
close STDOUT;


@@ -0,0 +1,260 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA1 block procedure for PA-RISC.
# June 2009.
#
# On PA-7100LC performance is >30% better than gcc 3.2 generated code
# for aligned input and >50% better for unaligned. Compared to the
# vendor compiler on PA-8600 it's almost 60% faster in a 64-bit build
# and just a few percent faster in a 32-bit one (this is for aligned
# input; data for unaligned input is not available).
#
# Special thanks to polarhome.com for providing HP-UX account.
$flavour = shift;
$output = shift;
open STDOUT,">$output";
if ($flavour =~ /64/) {
$LEVEL ="2.0W";
$SIZE_T =8;
$FRAME_MARKER =80;
$SAVED_RP =16;
$PUSH ="std";
$PUSHMA ="std,ma";
$POP ="ldd";
$POPMB ="ldd,mb";
} else {
$LEVEL ="1.0";
$SIZE_T =4;
$FRAME_MARKER =48;
$SAVED_RP =20;
$PUSH ="stw";
$PUSHMA ="stwm";
$POP ="ldw";
$POPMB ="ldwm";
}
$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
# [+ argument transfer]
$ctx="%r26"; # arg0
$inp="%r25"; # arg1
$num="%r24"; # arg2
$t0="%r28";
$t1="%r29";
$K="%r31";
@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
"%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0);
@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23");
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<15);
addl $K,$e,$e ; $i
shd $a,$a,27,$t1
addl @X[$i],$e,$e
and $c,$b,$t0
addl $t1,$e,$e
andcm $d,$b,$t1
shd $b,$b,2,$b
or $t1,$t0,$t0
addl $t0,$e,$e
___
$code.=<<___ if ($i>=15); # with forward Xupdate
addl $K,$e,$e ; $i
shd $a,$a,27,$t1
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
addl @X[$i%16],$e,$e
and $c,$b,$t0
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
addl $t1,$e,$e
andcm $d,$b,$t1
shd $b,$b,2,$b
or $t1,$t0,$t0
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
add $t0,$e,$e
shd @X[$j%16],@X[$j%16],31,@X[$j%16]
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i
addl $K,$e,$e
shd $a,$a,27,$t1
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
addl @X[$i%16],$e,$e
xor $b,$c,$t0
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
addl $t1,$e,$e
shd $b,$b,2,$b
xor $d,$t0,$t0
shd @X[$j%16],@X[$j%16],31,@X[$j%16]
addl $t0,$e,$e
___
$code.=<<___ if ($i==79); # with context load
ldw 0($ctx),@X[0] ; $i
addl $K,$e,$e
shd $a,$a,27,$t1
ldw 4($ctx),@X[1]
addl @X[$i%16],$e,$e
xor $b,$c,$t0
ldw 8($ctx),@X[2]
addl $t1,$e,$e
shd $b,$b,2,$b
xor $d,$t0,$t0
ldw 12($ctx),@X[3]
addl $t0,$e,$e
ldw 16($ctx),@X[4]
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
shd $a,$a,27,$t1 ; $i
addl $K,$e,$e
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
xor $d,$c,$t0
addl @X[$i%16],$e,$e
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
and $b,$t0,$t0
addl $t1,$e,$e
shd $b,$b,2,$b
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
addl $t0,$e,$e
and $d,$c,$t1
shd @X[$j%16],@X[$j%16],31,@X[$j%16]
addl $t1,$e,$e
___
}
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
sha1_block_data_order
.PROC
.CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
$PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
$PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
$PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
$PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
$PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
ldw 0($ctx),$A
ldw 4($ctx),$B
ldw 8($ctx),$C
ldw 12($ctx),$D
ldw 16($ctx),$E
extru $inp,31,2,$t0 ; t0=inp&3;
sh3addl $t0,%r0,$t0 ; t0*=8;
subi 32,$t0,$t0 ; t0=32-t0;
mtctl $t0,%cr11 ; %sar=t0;
L\$oop
ldi 3,$t0
andcm $inp,$t0,$t0 ; 64-bit neutral
___
for ($i=0;$i<15;$i++) { # load input block
$code.="\tldw `4*$i`($t0),@X[$i]\n"; }
$code.=<<___;
cmpb,*= $inp,$t0,L\$aligned
ldw 60($t0),@X[15]
ldw 64($t0),@X[16]
___
for ($i=0;$i<16;$i++) { # align input
$code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; }
$code.=<<___;
L\$aligned
ldil L'0x5a827000,$K ; K_00_19
ldo 0x999($K),$K
___
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldil L'0x6ed9e000,$K ; K_20_39
ldo 0xba1($K),$K
___
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldil L'0x8f1bb000,$K ; K_40_59
ldo 0xcdc($K),$K
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldil L'0xca62c000,$K ; K_60_79
ldo 0x1d6($K),$K
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
addl @X[0],$A,$A
addl @X[1],$B,$B
addl @X[2],$C,$C
addl @X[3],$D,$D
addl @X[4],$E,$E
stw $A,0($ctx)
stw $B,4($ctx)
stw $C,8($ctx)
stw $D,12($ctx)
stw $E,16($ctx)
addib,*<> -1,$num,L\$oop
ldo 64($inp),$inp
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
$POP `-$FRAME+9*$SIZE_T`(%sp),%r12
$POP `-$FRAME+10*$SIZE_T`(%sp),%r13
$POP `-$FRAME+11*$SIZE_T`(%sp),%r14
$POP `-$FRAME+12*$SIZE_T`(%sp),%r15
$POP `-$FRAME+13*$SIZE_T`(%sp),%r16
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/,\*/,/gm if ($SIZE_T==4);
$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
print $code;
close STDOUT;


@@ -0,0 +1,326 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# I let hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise it's a straightforward
# implementation with the X vector in the register bank. The module is
# big-endian [which is no big deal, as there are no little-endian
# targets left around].
#
# (*) this means that this module is inappropriate for PPC403? Does
# anybody know if pre-POWER3 can sustain an unaligned load?
#                       -m64    -m32
# ------------------------------------
# PPC970,gcc-4.0.0      +76%    +59%
# Power6,xlc-7          +68%    +33%
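# The Lunaligned path below first computes how many whole 64-byte blocks
# fit before the next 4096-byte page boundary (the subfic/andi./srwi.
# triple). A pure-Perl sketch of that computation (ours, illustration
# only):
sub blocks_before_page_ref {
	my $inp = shift;			# input address
	my $dist = (4096 - ($inp & 4095)) & 4095;	# distance to page boundary
	return $dist >> 6;			# whole 64-byte blocks
}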
$flavour = shift;
if ($flavour =~ /64/) {
$SIZE_T =8;
$LRSAVE =2*$SIZE_T;
$UCMP ="cmpld";
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
$LRSAVE =$SIZE_T;
$UCMP ="cmplw";
$STU ="stwu";
$POP ="lwz";
$PUSH ="stw";
} else { die "nonsense $flavour"; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$FRAME=24*$SIZE_T+64;
$LOCALS=6*$SIZE_T;
$K ="r0";
$sp ="r1";
$toc="r2";
$ctx="r3";
$inp="r4";
$num="r5";
$t0 ="r15";
$t1 ="r6";
$A ="r7";
$B ="r8";
$C ="r9";
$D ="r10";
$E ="r11";
$T ="r12";
@V=($A,$B,$C,$D,$E,$T);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
"r24","r25","r26","r27","r28","r29","r30","r31");
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
lwz @X[$i],`$i*4`($inp)
___
$code.=<<___ if ($i<15);
lwz @X[$j],`$j*4`($inp)
add $f,$K,$e
rotlwi $e,$a,5
add $f,$f,@X[$i]
and $t0,$c,$b
add $f,$f,$e
andc $t1,$d,$b
rotlwi $b,$b,30
or $t0,$t0,$t1
add $f,$f,$t0
___
$code.=<<___ if ($i>=15);
add $f,$K,$e
rotlwi $e,$a,5
xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
add $f,$f,@X[$i%16]
and $t0,$c,$b
xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
add $f,$f,$e
andc $t1,$d,$b
rotlwi $b,$b,30
or $t0,$t0,$t1
xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
add $f,$f,$t0
rotlwi @X[$j%16],@X[$j%16],1
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
add $f,$K,$e
rotlwi $e,$a,5
xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
add $f,$f,@X[$i%16]
xor $t0,$b,$c
xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
add $f,$f,$e
rotlwi $b,$b,30
xor $t0,$t0,$d
xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
add $f,$f,$t0
rotlwi @X[$j%16],@X[$j%16],1
___
$code.=<<___ if ($i==79);
add $f,$K,$e
rotlwi $e,$a,5
lwz r16,0($ctx)
add $f,$f,@X[$i%16]
xor $t0,$b,$c
lwz r17,4($ctx)
add $f,$f,$e
rotlwi $b,$b,30
lwz r18,8($ctx)
xor $t0,$t0,$d
lwz r19,12($ctx)
add $f,$f,$t0
lwz r20,16($ctx)
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
add $f,$K,$e
rotlwi $e,$a,5
xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
add $f,$f,@X[$i%16]
and $t0,$b,$c
xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
add $f,$f,$e
or $t1,$b,$c
rotlwi $b,$b,30
xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
and $t1,$t1,$d
or $t0,$t0,$t1
rotlwi @X[$j%16],@X[$j%16],1
add $f,$f,$t0
___
}
$code=<<___;
.machine "any"
.text
.globl .sha1_block_data_order
.align 4
.sha1_block_data_order:
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
$PUSH r18,`$FRAME-$SIZE_T*14`($sp)
$PUSH r19,`$FRAME-$SIZE_T*13`($sp)
$PUSH r20,`$FRAME-$SIZE_T*12`($sp)
$PUSH r21,`$FRAME-$SIZE_T*11`($sp)
$PUSH r22,`$FRAME-$SIZE_T*10`($sp)
$PUSH r23,`$FRAME-$SIZE_T*9`($sp)
$PUSH r24,`$FRAME-$SIZE_T*8`($sp)
$PUSH r25,`$FRAME-$SIZE_T*7`($sp)
$PUSH r26,`$FRAME-$SIZE_T*6`($sp)
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $A,0($ctx)
lwz $B,4($ctx)
lwz $C,8($ctx)
lwz $D,12($ctx)
lwz $E,16($ctx)
andi. r0,$inp,3
bne Lunaligned
Laligned:
mtctr $num
bl Lsha1_block_private
b Ldone
; The PowerPC specification allows an implementation to be ill-behaved
; upon an unaligned access which crosses a page boundary. The "better
; safe than sorry" principle makes me treat it specially. I don't look
; for the particular offending word, but rather for the 64-byte input
; block which crosses the boundary. Once found, that block is aligned
; and hashed separately...
.align 4
Lunaligned:
subfic $t1,$inp,4096
andi. $t1,$t1,4095 ; distance to closest page boundary
srwi. $t1,$t1,6 ; t1/=64
beq Lcross_page
$UCMP $num,$t1
ble- Laligned ; didn't cross the page boundary
mtctr $t1
subfc $num,$t1,$num
bl Lsha1_block_private
Lcross_page:
li $t1,16
mtctr $t1
addi r20,$sp,$LOCALS ; spot within the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
lbz r18,2($inp)
lbz r19,3($inp)
addi $inp,$inp,4
stb r16,0(r20)
stb r17,1(r20)
stb r18,2(r20)
stb r19,3(r20)
addi r20,r20,4
bdnz Lmemcpy
$PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
li $t1,1
addi $inp,$sp,$LOCALS
mtctr $t1
bl Lsha1_block_private
$POP $inp,`$FRAME-$SIZE_T*18`($sp)
addic. $num,$num,-1
bne- Lunaligned
Ldone:
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
___
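# A hedged illustration of the Lunaligned arithmetic above: the number
# of whole 64-byte blocks that fit before the next 4K page boundary is
# ((4096-addr)&4095)>>6, which is exactly what the subfic/andi./srwi.
# triple computes. Hypothetical helper for clarity only; the generator
# never calls it.
sub blocks_before_page_boundary {
	my ($addr)=@_;				# byte address of the input
	return ((4096-($addr&4095))&4095)>>6;	# whole blocks before the crossing one
}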
# This is the private block function, which uses a tailored calling
# interface: upon entry the SHA_CTX is pre-loaded into the designated
# registers and the counter register contains the number of 64-byte
# chunks to digest...
$code.=<<___;
.align 4
Lsha1_block_private:
___
$code.=<<___; # load K_00_19
lis $K,0x5a82
ori $K,$K,0x7999
___
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; # load K_20_39
lis $K,0x6ed9
ori $K,$K,0xeba1
___
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; # load K_40_59
lis $K,0x8f1b
ori $K,$K,0xbcdc
___
for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; # load K_60_79
lis $K,0xca62
ori $K,$K,0xc1d6
___
for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
add r16,r16,$E
add r17,r17,$T
add r18,r18,$A
add r19,r19,$B
add r20,r20,$C
stw r16,0($ctx)
mr $A,r16
stw r17,4($ctx)
mr $B,r17
stw r18,8($ctx)
mr $C,r18
stw r19,12($ctx)
mr $D,r19
stw r20,16($ctx)
mr $E,r20
addi $inp,$inp,`16*4`
bdnz- Lsha1_block_private
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
___
$code.=<<___;
.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,246 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA1 block procedure for s390x.
# April 2007.
#
# Performance is >30% better than gcc 3.3 generated code. But the real
# twist is that SHA1 hardware support is detected and utilized, in
# which case performance can reach a further >4.5x for larger chunks.
# January 2009.
#
# Optimize Xupdate for the number of memory references and reschedule
# instructions to favour the dual-issue z10 pipeline. On z10, hardware
# is "only" ~2.3x faster than software.
# November 2010.
#
# Adapt for the -m31 build. If the kernel supports what is called the
# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
# legacy application context. The feature is not specific to any
# particular processor, as long as it's a "z" CPU; the latter implies
# that the code remains z/Architecture specific.
$kimdfunc=1; # magic function code for kimd instruction
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$K_00_39="%r0"; $K=$K_00_39;
$K_40_79="%r1";
$ctx="%r2"; $prefetch="%r2";
$inp="%r3";
$len="%r4";
$A="%r5";
$B="%r6";
$C="%r7";
$D="%r8";
$E="%r9"; @V=($A,$B,$C,$D,$E);
$t0="%r10";
$t1="%r11";
@X=("%r12","%r13","%r14");
$sp="%r15";
$stdframe=16*$SIZE_T+4*8;
$frame=$stdframe+16*4;
sub Xupdate {
my $i=shift;
$code.=<<___ if ($i==15);
lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up
lr $X[0],$X[2]
___
return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
$code.=<<___ if ($i<16);
lg $X[0],`$i*4`($inp) ### Xload($i)
rllg $X[1],$X[0],32
___
$code.=<<___ if ($i>=16);
xgr $X[0],$prefetch ### Xupdate($i)
lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp)
xg $X[0],`$stdframe+4*(($i+8)%16)`($sp)
xgr $X[0],$prefetch
rll $X[0],$X[0],1
rllg $X[1],$X[0],32
rll $X[1],$X[1],1
rllg $X[0],$X[1],32
lr $X[2],$X[1] # feedback
___
$code.=<<___ if ($i<=70);
stg $X[0],`$stdframe+4*($i%16)`($sp)
___
unshift(@X,pop(@X));
}
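# For reference, a scalar Perl form of the schedule recurrence that
# Xupdate above computes two words at a time in 64-bit registers,
# W[i] = ROTL1(W[i-3]^W[i-8]^W[i-14]^W[i-16]); a hypothetical helper,
# never called by the generator.
sub Xupdate_ref {
	my @W=@_;				# the 16 most recent 32-bit words, oldest first
	my $x=$W[13]^$W[8]^$W[2]^$W[0];		# W[i-3]^W[i-8]^W[i-14]^W[i-16]
	return (($x<<1)|($x>>31))&0xffffffff;	# ROTL1
}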
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];
&Xupdate($i);
$code.=<<___;
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$d
xr $t0,$c
alr $e,$t1
nr $t0,$b
alr $e,$xi
xr $t0,$d
rll $b,$b,30
alr $e,$t0
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];
&Xupdate($i);
$code.=<<___;
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$b
alr $e,$t1
xr $t0,$c
alr $e,$xi
xr $t0,$d
rll $b,$b,30
alr $e,$t0
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];
&Xupdate($i);
$code.=<<___;
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$b
alr $e,$t1
or $t0,$c
lr $t1,$b
nr $t0,$d
nr $t1,$c
alr $e,$xi
or $t0,$t1
rll $b,$b,30
alr $e,$t0
___
}
$code.=<<___;
.text
.align 64
.type Ktable,\@object
Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
.skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0
.size Ktable,.-Ktable
.globl sha1_block_data_order
.type sha1_block_data_order,\@function
sha1_block_data_order:
___
$code.=<<___ if ($kimdfunc);
larl %r1,OPENSSL_s390xcap_P
lg %r0,0(%r1)
tmhl %r0,0x4000 # check for message-security assist
jz .Lsoftware
lghi %r0,0
la %r1,`2*$SIZE_T`($sp)
.long 0xb93e0002 # kimd %r0,%r2
lg %r0,`2*$SIZE_T`($sp)
tmhh %r0,`0x8000>>$kimdfunc`
jz .Lsoftware
lghi %r0,$kimdfunc
lgr %r1,$ctx
lgr %r2,$inp
sllg %r3,$len,6
.long 0xb93e0002 # kimd %r0,%r2
brc 1,.-4 # pay attention to "partial completion"
br %r14
.align 16
.Lsoftware:
___
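# A reading of the probe above: KIMD with function code 0 in %r0 is the
# query function and stores a 16-byte status block at the address in
# %r1, whose bits indicate which function codes are installed; tmhh
# then tests the bit for function code $kimdfunc [1, i.e. KIMD-SHA-1]
# before committing to the hardware path, where the second KIMD hashes
# the data with $ctx as the parameter block.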
$code.=<<___;
lghi %r1,-$frame
st${g} $ctx,`2*$SIZE_T`($sp)
stm${g} %r6,%r15,`6*$SIZE_T`($sp)
lgr %r0,$sp
la $sp,0(%r1,$sp)
st${g} %r0,0($sp)
larl $t0,Ktable
llgf $A,0($ctx)
llgf $B,4($ctx)
llgf $C,8($ctx)
llgf $D,12($ctx)
llgf $E,16($ctx)
lg $K_00_39,0($t0)
lg $K_40_79,8($t0)
.Lloop:
rllg $K_00_39,$K_00_39,32
___
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
rllg $K_00_39,$K_00_39,32
___
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; $K=$K_40_79;
rllg $K_40_79,$K_40_79,32
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
rllg $K_40_79,$K_40_79,32
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
l${g} $ctx,`$frame+2*$SIZE_T`($sp)
la $inp,64($inp)
al $A,0($ctx)
al $B,4($ctx)
al $C,8($ctx)
al $D,12($ctx)
al $E,16($ctx)
st $A,0($ctx)
st $B,4($ctx)
st $C,8($ctx)
st $D,12($ctx)
st $E,16($ctx)
brct${g} $len,.Lloop
lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
br %r14
.size sha1_block_data_order,.-sha1_block_data_order
.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.comm OPENSSL_s390xcap_P,16,8
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,284 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# Performance improvement is not really impressive on pre-T1 CPUs: +8%
# over Sun C and +25% over gcc [3.3]. On T1, a.k.a. Niagara, however,
# it turned out to be 40% faster than 64-bit code generated by Sun C
# 5.8 and >2x faster than 64-bit code generated by gcc 3.4. And there
# is a gimmick: the X[16] vector is packed into 8 64-bit registers, so
# nothing is spilled on the stack. In addition, input data is loaded in
# a compact instruction sequence, minimizing the window in which the
# code is subject to the [inter-thread] cache-thrashing hazard. The
# goal is to ensure scalability on UltraSPARC T1, or rather to avoid
# decay when the number of active threads exceeds the number of
# physical cores.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }
$output=shift;
open STDOUT,">$output";
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
$rot1m="%g2";
$tmp64="%g3";
$Xi="%g4";
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);
$K_00_19="%l5";
$K_20_39="%l6";
$K_40_59="%l7";
$K_60_79="%g5";
@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);
$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=($i&1)?@X[($i/2)%8]:$Xi;
$code.=<<___;
sll $a,5,$tmp0 !! $i
add @K[$i/20],$e,$e
srl $a,27,$tmp1
add $tmp0,$e,$e
and $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
andn $d,$b,$tmp1
srl $b,2,$b
or $tmp1,$tmp0,$tmp1
or $tmp2,$b,$b
add $xi,$e,$e
___
if ($i&1 && $i<15) {
$code.=
" srlx @X[(($i+1)/2)%8],32,$Xi\n";
}
$code.=<<___;
add $tmp1,$e,$e
___
}
sub Xupdate {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i/2;
if ($i&1) {
$code.=<<___;
sll $a,5,$tmp0 !! $i
add @K[$i/20],$e,$e
srl $a,27,$tmp1
___
} else {
$code.=<<___;
sllx @X[($j+6)%8],32,$Xi ! Xupdate($i)
xor @X[($j+1)%8],@X[$j%8],@X[$j%8]
srlx @X[($j+7)%8],32,$tmp1
xor @X[($j+4)%8],@X[$j%8],@X[$j%8]
sll $a,5,$tmp0 !! $i
or $tmp1,$Xi,$Xi
add @K[$i/20],$e,$e !!
xor $Xi,@X[$j%8],@X[$j%8]
srlx @X[$j%8],31,$Xi
add @X[$j%8],@X[$j%8],@X[$j%8]
and $Xi,$rot1m,$Xi
andn @X[$j%8],$rot1m,@X[$j%8]
srl $a,27,$tmp1 !!
or $Xi,@X[$j%8],@X[$j%8]
___
}
}
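# A hedged Perl model of the dual-lane rotate trick used above: two
# 32-bit words live in one 64-bit register and $rot1m holds
# 0x0000000100000001, so rotating both lanes left by 1 amounts to
# extracting each lane's top bit, doubling the register, masking off
# the contaminated carry-in bits and or-ing the saved bits back in.
# Hypothetical cross-check only (assumes a 64-bit perl); never called.
sub rotl1_dual_ref {
	my ($x)=@_;			# 64-bit value holding two 32-bit lanes
	my $rot1m=(1<<32)|1;		# same constant the code keeps in %g2
	my $bits=($x>>31)&$rot1m;	# srlx+and: each lane's former top bit
	$x=($x<<1)&~$rot1m;		# add+andn: drop the stray carry-ins
	return $x|$bits;		# or: both lanes rotated left by 1
}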
sub BODY_16_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
&Xupdate(@_);
if ($i&1) {
$xi=@X[($i/2)%8];
} else {
$xi=$Xi;
$code.="\tsrlx @X[($i/2)%8],32,$xi\n";
}
$code.=<<___;
add $tmp0,$e,$e !!
and $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
add $xi,$e,$e
andn $d,$b,$tmp1
srl $b,2,$b
or $tmp1,$tmp0,$tmp1
or $tmp2,$b,$b
add $tmp1,$e,$e
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
&Xupdate(@_);
if ($i&1) {
$xi=@X[($i/2)%8];
} else {
$xi=$Xi;
$code.="\tsrlx @X[($i/2)%8],32,$xi\n";
}
$code.=<<___;
add $tmp0,$e,$e !!
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $xi,$e,$e
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
&Xupdate(@_);
if ($i&1) {
$xi=@X[($i/2)%8];
} else {
$xi=$Xi;
$code.="\tsrlx @X[($i/2)%8],32,$xi\n";
}
$code.=<<___;
add $tmp0,$e,$e !!
and $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
or $c,$b,$tmp1
srl $b,2,$b
and $d,$tmp1,$tmp1
add $xi,$e,$e
or $tmp1,$tmp0,$tmp1
or $tmp2,$b,$b
add $tmp1,$e,$e
___
}
$code.=<<___ if ($bits==64);
.register %g2,#scratch
.register %g3,#scratch
___
$code.=<<___;
.section ".text",#alloc,#execinstr
.align 32
.globl sha1_block_data_order
sha1_block_data_order:
save %sp,-$frame,%sp
sllx $len,6,$len
add $inp,$len,$len
or %g0,1,$rot1m
sllx $rot1m,32,$rot1m
or $rot1m,1,$rot1m
ld [$ctx+0],$A
ld [$ctx+4],$B
ld [$ctx+8],$C
ld [$ctx+12],$D
ld [$ctx+16],$E
andn $inp,7,$tmp0
sethi %hi(0x5a827999),$K_00_19
or $K_00_19,%lo(0x5a827999),$K_00_19
sethi %hi(0x6ed9eba1),$K_20_39
or $K_20_39,%lo(0x6ed9eba1),$K_20_39
sethi %hi(0x8f1bbcdc),$K_40_59
or $K_40_59,%lo(0x8f1bbcdc),$K_40_59
sethi %hi(0xca62c1d6),$K_60_79
or $K_60_79,%lo(0xca62c1d6),$K_60_79
.Lloop:
ldx [$tmp0+0],@X[0]
ldx [$tmp0+16],@X[2]
ldx [$tmp0+32],@X[4]
ldx [$tmp0+48],@X[6]
and $inp,7,$tmp1
ldx [$tmp0+8],@X[1]
sll $tmp1,3,$tmp1
ldx [$tmp0+24],@X[3]
subcc %g0,$tmp1,$tmp2 ! should be 64-$tmp1, but -$tmp1 works too
ldx [$tmp0+40],@X[5]
bz,pt %icc,.Laligned
ldx [$tmp0+56],@X[7]
sllx @X[0],$tmp1,@X[0]
ldx [$tmp0+64],$tmp64
___
for($i=0;$i<7;$i++)
{ $code.=<<___;
srlx @X[$i+1],$tmp2,$Xi
sllx @X[$i+1],$tmp1,@X[$i+1]
or $Xi,@X[$i],@X[$i]
___
}
$code.=<<___;
srlx $tmp64,$tmp2,$tmp64
or $tmp64,@X[7],@X[7]
.Laligned:
srlx @X[0],32,$Xi
___
for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ld [$ctx+0],@X[0]
ld [$ctx+4],@X[1]
ld [$ctx+8],@X[2]
ld [$ctx+12],@X[3]
add $inp,64,$inp
ld [$ctx+16],@X[4]
cmp $inp,$len
add $A,@X[0],$A
st $A,[$ctx+0]
add $B,@X[1],$B
st $B,[$ctx+4]
add $C,@X[2],$C
st $C,[$ctx+8]
add $D,@X[3],$D
st $D,[$ctx+12]
add $E,@X[4],$E
st $E,[$ctx+16]
bne `$bits==64?"%xcc":"%icc"`,.Lloop
andn $inp,7,$tmp0
ret
restore
.type sha1_block_data_order,#function
.size sha1_block_data_order,(.-sha1_block_data_order)
.asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,601 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# January 2009
#
# Provided that UltraSPARC VIS instructions are pipe-lined(*) and
# pairable(*) with IALU ones, offloading of Xupdate to the UltraSPARC
# Graphic Unit would make it possible to achieve higher instruction-
# level parallelism, ILP, and thus higher performance. It should be
# explicitly noted that ILP is the keyword, and it means that this
# code would be unsuitable for cores like UltraSPARC-Tx. The idea is
# not really novel; Sun had a VIS-powered implementation for a while.
# Unlike Sun's implementation this one can process multiple unaligned
# input blocks, and as such works as drop-in replacement for OpenSSL
# sha1_block_data_order. Performance improvement was measured to be
# 40% over pure IALU sha1-sparcv9.pl on UltraSPARC-IIi, but 12% on
# UltraSPARC-III. See below for discussion...
#
# The module does not present direct interest for OpenSSL, because
# it doesn't provide better performance on contemporary SPARCv9 CPUs,
# UltraSPARC-Tx and SPARC64-V[II] to be specific. Those who feel they
# absolutely must score on UltraSPARC-I-IV can simply replace
# crypto/sha/asm/sha1-sparcv9.pl with this module.
#
# (*) "Pipe-lined" means that even if an instruction takes several
# cycles to complete, the next instruction using the same functional
# unit [but not depending on the result of the current instruction]
# can start execution without having to wait for the unit. "Pairable"
# means that two [or more] independent instructions can be
# issued at the very same time.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }
$output=shift;
open STDOUT,">$output";
$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";
$tmp3="%g5";
$base="%g1";
$align="%g4";
$Xfer="%o5";
$nXfer=$tmp3;
$Xi="%o7";
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);
$Actx="%o0";
$Bctx="%o1";
$Cctx="%o2";
$Dctx="%o3";
$Ectx="%o4";
$fmul="%f32";
$VK_00_19="%f34";
$VK_20_39="%f36";
$VK_40_59="%f38";
$VK_60_79="%f40";
@VK=($VK_00_19,$VK_20_39,$VK_40_59,$VK_60_79);
@X=("%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
"%f8", "%f9","%f10","%f11","%f12","%f13","%f14","%f15","%f16");
# This is the reference 2x-parallelized VIS-powered Xupdate procedure.
# It even covers the K_NN_MM addition...
sub Xupdate {
my ($i)=@_;
my $K=@VK[($i+16)/20];
my $j=($i+16)%16;
# [ provided that GSR.alignaddr_offset is 5, $fmul contains the
# 0x100ULL<<32|0x100 value and K_NN_MM are pre-loaded into the
# chosen registers... ]
$code.=<<___;
fxors @X[($j+13)%16],@X[$j],@X[$j] !-1/-1/-1:X[0]^=X[13]
fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
![fxors %f15,%f2,%f2]
for %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
![fxors %f0,%f3,%f3] !10/17/12:X[0] dependency
fpadd32 $K,@X[$j],%f20
std %f20,[$Xfer+`4*$j`]
___
# The slash-delimited numbers are the earliest possible dispatch cycles
# for the given instruction assuming, respectively, 1-cycle latency for
# simple VIS instructions [as on UltraSPARC-I&II], 3-cycle latency [as
# on UltraSPARC-III&IV], and 2-cycle latency(*). Being 2x-parallelized,
# the procedure is "worth" 5, 8.5 or 6 ticks per SHA1 round. As long as
# FPU/VIS instructions are perfectly pairable with IALU ones, the round
# timing is defined by the maximum of the VIS and IALU timings. The
# latter varies from round to round and averages out at 6.25 ticks.
# This means that USI&II should operate at the IALU rate, while
# USIII&IV at the VIS rate, which explains why the performance
# improvement varies among processors, given that the pure IALU
# sha1-sparcv9.pl module exhibits virtually uniform performance of
# ~9.3 cycles per SHA1 round. The timings mentioned above are
# theoretical lower limits. Real-life performance was measured to be
# 6.6 cycles per SHA1 round on USIIi and 8.3 on USIII. The latter is
# lower than the half-round VIS timing, because there are 16
# Xupdate-free rounds, which "push down" the average theoretical
# timing to 8 cycles...
# (*) SPARC64-V[II] was originally believed to have 2 cycles VIS
# latency. Well, it might have, but it doesn't have dedicated
# VIS-unit. Instead, VIS instructions are executed by other
# functional units, ones used here - by IALU. This doesn't
# improve effective ILP...
}
# The reference Xupdate procedure is then "strained" over *pairs* of
# BODY_NN_MM and, in effect, modulo-scheduled with respect to the
# X[n]^=X[n+13] step and the K_NN_MM addition. It "runs" 15 rounds
# ahead, which leaves plenty of room to amortize the read-after-write
# hazard, as well as to fetch and align input for the next spin. The
# VIS instructions are scheduled for a latency of 2 cycles, because
# there are not enough IALU instructions to schedule for a latency of
# 3, while scheduling for 1 would give no gain on USI&II anyway.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $k=($j+16+2)%16; # ahead reference
my $l=($j+16-2)%16; # behind reference
my $K=@VK[($j+16-2)/20];
$j=($j+16)%16;
$code.=<<___ if (!($i&1));
sll $a,5,$tmp0 !! $i
and $c,$b,$tmp3
ld [$Xfer+`4*($i%16)`],$Xi
fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
srl $a,27,$tmp1
add $tmp0,$e,$e
fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
sll $b,30,$tmp2
add $tmp1,$e,$e
andn $d,$b,$tmp1
add $Xi,$e,$e
fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
srl $b,2,$b
or $tmp1,$tmp3,$tmp1
or $tmp2,$b,$b
add $tmp1,$e,$e
faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
___
$code.=<<___ if ($i&1);
sll $a,5,$tmp0 !! $i
and $c,$b,$tmp3
ld [$Xfer+`4*($i%16)`],$Xi
fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
srl $a,27,$tmp1
add $tmp0,$e,$e
fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
sll $b,30,$tmp2
add $tmp1,$e,$e
fpadd32 $K,@X[$l],%f20 !
andn $d,$b,$tmp1
add $Xi,$e,$e
fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
srl $b,2,$b
or $tmp1,$tmp3,$tmp1
fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
or $tmp2,$b,$b
add $tmp1,$e,$e
___
$code.=<<___ if ($i&1 && $i>=2);
std %f20,[$Xfer+`4*$l`] !
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $k=($j+16+2)%16; # ahead reference
my $l=($j+16-2)%16; # behind reference
my $K=@VK[($j+16-2)/20];
$j=($j+16)%16;
$code.=<<___ if (!($i&1) && $i<64);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
srl $a,27,$tmp1
add $tmp0,$e,$e
fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
___
$code.=<<___ if ($i&1 && $i<64);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
srl $a,27,$tmp1
add $tmp0,$e,$e
fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
xor $c,$b,$tmp0
add $tmp1,$e,$e
fpadd32 $K,@X[$l],%f20 !
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
srl $b,2,$b
add $tmp1,$e,$e
fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
or $tmp2,$b,$b
add $Xi,$e,$e
std %f20,[$Xfer+`4*$l`] !
___
$code.=<<___ if ($i==64);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
fpadd32 $K,@X[$l],%f20
srl $a,27,$tmp1
add $tmp0,$e,$e
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
std %f20,[$Xfer+`4*$l`]
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
___
$code.=<<___ if ($i>64);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
srl $a,27,$tmp1
add $tmp0,$e,$e
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $k=($j+16+2)%16; # ahead reference
my $l=($j+16-2)%16; # behind reference
my $K=@VK[($j+16-2)/20];
$j=($j+16)%16;
$code.=<<___ if (!($i&1));
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
srl $a,27,$tmp1
add $tmp0,$e,$e
fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
and $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
or $c,$b,$tmp1
fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
srl $b,2,$b
and $d,$tmp1,$tmp1
add $Xi,$e,$e
or $tmp1,$tmp0,$tmp1
faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
or $tmp2,$b,$b
add $tmp1,$e,$e
fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
___
$code.=<<___ if ($i&1);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
srl $a,27,$tmp1
add $tmp0,$e,$e
fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
and $c,$b,$tmp0
add $tmp1,$e,$e
fpadd32 $K,@X[$l],%f20 !
sll $b,30,$tmp2
or $c,$b,$tmp1
fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
srl $b,2,$b
and $d,$tmp1,$tmp1
fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
add $Xi,$e,$e
or $tmp1,$tmp0,$tmp1
or $tmp2,$b,$b
add $tmp1,$e,$e
std %f20,[$Xfer+`4*$l`] !
___
}
# If there is more data to process, we pre-fetch the data for the next
# iteration during the last ten rounds...
sub BODY_70_79 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $m=($i%8)*2;
$j=($j+16)%16;
$code.=<<___ if ($i==70);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
srl $a,27,$tmp1
add $tmp0,$e,$e
ldd [$inp+64],@X[0]
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
and $inp,-64,$nXfer
inc 64,$inp
and $nXfer,255,$nXfer
alignaddr %g0,$align,%g0
add $base,$nXfer,$nXfer
___
$code.=<<___ if ($i==71);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
srl $a,27,$tmp1
add $tmp0,$e,$e
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
___
$code.=<<___ if ($i>=72);
faligndata @X[$m],@X[$m+2],@X[$m]
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
srl $a,27,$tmp1
add $tmp0,$e,$e
xor $c,$b,$tmp0
add $tmp1,$e,$e
fpadd32 $VK_00_19,@X[$m],%f20
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
___
$code.=<<___ if ($i<77);
ldd [$inp+`8*($i+1-70)`],@X[2*($i+1-70)]
___
$code.=<<___ if ($i==77); # redundant if $inp was aligned
add $align,63,$tmp0
and $tmp0,-8,$tmp0
ldd [$inp+$tmp0],@X[16]
___
$code.=<<___ if ($i>=72);
std %f20,[$nXfer+`4*$m`]
___
}
$code.=<<___;
.section ".text",#alloc,#execinstr
.align 64
vis_const:
.long 0x5a827999,0x5a827999 ! K_00_19
.long 0x6ed9eba1,0x6ed9eba1 ! K_20_39
.long 0x8f1bbcdc,0x8f1bbcdc ! K_40_59
.long 0xca62c1d6,0xca62c1d6 ! K_60_79
.long 0x00000100,0x00000100
.align 64
.type vis_const,#object
.size vis_const,(.-vis_const)
.globl sha1_block_data_order
sha1_block_data_order:
save %sp,-$frame,%sp
add %fp,$bias-256,$base
1: call .+8
add %o7,vis_const-1b,$tmp0
ldd [$tmp0+0],$VK_00_19
ldd [$tmp0+8],$VK_20_39
ldd [$tmp0+16],$VK_40_59
ldd [$tmp0+24],$VK_60_79
ldd [$tmp0+32],$fmul
ld [$ctx+0],$Actx
and $base,-256,$base
ld [$ctx+4],$Bctx
sub $base,$bias+$frame,%sp
ld [$ctx+8],$Cctx
and $inp,7,$align
ld [$ctx+12],$Dctx
and $inp,-8,$inp
ld [$ctx+16],$Ectx
! X[16] is maintained in FP register bank
alignaddr %g0,$align,%g0
ldd [$inp+0],@X[0]
sub $inp,-64,$Xfer
ldd [$inp+8],@X[2]
and $Xfer,-64,$Xfer
ldd [$inp+16],@X[4]
and $Xfer,255,$Xfer
ldd [$inp+24],@X[6]
add $base,$Xfer,$Xfer
ldd [$inp+32],@X[8]
ldd [$inp+40],@X[10]
ldd [$inp+48],@X[12]
brz,pt $align,.Laligned
ldd [$inp+56],@X[14]
ldd [$inp+64],@X[16]
faligndata @X[0],@X[2],@X[0]
faligndata @X[2],@X[4],@X[2]
faligndata @X[4],@X[6],@X[4]
faligndata @X[6],@X[8],@X[6]
faligndata @X[8],@X[10],@X[8]
faligndata @X[10],@X[12],@X[10]
faligndata @X[12],@X[14],@X[12]
faligndata @X[14],@X[16],@X[14]
.Laligned:
mov 5,$tmp0
dec 1,$len
alignaddr %g0,$tmp0,%g0
fpadd32 $VK_00_19,@X[0],%f16
fpadd32 $VK_00_19,@X[2],%f18
fpadd32 $VK_00_19,@X[4],%f20
fpadd32 $VK_00_19,@X[6],%f22
fpadd32 $VK_00_19,@X[8],%f24
fpadd32 $VK_00_19,@X[10],%f26
fpadd32 $VK_00_19,@X[12],%f28
fpadd32 $VK_00_19,@X[14],%f30
std %f16,[$Xfer+0]
mov $Actx,$A
std %f18,[$Xfer+8]
mov $Bctx,$B
std %f20,[$Xfer+16]
mov $Cctx,$C
std %f22,[$Xfer+24]
mov $Dctx,$D
std %f24,[$Xfer+32]
mov $Ectx,$E
std %f26,[$Xfer+40]
fxors @X[13],@X[0],@X[0]
std %f28,[$Xfer+48]
ba .Loop
std %f30,[$Xfer+56]
.align 32
.Loop:
___
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<70;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
tst $len
bz,pn `$bits==32?"%icc":"%xcc"`,.Ltail
nop
___
for (;$i<80;$i++) { &BODY_70_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
add $A,$Actx,$Actx
add $B,$Bctx,$Bctx
add $C,$Cctx,$Cctx
add $D,$Dctx,$Dctx
add $E,$Ectx,$Ectx
mov 5,$tmp0
fxors @X[13],@X[0],@X[0]
mov $Actx,$A
mov $Bctx,$B
mov $Cctx,$C
mov $Dctx,$D
mov $Ectx,$E
alignaddr %g0,$tmp0,%g0
dec 1,$len
ba .Loop
mov $nXfer,$Xfer
.align 32
.Ltail:
___
for($i=70;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
add $A,$Actx,$Actx
add $B,$Bctx,$Bctx
add $C,$Cctx,$Cctx
add $D,$Dctx,$Dctx
add $E,$Ectx,$Ectx
st $Actx,[$ctx+0]
st $Bctx,[$ctx+4]
st $Cctx,[$ctx+8]
st $Dctx,[$ctx+12]
st $Ectx,[$ctx+16]
ret
restore
.type sha1_block_data_order,#function
.size sha1_block_data_order,(.-sha1_block_data_order)
.asciz "SHA1 block transform for SPARCv9a, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# The purpose of these subroutines is to encode VIS instructions
# explicitly, so that the module can be compiled without specifying VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs.
# -xarch=v9a. The idea is to preserve the option of producing a
# "universal" binary and to let the programmer detect at run time
# whether the current CPU is VIS-capable.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = ( "fmul8ulx16" => 0x037,
"faligndata" => 0x048,
"fpadd32" => 0x052,
"fxor" => 0x06c,
"fxors" => 0x06d );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%f([0-9]{1,2})/);
$_=$1;
if ($1>=32) {
return $ref if ($1&1);
# re-encode for upper double register addressing
$_=($1|$1>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
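# For example, with the %visopf table above, fpadd32 %f0,%f2,%f4 has
# opf 0x052, rs1=0, rs2=2, rd=4, so
#	&unvis("fpadd32","%f0","%f2","%f4")
# should return ".word\t0x89b00a42 !fpadd32\t%f0,%f2,%f4", which the
# assembler accepts without any VIS support.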
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";
foreach ($rs1,$rs2,$rd) {
if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
else { return $ref; }
}
return sprintf ".word\t0x%08x !%s",
0x81b00300|$rd<<25|$rs1<<14|$rs2,
$ref;
}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),(%f[0-9]{1,2}),(%f[0-9]{1,2})/
&unvis($1,$2,$3,$4)
/gem;
$code =~ s/\b(alignaddr)\s+(%[goli][0-7]),(%[goli][0-7]),(%[goli][0-7])/
&unalignaddr($1,$2,$3,$4)
/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,259 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# sha1_block for Thumb.
#
# January 2007.
#
# The code is not of direct interest to OpenSSL, because of its low
# performance. Its purpose is to establish a _size_ benchmark. A pretty
# useless one, I must say, because ARMv4 code that is 30% or 88 bytes
# larger [available on demand] is almost _twice_ as fast. It should
# also be noted that in-lining .Lcommon and .Lrotate improves
# performance by over 40%, while the code grows by only 10% or 32
# bytes. But once again, the goal was to establish a _size_ benchmark,
# not performance.
$output=shift;
open STDOUT,">$output";
$inline=0;
#$cheat_on_binutils=1;
$t0="r0";
$t1="r1";
$t2="r2";
$a="r3";
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8"; # "upper" registers can be used in add/sub and mov insns
$ctx="r9";
$inp="r10";
$len="r11";
$Xi="r12";
sub common {
<<___;
sub $t0,#4
ldr $t1,[$t0]
add $e,$K @ E+=K_xx_xx
lsl $t2,$a,#5
add $t2,$e
lsr $e,$a,#27
add $t2,$e @ E+=ROR(A,27)
add $t2,$t1 @ E+=X[i]
___
}
sub rotate {
<<___;
mov $e,$d @ E=D
mov $d,$c @ D=C
lsl $c,$b,#30
lsr $b,$b,#2
orr $c,$b @ C=ROR(B,2)
mov $b,$a @ B=A
add $a,$t2,$t1 @ A=E+F_xx_xx(B,C,D)
___
}
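# The lsl/lsr/add pattern in common() above exists because Thumb-1 has
# no rotate-by-immediate instruction, so E+=ROR(A,27) [i.e. E+=ROL(A,5)]
# is assembled from two shifts whose results occupy disjoint bit ranges
# and can therefore be accumulated with add. A one-line Perl model of
# the identity, for reference only (assumes $a already fits in 32 bits):
sub rol5_ref { my ($a)=@_; return (($a<<5)&0xffffffff)|($a>>27); }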
sub BODY_00_19 {
$code.=$inline?&common():"\tbl .Lcommon\n";
$code.=<<___;
mov $t1,$c
eor $t1,$d
and $t1,$b
eor $t1,$d @ F_00_19(B,C,D)
___
$code.=$inline?&rotate():"\tbl .Lrotate\n";
}
sub BODY_20_39 {
$code.=$inline?&common():"\tbl .Lcommon\n";
$code.=<<___;
mov $t1,$b
eor $t1,$c
eor $t1,$d @ F_20_39(B,C,D)
___
$code.=$inline?&rotate():"\tbl .Lrotate\n";
}
sub BODY_40_59 {
$code.=$inline?&common():"\tbl .Lcommon\n";
$code.=<<___;
mov $t1,$b
and $t1,$c
mov $e,$b
orr $e,$c
and $e,$d
orr $t1,$e @ F_40_59(B,C,D)
___
$code.=$inline?&rotate():"\tbl .Lrotate\n";
}
$code=<<___;
.text
.code 16
.global sha1_block_data_order
.type sha1_block_data_order,%function
.align 2
sha1_block_data_order:
___
if ($cheat_on_binutils) {
$code.=<<___;
.code 32
add r3,pc,#1
bx r3 @ switch to Thumb ISA
.code 16
___
}
$code.=<<___;
push {r4-r7}
mov r3,r8
mov r4,r9
mov r5,r10
mov r6,r11
mov r7,r12
push {r3-r7,lr}
lsl r2,#6
mov $ctx,r0 @ save context
mov $inp,r1 @ save inp
mov $len,r2 @ save len
add $len,$inp @ $len to point at inp end
.Lloop:
mov $Xi,sp
mov $t2,sp
sub $t2,#16*4 @ [3]
.LXload:
ldrb $a,[$t1,#0] @ $t1 is r1 and holds inp
ldrb $b,[$t1,#1]
ldrb $c,[$t1,#2]
ldrb $d,[$t1,#3]
lsl $a,#24
lsl $b,#16
lsl $c,#8
orr $a,$b
orr $a,$c
orr $a,$d
add $t1,#4
push {$a}
cmp sp,$t2
bne .LXload @ [+14*16]
mov $inp,$t1 @ update $inp
sub $t2,#32*4
sub $t2,#32*4
mov $e,#31 @ [+4]
.LXupdate:
ldr $a,[sp,#15*4]
ldr $b,[sp,#13*4]
ldr $c,[sp,#7*4]
ldr $d,[sp,#2*4]
eor $a,$b
eor $a,$c
eor $a,$d
ror $a,$e
push {$a}
cmp sp,$t2
bne .LXupdate @ [+(11+1)*64]
ldmia $t0!,{$a,$b,$c,$d,$e} @ $t0 is r0 and holds ctx
mov $t0,$Xi
ldr $t2,.LK_00_19
mov $t1,$t0
sub $t1,#20*4
mov $Xi,$t1
mov $K,$t2 @ [+7+4]
.L_00_19:
___
&BODY_00_19();
$code.=<<___;
cmp $Xi,$t0
bne .L_00_19 @ [+(2+9+4+2+8+2)*20]
ldr $t2,.LK_20_39
mov $t1,$t0
sub $t1,#20*4
mov $Xi,$t1
mov $K,$t2 @ [+5]
.L_20_39_or_60_79:
___
&BODY_20_39();
$code.=<<___;
cmp $Xi,$t0
bne .L_20_39_or_60_79 @ [+(2+9+3+2+8+2)*20*2]
cmp sp,$t0
beq .Ldone @ [+2]
ldr $t2,.LK_40_59
mov $t1,$t0
sub $t1,#20*4
mov $Xi,$t1
mov $K,$t2 @ [+5]
.L_40_59:
___
&BODY_40_59();
$code.=<<___;
cmp $Xi,$t0
bne .L_40_59 @ [+(2+9+6+2+8+2)*20]
ldr $t2,.LK_60_79
mov $Xi,sp
mov $K,$t2
b .L_20_39_or_60_79 @ [+4]
.Ldone:
mov $t0,$ctx
ldr $t1,[$t0,#0]
ldr $t2,[$t0,#4]
add $a,$t1
ldr $t1,[$t0,#8]
add $b,$t2
ldr $t2,[$t0,#12]
add $c,$t1
ldr $t1,[$t0,#16]
add $d,$t2
add $e,$t1
stmia $t0!,{$a,$b,$c,$d,$e} @ [+20]
add sp,#80*4 @ deallocate stack frame
mov $t0,$ctx @ restore ctx
mov $t1,$inp @ restore inp
cmp $t1,$len
beq .Lexit
b .Lloop @ [+6] total 3212 cycles
.Lexit:
pop {r2-r7}
mov r8,r2
mov r9,r3
mov r10,r4
mov r11,r5
mov r12,r6
mov lr,r7
pop {r4-r7}
bx lr
.align 2
___
$code.=".Lcommon:\n".&common()."\tmov pc,lr\n" if (!$inline);
$code.=".Lrotate:\n".&rotate()."\tmov pc,lr\n" if (!$inline);
$code.=<<___;
.align 2
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
.size sha1_block_data_order,.-sha1_block_data_order
.asciz "SHA1 block transform for Thumb, CRYPTOGAMS by <appro\@openssl.org>"
___
print $code;
close STDOUT; # enforce flush

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,258 @@
.file "sha512-586.s"
.text
.globl sha256_block_data_order
.type sha256_block_data_order,@function
.align 16
sha256_block_data_order:
.L_sha256_block_data_order_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%esi
movl 24(%esp),%edi
movl 28(%esp),%eax
movl %esp,%ebx
call .L000pic_point
.L000pic_point:
popl %ebp
leal .L001K256-.L000pic_point(%ebp),%ebp
subl $16,%esp
andl $-64,%esp
shll $6,%eax
addl %edi,%eax
movl %esi,(%esp)
movl %edi,4(%esp)
movl %eax,8(%esp)
movl %ebx,12(%esp)
.align 16
.L002loop:
movl (%edi),%eax
movl 4(%edi),%ebx
movl 8(%edi),%ecx
movl 12(%edi),%edx
bswap %eax
bswap %ebx
bswap %ecx
bswap %edx
pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
movl 16(%edi),%eax
movl 20(%edi),%ebx
movl 24(%edi),%ecx
movl 28(%edi),%edx
bswap %eax
bswap %ebx
bswap %ecx
bswap %edx
pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
movl 32(%edi),%eax
movl 36(%edi),%ebx
movl 40(%edi),%ecx
movl 44(%edi),%edx
bswap %eax
bswap %ebx
bswap %ecx
bswap %edx
pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
movl 48(%edi),%eax
movl 52(%edi),%ebx
movl 56(%edi),%ecx
movl 60(%edi),%edx
bswap %eax
bswap %ebx
bswap %ecx
bswap %edx
pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
addl $64,%edi
subl $32,%esp
movl %edi,100(%esp)
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
movl 12(%esi),%edi
movl %ebx,4(%esp)
movl %ecx,8(%esp)
movl %edi,12(%esp)
movl 16(%esi),%edx
movl 20(%esi),%ebx
movl 24(%esi),%ecx
movl 28(%esi),%edi
movl %ebx,20(%esp)
movl %ecx,24(%esp)
movl %edi,28(%esp)
.align 16
.L00300_15:
movl 92(%esp),%ebx
movl %edx,%ecx
rorl $14,%ecx
movl 20(%esp),%esi
xorl %edx,%ecx
rorl $5,%ecx
xorl %edx,%ecx
rorl $6,%ecx
movl 24(%esp),%edi
addl %ecx,%ebx
xorl %edi,%esi
movl %edx,16(%esp)
movl %eax,%ecx
andl %edx,%esi
movl 12(%esp),%edx
xorl %edi,%esi
movl %eax,%edi
addl %esi,%ebx
rorl $9,%ecx
addl 28(%esp),%ebx
xorl %eax,%ecx
rorl $11,%ecx
movl 4(%esp),%esi
xorl %eax,%ecx
rorl $2,%ecx
addl %ebx,%edx
movl 8(%esp),%edi
addl %ecx,%ebx
movl %eax,(%esp)
movl %eax,%ecx
subl $4,%esp
orl %esi,%eax
andl %esi,%ecx
andl %edi,%eax
movl (%ebp),%esi
orl %ecx,%eax
addl $4,%ebp
addl %ebx,%eax
addl %esi,%edx
addl %esi,%eax
cmpl $3248222580,%esi
jne .L00300_15
movl 152(%esp),%ebx
.align 16
.L00416_63:
movl %ebx,%esi
movl 100(%esp),%ecx
rorl $11,%esi
movl %ecx,%edi
xorl %ebx,%esi
rorl $7,%esi
shrl $3,%ebx
rorl $2,%edi
xorl %esi,%ebx
xorl %ecx,%edi
rorl $17,%edi
shrl $10,%ecx
addl 156(%esp),%ebx
xorl %ecx,%edi
addl 120(%esp),%ebx
movl %edx,%ecx
addl %edi,%ebx
rorl $14,%ecx
movl 20(%esp),%esi
xorl %edx,%ecx
rorl $5,%ecx
movl %ebx,92(%esp)
xorl %edx,%ecx
rorl $6,%ecx
movl 24(%esp),%edi
addl %ecx,%ebx
xorl %edi,%esi
movl %edx,16(%esp)
movl %eax,%ecx
andl %edx,%esi
movl 12(%esp),%edx
xorl %edi,%esi
movl %eax,%edi
addl %esi,%ebx
rorl $9,%ecx
addl 28(%esp),%ebx
xorl %eax,%ecx
rorl $11,%ecx
movl 4(%esp),%esi
xorl %eax,%ecx
rorl $2,%ecx
addl %ebx,%edx
movl 8(%esp),%edi
addl %ecx,%ebx
movl %eax,(%esp)
movl %eax,%ecx
subl $4,%esp
orl %esi,%eax
andl %esi,%ecx
andl %edi,%eax
movl (%ebp),%esi
orl %ecx,%eax
addl $4,%ebp
addl %ebx,%eax
movl 152(%esp),%ebx
addl %esi,%edx
addl %esi,%eax
cmpl $3329325298,%esi
jne .L00416_63
movl 352(%esp),%esi
movl 4(%esp),%ebx
movl 8(%esp),%ecx
movl 12(%esp),%edi
addl (%esi),%eax
addl 4(%esi),%ebx
addl 8(%esi),%ecx
addl 12(%esi),%edi
movl %eax,(%esi)
movl %ebx,4(%esi)
movl %ecx,8(%esi)
movl %edi,12(%esi)
movl 20(%esp),%eax
movl 24(%esp),%ebx
movl 28(%esp),%ecx
movl 356(%esp),%edi
addl 16(%esi),%edx
addl 20(%esi),%eax
addl 24(%esi),%ebx
addl 28(%esi),%ecx
movl %edx,16(%esi)
movl %eax,20(%esi)
movl %ebx,24(%esi)
movl %ecx,28(%esi)
addl $352,%esp
subl $256,%ebp
cmpl 8(%esp),%edi
jb .L002loop
movl 12(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.align 64
.L001K256:
.long 1116352408,1899447441,3049323471,3921009573
.long 961987163,1508970993,2453635748,2870763221
.long 3624381080,310598401,607225278,1426881987
.long 1925078388,2162078206,2614888103,3248222580
.long 3835390401,4022224774,264347078,604807628
.long 770255983,1249150122,1555081692,1996064986
.long 2554220882,2821834349,2952996808,3210313671
.long 3336571891,3584528711,113926993,338241895
.long 666307205,773529912,1294757372,1396182291
.long 1695183700,1986661051,2177026350,2456956037
.long 2730485921,2820302411,3259730800,3345764771
.long 3516065817,3600352804,4094571909,275423344
.long 430227734,506948616,659060556,883997877
.long 958139571,1322822218,1537002063,1747873779
.long 1955562222,2024104815,2227730452,2361852424
.long 2428436474,2756734187,3204031479,3329325298
.size sha256_block_data_order,.-.L_sha256_block_data_order_begin
.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0

View File

@@ -0,0 +1,249 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 block transform for x86. September 2007.
#
# Performance in clock cycles per processed byte (less is better):
#
#		Pentium	PIII	P4	AMD K8	Core2
# gcc		46	36	41	27	26
# icc		57	33	38	25	23
# x86 asm	40	30	33	20	18
# x86_64 asm(*)	-	-	21	16	16
#
# (*) x86_64 assembler performance is presented for reference
# purposes.
#
# Performance improvement over compiler-generated code varies from
# 10% to 40% [see above]. Not very impressive on some µ-archs, but the
# code is 5 times smaller and optimizes the number of writes.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
$A="eax";
$E="edx";
$T="ebx";
$Aoff=&DWP(0,"esp");
$Boff=&DWP(4,"esp");
$Coff=&DWP(8,"esp");
$Doff=&DWP(12,"esp");
$Eoff=&DWP(16,"esp");
$Foff=&DWP(20,"esp");
$Goff=&DWP(24,"esp");
$Hoff=&DWP(28,"esp");
$Xoff=&DWP(32,"esp");
$K256="ebp";
sub BODY_00_15() {
my $in_16_63=shift;
&mov ("ecx",$E);
&add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2])
&ror ("ecx",25-11);
&mov ("esi",$Foff);
&xor ("ecx",$E);
&ror ("ecx",11-6);
&mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0]
&xor ("ecx",$E);
&ror ("ecx",6); # Sigma1(e)
&mov ("edi",$Goff);
&add ($T,"ecx"); # T += Sigma1(e)
&xor ("esi","edi");
&mov ($Eoff,$E); # modulo-scheduled
&mov ("ecx",$A);
&and ("esi",$E);
&mov ($E,$Doff); # e becomes d, which is e in next iteration
&xor ("esi","edi"); # Ch(e,f,g)
&mov ("edi",$A);
&add ($T,"esi"); # T += Ch(e,f,g)
&ror ("ecx",22-13);
&add ($T,$Hoff); # T += h
&xor ("ecx",$A);
&ror ("ecx",13-2);
&mov ("esi",$Boff);
&xor ("ecx",$A);
&ror ("ecx",2); # Sigma0(a)
&add ($E,$T); # d += T
&mov ("edi",$Coff);
&add ($T,"ecx"); # T += Sigma0(a)
&mov ($Aoff,$A); # modulo-scheduled
&mov ("ecx",$A);
&sub ("esp",4);
&or ($A,"esi"); # a becomes h, which is a in next iteration
&and ("ecx","esi");
&and ($A,"edi");
&mov ("esi",&DWP(0,$K256));
&or ($A,"ecx"); # h=Maj(a,b,c)
&add ($K256,4);
&add ($A,$T); # h += T
&mov ($T,&DWP(4*(8+15+16-1),"esp")) if ($in_16_63); # preload T
&add ($E,"esi"); # d += K256[i]
&add ($A,"esi"); # h += K256[i]
}
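# Pure-Perl references for the four SHA256 sigma functions that
# BODY_00_15 above realizes with the staggered ror/xor sequences;
# hypothetical cross-check helpers (inputs assumed to fit in 32 bits),
# never called by the generator.
sub ror32	{ my ($x,$n)=@_; return (($x>>$n)|($x<<(32-$n)))&0xffffffff; }
sub Sigma0_ref	{ my ($x)=@_; return ror32($x,2)^ror32($x,13)^ror32($x,22); }
sub Sigma1_ref	{ my ($x)=@_; return ror32($x,6)^ror32($x,11)^ror32($x,25); }
sub sigma0_ref	{ my ($x)=@_; return ror32($x,7)^ror32($x,18)^($x>>3); }
sub sigma1_ref	{ my ($x)=@_; return ror32($x,17)^ror32($x,19)^($x>>10); }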
&function_begin("sha256_block_data_order");
&mov ("esi",wparam(0)); # ctx
&mov ("edi",wparam(1)); # inp
&mov ("eax",wparam(2)); # num
&mov ("ebx","esp"); # saved sp
&call (&label("pic_point")); # make it PIC!
&set_label("pic_point");
&blindpop($K256);
&lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
&sub ("esp",16);
&and ("esp",-64);
&shl ("eax",6);
&add ("eax","edi");
&mov (&DWP(0,"esp"),"esi"); # ctx
&mov (&DWP(4,"esp"),"edi"); # inp
&mov (&DWP(8,"esp"),"eax"); # inp+num*128
&mov (&DWP(12,"esp"),"ebx"); # saved sp
&set_label("loop",16);
# copy input block to stack reversing byte and dword order
for($i=0;$i<4;$i++) {
&mov ("eax",&DWP($i*16+0,"edi"));
&mov ("ebx",&DWP($i*16+4,"edi"));
&mov ("ecx",&DWP($i*16+8,"edi"));
&mov ("edx",&DWP($i*16+12,"edi"));
&bswap ("eax");
&bswap ("ebx");
&bswap ("ecx");
&bswap ("edx");
&push ("eax");
&push ("ebx");
&push ("ecx");
&push ("edx");
}
&add ("edi",64);
&sub ("esp",4*8); # place for A,B,C,D,E,F,G,H
&mov (&DWP(4*(8+16)+4,"esp"),"edi");
# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
&mov ($A,&DWP(0,"esi"));
&mov ("ebx",&DWP(4,"esi"));
&mov ("ecx",&DWP(8,"esi"));
&mov ("edi",&DWP(12,"esi"));
# &mov ($Aoff,$A);
&mov ($Boff,"ebx");
&mov ($Coff,"ecx");
&mov ($Doff,"edi");
&mov ($E,&DWP(16,"esi"));
&mov ("ebx",&DWP(20,"esi"));
&mov ("ecx",&DWP(24,"esi"));
&mov ("edi",&DWP(28,"esi"));
# &mov ($Eoff,$E);
&mov ($Foff,"ebx");
&mov ($Goff,"ecx");
&mov ($Hoff,"edi");
&set_label("00_15",16);
&mov ($T,&DWP(4*(8+15),"esp"));
&BODY_00_15();
&cmp ("esi",0xc19bf174);
&jne (&label("00_15"));
&mov ($T,&DWP(4*(8+15+16-1),"esp")); # preloaded in BODY_00_15(1)
&set_label("16_63",16);
&mov ("esi",$T);
&mov ("ecx",&DWP(4*(8+15+16-14),"esp"));
&ror ("esi",18-7);
&mov ("edi","ecx");
&xor ("esi",$T);
&ror ("esi",7);
&shr ($T,3);
&ror ("edi",19-17);
&xor ($T,"esi"); # T = sigma0(X[-15])
&xor ("edi","ecx");
&ror ("edi",17);
&shr ("ecx",10);
&add ($T,&DWP(4*(8+15+16),"esp")); # T += X[-16]
&xor ("edi","ecx"); # sigma1(X[-2])
&add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7]
# &add ($T,"edi"); # T += sigma1(X[-2])
# &mov (&DWP(4*(8+15),"esp"),$T); # save X[0]
&BODY_00_15(1);
&cmp ("esi",0xc67178f2);
&jne (&label("16_63"));
&mov ("esi",&DWP(4*(8+16+64)+0,"esp"));#ctx
# &mov ($A,$Aoff);
&mov ("ebx",$Boff);
&mov ("ecx",$Coff);
&mov ("edi",$Doff);
&add ($A,&DWP(0,"esi"));
&add ("ebx",&DWP(4,"esi"));
&add ("ecx",&DWP(8,"esi"));
&add ("edi",&DWP(12,"esi"));
&mov (&DWP(0,"esi"),$A);
&mov (&DWP(4,"esi"),"ebx");
&mov (&DWP(8,"esi"),"ecx");
&mov (&DWP(12,"esi"),"edi");
# &mov ($E,$Eoff);
&mov ("eax",$Foff);
&mov ("ebx",$Goff);
&mov ("ecx",$Hoff);
&mov ("edi",&DWP(4*(8+16+64)+4,"esp"));#inp
&add ($E,&DWP(16,"esi"));
&add ("eax",&DWP(20,"esi"));
&add ("ebx",&DWP(24,"esi"));
&add ("ecx",&DWP(28,"esi"));
&mov (&DWP(16,"esi"),$E);
&mov (&DWP(20,"esi"),"eax");
&mov (&DWP(24,"esi"),"ebx");
&mov (&DWP(28,"esi"),"ecx");
&add ("esp",4*(8+16+64)); # destroy frame
&sub ($K256,4*64); # rewind K
&cmp ("edi",&DWP(8,"esp")); # are we done yet?
&jb (&label("loop"));
&mov ("esp",&DWP(12,"esp")); # restore sp
&function_end_A();
&set_label("K256",64); # Yes! I keep it in the code segment!
&data_word(0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5);
&data_word(0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5);
&data_word(0xd807aa98,0x12835b01,0x243185be,0x550c7dc3);
&data_word(0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174);
&data_word(0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc);
&data_word(0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da);
&data_word(0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7);
&data_word(0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967);
&data_word(0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13);
&data_word(0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85);
&data_word(0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3);
&data_word(0xd192e819,0xd6990624,0xf40e3585,0x106aa070);
&data_word(0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5);
&data_word(0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3);
&data_word(0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208);
&data_word(0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2);
&function_end_B("sha256_block_data_order");
&asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,656 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
# Performance is ~2x better than gcc 3.4 generated code and in absolute
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte [on a
# single-issue Xscale PXA250 core].
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).
# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0"; $t0="r0";
$inp="r1"; $t4="r1";
$len="r2"; $t1="r2";
$T1="r3"; $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
@ ldr $t1,[$inp],#4 @ $i
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
rev $t1,$t1
#else
@ ldrb $t1,[$inp,#3] @ $i
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
ldrb $t2,[$inp,#2]
ldrb $t0,[$inp,#1]
orr $t1,$t1,$t2,lsl#8
ldrb $t2,[$inp],#4
orr $t1,$t1,$t0,lsl#16
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
orr $t1,$t1,$t2,lsl#24
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
$code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
add $h,$h,$t1 @ h+=X[i]
str $t1,[sp,#`$i%16`*4]
eor $t1,$f,$g
add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
and $t1,$t1,$e
add $h,$h,$t2 @ h+=K256[i]
eor $t1,$t1,$g @ Ch(e,f,g)
eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
add $h,$h,$t1 @ h+=Ch(e,f,g)
#if $i==31
and $t2,$t2,#0xff
cmp $t2,#0xf2 @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
ldr $t1,[$inp],#4 @ prefetch
# else
ldrb $t1,[$inp,#3]
# endif
eor $t2,$a,$b @ a^b, b^c in next round
#else
ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
eor $t2,$a,$b @ a^b, b^c in next round
ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
and $t3,$t3,$t2 @ (b^c)&=(a^b)
add $d,$d,$h @ d+=h
eor $t3,$t3,$b @ Maj(a,b,c)
add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
@ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
($t2,$t3)=($t3,$t2);
}
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
@ ldr $t4,[sp,#`($i+14)%16`*4]
mov $t0,$t1,ror#$sigma0[0]
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
mov $t2,$t4,ror#$sigma1[0]
eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t2,$t2,$t4,ror#$sigma1[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
ldr $t1,[sp,#`($i+0)%16`*4]
eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
ldr $t4,[sp,#`($i+9)%16`*4]
add $t2,$t2,$t0
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
add $t1,$t1,$t2
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
add $t1,$t1,$t4 @ X[i]
___
&BODY_00_15(@_);
}
$code=<<___;
#include "arm_arch.h"
.text
.code 32
.type K256,%object
.align 5
K256:
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha256_block_data_order
.align 5
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
sub r3,pc,#8 @ sha256_block_data_order
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
#if __ARM_ARCH__>=7
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#ARMV8_SHA256
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256+32 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
ldr $t1,[$inp],#4
# else
ldrb $t1,[$inp,#3]
# endif
eor $t3,$B,$C @ magic
eor $t2,$t2,$t2
___
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldreq $t3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t0,[$t3,#0]
ldr $t1,[$t3,#4]
ldr $t2,[$t3,#8]
add $A,$A,$t0
ldr $t0,[$t3,#12]
add $B,$B,$t1
ldr $t1,[$t3,#16]
add $C,$C,$t2
ldr $t2,[$t3,#20]
add $D,$D,$t0
ldr $t0,[$t3,#24]
add $E,$E,$t1
ldr $t1,[$t3,#28]
add $F,$F,$t2
ldr $inp,[sp,#17*4] @ pull inp
ldr $t2,[sp,#18*4] @ pull inp+len
add $G,$G,$t0
add $H,$H,$t1
stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
cmp $inp,$t2
sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
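# For example, &vext_8($T0,@X[0],@X[1],4) should append
# "\tvext.8\tq8,q0,q1,#4\n" to $code: the sub name supplies the
# mnemonic (with "_" mapped to "."), and a purely numeric trailing
# argument is prefixed with "#" to become an immediate operand.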
sub Xupdate()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
&vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T2,$T0,$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T1,$T0,$sigma0[2]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T2,$T0,32-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T3,$T0,$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T2);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T3,$T0,32-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T3); # sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
while($#insns>=2) { eval(shift(@insns)); }
&vst1_32 ("{$T0}","[$Xfer,:128]!");
eval(shift(@insns));
eval(shift(@insns));
push(@X,shift(@X)); # "rotate" X[]
}
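# Note that each vshr_u32/vsli_32 pair above synthesizes a 32-bit
# rotate, which NEON does not provide as a single instruction: vshr
# deposits x>>n, and vsli then inserts x<<(32-n) on top while
# preserving the 32-n low destination bits, yielding ROR(x,n).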
sub Xpreload()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vrev32_8 (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&vst1_32 ("{$T0}","[$Xfer,:128]!");
push(@X,shift(@X)); # "rotate" X[]
}
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
'&eor ($t1,$f,$g)',
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
'&and ($t1,$t1,$e)',
'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&eor ($t1,$t1,$g)', # Ch(e,f,g)
'&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
'&eor ($t2,$a,$b)', # a^b, b^c in next round
'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
'&ldr ($t1,"[sp,#64]") if ($j==31)',
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
'&add ($d,$d,$h)', # d+=h
'&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
)
}
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu neon
.type sha256_block_data_order_neon,%function
.align 4
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
mov $t2,sp
sub sp,sp,#16*4+16 @ alloca
sub $Ktbl,r3,#256+32 @ K256
bic sp,sp,#15 @ align for 128-bit stores
vld1.8 {@X[0]},[$inp]!
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
vld1.32 {$T0},[$Ktbl,:128]!
vld1.32 {$T1},[$Ktbl,:128]!
vld1.32 {$T2},[$Ktbl,:128]!
vld1.32 {$T3},[$Ktbl,:128]!
vrev32.8 @X[0],@X[0] @ yes, even on
str $ctx,[sp,#64]
vrev32.8 @X[1],@X[1] @ big-endian
str $inp,[sp,#68]
mov $Xfer,sp
vrev32.8 @X[2],@X[2]
str $len,[sp,#72]
vrev32.8 @X[3],@X[3]
str $t2,[sp,#76] @ save original sp
vadd.i32 $T0,$T0,@X[0]
vadd.i32 $T1,$T1,@X[1]
vst1.32 {$T0},[$Xfer,:128]!
vadd.i32 $T2,$T2,@X[2]
vst1.32 {$T1},[$Xfer,:128]!
vadd.i32 $T3,$T3,@X[3]
vst1.32 {$T2},[$Xfer,:128]!
vst1.32 {$T3},[$Xfer,:128]!
ldmia $ctx,{$A-$H}
sub $Xfer,$Xfer,#64
ldr $t1,[sp,#0]
eor $t2,$t2,$t2
eor $t3,$B,$C
b .L_00_48
.align 4
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
teq $t1,#0 @ check for K256 terminator
ldr $t1,[sp,#0]
sub $Xfer,$Xfer,#64
bne .L_00_48
ldr $inp,[sp,#68]
ldr $t0,[sp,#72]
sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
teq $inp,$t0
subeq $inp,$inp,#64 @ avoid SEGV
vld1.8 {@X[0]},[$inp]! @ load next input block
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
strne $inp,[sp,#68]
mov $Xfer,sp
___
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
ldr $t0,[$t1,#0]
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t2,[$t1,#4]
ldr $t3,[$t1,#8]
ldr $t4,[$t1,#12]
add $A,$A,$t0 @ accumulate
ldr $t0,[$t1,#16]
add $B,$B,$t2
ldr $t2,[$t1,#20]
add $C,$C,$t3
ldr $t3,[$t1,#24]
add $D,$D,$t4
ldr $t4,[$t1,#28]
add $E,$E,$t0
str $A,[$t1],#4
add $F,$F,$t2
str $B,[$t1],#4
add $G,$G,$t3
str $C,[$t1],#4
add $H,$H,$t4
str $D,[$t1],#4
stmia $t1,{$E-$H}
movne $Xfer,sp
ldrne $t1,[sp,#0]
eorne $t2,$t2,$t2
ldreq sp,[sp,#76] @ restore original sp
eorne $t3,$B,$C
bne .L_00_48
ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
$code.=<<___;
#if __ARM_ARCH__>=7
.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {$ABCD,$EFGH},[$ctx]
sub $Ktbl,r3,#sha256_block_data_order-K256
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vld1.32 {$W0},[$Ktbl]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vrev32.8 @MSG[2],@MSG[2]
vrev32.8 @MSG[3],@MSG[3]
vmov $ABCD_SAVE,$ABCD @ offload
vmov $EFGH_SAVE,$EFGH
teq $inp,$len
___
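# 12 scheduled iterations: each loads the next K[i] vector, adds the
# current one to a message word, folds it into the hash state with
# sha256h/sha256h2, and extends the schedule with sha256su0/sha256su1.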
for($i=0;$i<12;$i++) {
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vld1.32 {$W0},[$Ktbl]!
vadd.i32 $W1,$W1,@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vld1.32 {$W1},[$Ktbl]
vadd.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#256-16 @ rewind
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vadd.i32 $W1,$W1,@MSG[3]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
bne .Loop_v8
vst1.32 {$ABCD,$EFGH},[$ctx]
ret @ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
.comm OPENSSL_armcap_P,4,4
___
{ my %opcode = (
"sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
"sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
sub unsha256 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
# ARMv7 instructions are always encoded little-endian, hence the
# explicit .byte sequence below. The correct solution is to use the
# .inst directive, but older assemblers don't implement it:-(
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
}
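# For example (illustrative, derived from the %opcode table above):
# "sha256h q0,q1,q2" yields $word = 0xf3000c40|(1<<17)|(2<<1) = 0xf3020c44,
# emitted little-endian as ".byte 0x44,0x0c,0x02,0xf3".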
foreach (split($/,$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT; # enforce flush

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,836 @@
.file "sha512-586.s"
.text
.globl sha512_block_data_order
.type sha512_block_data_order,@function
.align 16
sha512_block_data_order:
.L_sha512_block_data_order_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%esi
movl 24(%esp),%edi
movl 28(%esp),%eax
movl %esp,%ebx
call .L000pic_point
.L000pic_point:
popl %ebp
leal .L001K512-.L000pic_point(%ebp),%ebp
subl $16,%esp
andl $-64,%esp
shll $7,%eax
addl %edi,%eax
movl %esi,(%esp)
movl %edi,4(%esp)
movl %eax,8(%esp)
movl %ebx,12(%esp)
leal _GLOBAL_OFFSET_TABLE_+[.-.L001K512](%ebp),%edx
movl OPENSSL_ia32cap_P@GOT(%edx),%edx
btl $26,(%edx)
jnc .L002loop_x86
movq (%esi),%mm0
movq 8(%esi),%mm1
movq 16(%esi),%mm2
movq 24(%esi),%mm3
movq 32(%esi),%mm4
movq 40(%esi),%mm5
movq 48(%esi),%mm6
movq 56(%esi),%mm7
subl $80,%esp
.align 16
.L003loop_sse2:
movq %mm1,8(%esp)
movq %mm2,16(%esp)
movq %mm3,24(%esp)
movq %mm5,40(%esp)
movq %mm6,48(%esp)
movq %mm7,56(%esp)
movl (%edi),%ecx
movl 4(%edi),%edx
addl $8,%edi
bswap %ecx
bswap %edx
movl %ecx,76(%esp)
movl %edx,72(%esp)
.align 16
.L00400_14_sse2:
movl (%edi),%eax
movl 4(%edi),%ebx
addl $8,%edi
bswap %eax
bswap %ebx
movl %eax,68(%esp)
movl %ebx,64(%esp)
movq 40(%esp),%mm5
movq 48(%esp),%mm6
movq 56(%esp),%mm7
movq %mm4,%mm1
movq %mm4,%mm2
psrlq $14,%mm1
movq %mm4,32(%esp)
psllq $23,%mm2
movq %mm1,%mm3
psrlq $4,%mm1
pxor %mm2,%mm3
psllq $23,%mm2
pxor %mm1,%mm3
psrlq $23,%mm1
pxor %mm2,%mm3
psllq $4,%mm2
pxor %mm1,%mm3
paddq (%ebp),%mm7
pxor %mm2,%mm3
pxor %mm6,%mm5
movq 8(%esp),%mm1
pand %mm4,%mm5
movq 16(%esp),%mm2
pxor %mm6,%mm5
movq 24(%esp),%mm4
paddq %mm5,%mm3
movq %mm0,(%esp)
paddq %mm7,%mm3
movq %mm0,%mm5
movq %mm0,%mm6
paddq 72(%esp),%mm3
psrlq $28,%mm5
paddq %mm3,%mm4
psllq $25,%mm6
movq %mm5,%mm7
psrlq $6,%mm5
pxor %mm6,%mm7
psllq $5,%mm6
pxor %mm5,%mm7
psrlq $5,%mm5
pxor %mm6,%mm7
psllq $6,%mm6
pxor %mm5,%mm7
subl $8,%esp
pxor %mm6,%mm7
movq %mm0,%mm5
por %mm2,%mm0
pand %mm2,%mm5
pand %mm1,%mm0
por %mm0,%mm5
paddq %mm5,%mm7
movq %mm3,%mm0
movb (%ebp),%dl
paddq %mm7,%mm0
addl $8,%ebp
cmpb $53,%dl
jne .L00400_14_sse2
movq 40(%esp),%mm5
movq 48(%esp),%mm6
movq 56(%esp),%mm7
movq %mm4,%mm1
movq %mm4,%mm2
psrlq $14,%mm1
movq %mm4,32(%esp)
psllq $23,%mm2
movq %mm1,%mm3
psrlq $4,%mm1
pxor %mm2,%mm3
psllq $23,%mm2
pxor %mm1,%mm3
psrlq $23,%mm1
pxor %mm2,%mm3
psllq $4,%mm2
pxor %mm1,%mm3
paddq (%ebp),%mm7
pxor %mm2,%mm3
pxor %mm6,%mm5
movq 8(%esp),%mm1
pand %mm4,%mm5
movq 16(%esp),%mm2
pxor %mm6,%mm5
movq 24(%esp),%mm4
paddq %mm5,%mm3
movq %mm0,(%esp)
paddq %mm7,%mm3
movq %mm0,%mm5
movq %mm0,%mm6
paddq 72(%esp),%mm3
psrlq $28,%mm5
paddq %mm3,%mm4
psllq $25,%mm6
movq %mm5,%mm7
psrlq $6,%mm5
pxor %mm6,%mm7
psllq $5,%mm6
pxor %mm5,%mm7
psrlq $5,%mm5
pxor %mm6,%mm7
psllq $6,%mm6
pxor %mm5,%mm7
subl $8,%esp
pxor %mm6,%mm7
movq %mm0,%mm5
por %mm2,%mm0
movq 88(%esp),%mm6
pand %mm2,%mm5
pand %mm1,%mm0
movq 192(%esp),%mm2
por %mm0,%mm5
paddq %mm5,%mm7
movq %mm3,%mm0
movb (%ebp),%dl
paddq %mm7,%mm0
addl $8,%ebp
.align 16
.L00516_79_sse2:
movq %mm2,%mm1
psrlq $1,%mm2
movq %mm6,%mm7
psrlq $6,%mm6
movq %mm2,%mm3
psrlq $6,%mm2
movq %mm6,%mm5
psrlq $13,%mm6
pxor %mm2,%mm3
psrlq $1,%mm2
pxor %mm6,%mm5
psrlq $42,%mm6
pxor %mm2,%mm3
movq 200(%esp),%mm2
psllq $56,%mm1
pxor %mm6,%mm5
psllq $3,%mm7
pxor %mm1,%mm3
paddq 128(%esp),%mm2
psllq $7,%mm1
pxor %mm7,%mm5
psllq $42,%mm7
pxor %mm1,%mm3
pxor %mm7,%mm5
paddq %mm5,%mm3
paddq %mm2,%mm3
movq %mm3,72(%esp)
movq 40(%esp),%mm5
movq 48(%esp),%mm6
movq 56(%esp),%mm7
movq %mm4,%mm1
movq %mm4,%mm2
psrlq $14,%mm1
movq %mm4,32(%esp)
psllq $23,%mm2
movq %mm1,%mm3
psrlq $4,%mm1
pxor %mm2,%mm3
psllq $23,%mm2
pxor %mm1,%mm3
psrlq $23,%mm1
pxor %mm2,%mm3
psllq $4,%mm2
pxor %mm1,%mm3
paddq (%ebp),%mm7
pxor %mm2,%mm3
pxor %mm6,%mm5
movq 8(%esp),%mm1
pand %mm4,%mm5
movq 16(%esp),%mm2
pxor %mm6,%mm5
movq 24(%esp),%mm4
paddq %mm5,%mm3
movq %mm0,(%esp)
paddq %mm7,%mm3
movq %mm0,%mm5
movq %mm0,%mm6
paddq 72(%esp),%mm3
psrlq $28,%mm5
paddq %mm3,%mm4
psllq $25,%mm6
movq %mm5,%mm7
psrlq $6,%mm5
pxor %mm6,%mm7
psllq $5,%mm6
pxor %mm5,%mm7
psrlq $5,%mm5
pxor %mm6,%mm7
psllq $6,%mm6
pxor %mm5,%mm7
subl $8,%esp
pxor %mm6,%mm7
movq %mm0,%mm5
por %mm2,%mm0
movq 88(%esp),%mm6
pand %mm2,%mm5
pand %mm1,%mm0
movq 192(%esp),%mm2
por %mm0,%mm5
paddq %mm5,%mm7
movq %mm3,%mm0
movb (%ebp),%dl
paddq %mm7,%mm0
addl $8,%ebp
cmpb $23,%dl
jne .L00516_79_sse2
movq 8(%esp),%mm1
movq 16(%esp),%mm2
movq 24(%esp),%mm3
movq 40(%esp),%mm5
movq 48(%esp),%mm6
movq 56(%esp),%mm7
paddq (%esi),%mm0
paddq 8(%esi),%mm1
paddq 16(%esi),%mm2
paddq 24(%esi),%mm3
paddq 32(%esi),%mm4
paddq 40(%esi),%mm5
paddq 48(%esi),%mm6
paddq 56(%esi),%mm7
movq %mm0,(%esi)
movq %mm1,8(%esi)
movq %mm2,16(%esi)
movq %mm3,24(%esi)
movq %mm4,32(%esi)
movq %mm5,40(%esi)
movq %mm6,48(%esi)
movq %mm7,56(%esi)
addl $640,%esp
subl $640,%ebp
cmpl 88(%esp),%edi
jb .L003loop_sse2
emms
movl 92(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.align 16
.L002loop_x86:
movl (%edi),%eax
movl 4(%edi),%ebx
movl 8(%edi),%ecx
movl 12(%edi),%edx
bswap %eax
bswap %ebx
bswap %ecx
bswap %edx
pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
movl 16(%edi),%eax
movl 20(%edi),%ebx
movl 24(%edi),%ecx
movl 28(%edi),%edx
bswap %eax
bswap %ebx
bswap %ecx
bswap %edx
pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
movl 32(%edi),%eax
movl 36(%edi),%ebx
movl 40(%edi),%ecx
movl 44(%edi),%edx
bswap %eax
bswap %ebx
bswap %ecx
bswap %edx
pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
movl 48(%edi),%eax
movl 52(%edi),%ebx
movl 56(%edi),%ecx
movl 60(%edi),%edx
bswap %eax
bswap %ebx
bswap %ecx
bswap %edx
pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
movl 64(%edi),%eax
movl 68(%edi),%ebx
movl 72(%edi),%ecx
movl 76(%edi),%edx
bswap %eax
bswap %ebx
bswap %ecx
bswap %edx
pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
movl 80(%edi),%eax
movl 84(%edi),%ebx
movl 88(%edi),%ecx
movl 92(%edi),%edx
bswap %eax
bswap %ebx
bswap %ecx
bswap %edx
pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
movl 96(%edi),%eax
movl 100(%edi),%ebx
movl 104(%edi),%ecx
movl 108(%edi),%edx
bswap %eax
bswap %ebx
bswap %ecx
bswap %edx
pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
movl 112(%edi),%eax
movl 116(%edi),%ebx
movl 120(%edi),%ecx
movl 124(%edi),%edx
bswap %eax
bswap %ebx
bswap %ecx
bswap %edx
pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
addl $128,%edi
subl $72,%esp
movl %edi,204(%esp)
leal 8(%esp),%edi
movl $16,%ecx
.long 2784229001 # rep movsd (same 0xA5F3F689 dword the sha512-586.pl generator emits)
.align 16
.L00600_15_x86:
movl 40(%esp),%ecx
movl 44(%esp),%edx
movl %ecx,%esi
shrl $9,%ecx
movl %edx,%edi
shrl $9,%edx
movl %ecx,%ebx
shll $14,%esi
movl %edx,%eax
shll $14,%edi
xorl %esi,%ebx
shrl $5,%ecx
xorl %edi,%eax
shrl $5,%edx
xorl %ecx,%eax
shll $4,%esi
xorl %edx,%ebx
shll $4,%edi
xorl %esi,%ebx
shrl $4,%ecx
xorl %edi,%eax
shrl $4,%edx
xorl %ecx,%eax
shll $5,%esi
xorl %edx,%ebx
shll $5,%edi
xorl %esi,%eax
xorl %edi,%ebx
movl 48(%esp),%ecx
movl 52(%esp),%edx
movl 56(%esp),%esi
movl 60(%esp),%edi
addl 64(%esp),%eax
adcl 68(%esp),%ebx
xorl %esi,%ecx
xorl %edi,%edx
andl 40(%esp),%ecx
andl 44(%esp),%edx
addl 192(%esp),%eax
adcl 196(%esp),%ebx
xorl %esi,%ecx
xorl %edi,%edx
movl (%ebp),%esi
movl 4(%ebp),%edi
addl %ecx,%eax
adcl %edx,%ebx
movl 32(%esp),%ecx
movl 36(%esp),%edx
addl %esi,%eax
adcl %edi,%ebx
movl %eax,(%esp)
movl %ebx,4(%esp)
addl %ecx,%eax
adcl %edx,%ebx
movl 8(%esp),%ecx
movl 12(%esp),%edx
movl %eax,32(%esp)
movl %ebx,36(%esp)
movl %ecx,%esi
shrl $2,%ecx
movl %edx,%edi
shrl $2,%edx
movl %ecx,%ebx
shll $4,%esi
movl %edx,%eax
shll $4,%edi
xorl %esi,%ebx
shrl $5,%ecx
xorl %edi,%eax
shrl $5,%edx
xorl %ecx,%ebx
shll $21,%esi
xorl %edx,%eax
shll $21,%edi
xorl %esi,%eax
shrl $21,%ecx
xorl %edi,%ebx
shrl $21,%edx
xorl %ecx,%eax
shll $5,%esi
xorl %edx,%ebx
shll $5,%edi
xorl %esi,%eax
xorl %edi,%ebx
movl 8(%esp),%ecx
movl 12(%esp),%edx
movl 16(%esp),%esi
movl 20(%esp),%edi
addl (%esp),%eax
adcl 4(%esp),%ebx
orl %esi,%ecx
orl %edi,%edx
andl 24(%esp),%ecx
andl 28(%esp),%edx
andl 8(%esp),%esi
andl 12(%esp),%edi
orl %esi,%ecx
orl %edi,%edx
addl %ecx,%eax
adcl %edx,%ebx
movl %eax,(%esp)
movl %ebx,4(%esp)
movb (%ebp),%dl
subl $8,%esp
leal 8(%ebp),%ebp
cmpb $148,%dl
jne .L00600_15_x86
.align 16
.L00716_79_x86:
movl 312(%esp),%ecx
movl 316(%esp),%edx
movl %ecx,%esi
shrl $1,%ecx
movl %edx,%edi
shrl $1,%edx
movl %ecx,%eax
shll $24,%esi
movl %edx,%ebx
shll $24,%edi
xorl %esi,%ebx
shrl $6,%ecx
xorl %edi,%eax
shrl $6,%edx
xorl %ecx,%eax
shll $7,%esi
xorl %edx,%ebx
shll $1,%edi
xorl %esi,%ebx
shrl $1,%ecx
xorl %edi,%eax
shrl $1,%edx
xorl %ecx,%eax
shll $6,%edi
xorl %edx,%ebx
xorl %edi,%eax
movl %eax,(%esp)
movl %ebx,4(%esp)
movl 208(%esp),%ecx
movl 212(%esp),%edx
movl %ecx,%esi
shrl $6,%ecx
movl %edx,%edi
shrl $6,%edx
movl %ecx,%eax
shll $3,%esi
movl %edx,%ebx
shll $3,%edi
xorl %esi,%eax
shrl $13,%ecx
xorl %edi,%ebx
shrl $13,%edx
xorl %ecx,%eax
shll $10,%esi
xorl %edx,%ebx
shll $10,%edi
xorl %esi,%ebx
shrl $10,%ecx
xorl %edi,%eax
shrl $10,%edx
xorl %ecx,%ebx
shll $13,%edi
xorl %edx,%eax
xorl %edi,%eax
movl 320(%esp),%ecx
movl 324(%esp),%edx
addl (%esp),%eax
adcl 4(%esp),%ebx
movl 248(%esp),%esi
movl 252(%esp),%edi
addl %ecx,%eax
adcl %edx,%ebx
addl %esi,%eax
adcl %edi,%ebx
movl %eax,192(%esp)
movl %ebx,196(%esp)
movl 40(%esp),%ecx
movl 44(%esp),%edx
movl %ecx,%esi
shrl $9,%ecx
movl %edx,%edi
shrl $9,%edx
movl %ecx,%ebx
shll $14,%esi
movl %edx,%eax
shll $14,%edi
xorl %esi,%ebx
shrl $5,%ecx
xorl %edi,%eax
shrl $5,%edx
xorl %ecx,%eax
shll $4,%esi
xorl %edx,%ebx
shll $4,%edi
xorl %esi,%ebx
shrl $4,%ecx
xorl %edi,%eax
shrl $4,%edx
xorl %ecx,%eax
shll $5,%esi
xorl %edx,%ebx
shll $5,%edi
xorl %esi,%eax
xorl %edi,%ebx
movl 48(%esp),%ecx
movl 52(%esp),%edx
movl 56(%esp),%esi
movl 60(%esp),%edi
addl 64(%esp),%eax
adcl 68(%esp),%ebx
xorl %esi,%ecx
xorl %edi,%edx
andl 40(%esp),%ecx
andl 44(%esp),%edx
addl 192(%esp),%eax
adcl 196(%esp),%ebx
xorl %esi,%ecx
xorl %edi,%edx
movl (%ebp),%esi
movl 4(%ebp),%edi
addl %ecx,%eax
adcl %edx,%ebx
movl 32(%esp),%ecx
movl 36(%esp),%edx
addl %esi,%eax
adcl %edi,%ebx
movl %eax,(%esp)
movl %ebx,4(%esp)
addl %ecx,%eax
adcl %edx,%ebx
movl 8(%esp),%ecx
movl 12(%esp),%edx
movl %eax,32(%esp)
movl %ebx,36(%esp)
movl %ecx,%esi
shrl $2,%ecx
movl %edx,%edi
shrl $2,%edx
movl %ecx,%ebx
shll $4,%esi
movl %edx,%eax
shll $4,%edi
xorl %esi,%ebx
shrl $5,%ecx
xorl %edi,%eax
shrl $5,%edx
xorl %ecx,%ebx
shll $21,%esi
xorl %edx,%eax
shll $21,%edi
xorl %esi,%eax
shrl $21,%ecx
xorl %edi,%ebx
shrl $21,%edx
xorl %ecx,%eax
shll $5,%esi
xorl %edx,%ebx
shll $5,%edi
xorl %esi,%eax
xorl %edi,%ebx
movl 8(%esp),%ecx
movl 12(%esp),%edx
movl 16(%esp),%esi
movl 20(%esp),%edi
addl (%esp),%eax
adcl 4(%esp),%ebx
orl %esi,%ecx
orl %edi,%edx
andl 24(%esp),%ecx
andl 28(%esp),%edx
andl 8(%esp),%esi
andl 12(%esp),%edi
orl %esi,%ecx
orl %edi,%edx
addl %ecx,%eax
adcl %edx,%ebx
movl %eax,(%esp)
movl %ebx,4(%esp)
movb (%ebp),%dl
subl $8,%esp
leal 8(%ebp),%ebp
cmpb $23,%dl
jne .L00716_79_x86
movl 840(%esp),%esi
movl 844(%esp),%edi
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
movl 12(%esi),%edx
addl 8(%esp),%eax
adcl 12(%esp),%ebx
movl %eax,(%esi)
movl %ebx,4(%esi)
addl 16(%esp),%ecx
adcl 20(%esp),%edx
movl %ecx,8(%esi)
movl %edx,12(%esi)
movl 16(%esi),%eax
movl 20(%esi),%ebx
movl 24(%esi),%ecx
movl 28(%esi),%edx
addl 24(%esp),%eax
adcl 28(%esp),%ebx
movl %eax,16(%esi)
movl %ebx,20(%esi)
addl 32(%esp),%ecx
adcl 36(%esp),%edx
movl %ecx,24(%esi)
movl %edx,28(%esi)
movl 32(%esi),%eax
movl 36(%esi),%ebx
movl 40(%esi),%ecx
movl 44(%esi),%edx
addl 40(%esp),%eax
adcl 44(%esp),%ebx
movl %eax,32(%esi)
movl %ebx,36(%esi)
addl 48(%esp),%ecx
adcl 52(%esp),%edx
movl %ecx,40(%esi)
movl %edx,44(%esi)
movl 48(%esi),%eax
movl 52(%esi),%ebx
movl 56(%esi),%ecx
movl 60(%esi),%edx
addl 56(%esp),%eax
adcl 60(%esp),%ebx
movl %eax,48(%esi)
movl %ebx,52(%esi)
addl 64(%esp),%ecx
adcl 68(%esp),%edx
movl %ecx,56(%esi)
movl %edx,60(%esi)
addl $840,%esp
subl $640,%ebp
cmpl 8(%esp),%edi
jb .L002loop_x86
movl 12(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.align 64
.L001K512:
.long 3609767458,1116352408
.long 602891725,1899447441
.long 3964484399,3049323471
.long 2173295548,3921009573
.long 4081628472,961987163
.long 3053834265,1508970993
.long 2937671579,2453635748
.long 3664609560,2870763221
.long 2734883394,3624381080
.long 1164996542,310598401
.long 1323610764,607225278
.long 3590304994,1426881987
.long 4068182383,1925078388
.long 991336113,2162078206
.long 633803317,2614888103
.long 3479774868,3248222580
.long 2666613458,3835390401
.long 944711139,4022224774
.long 2341262773,264347078
.long 2007800933,604807628
.long 1495990901,770255983
.long 1856431235,1249150122
.long 3175218132,1555081692
.long 2198950837,1996064986
.long 3999719339,2554220882
.long 766784016,2821834349
.long 2566594879,2952996808
.long 3203337956,3210313671
.long 1034457026,3336571891
.long 2466948901,3584528711
.long 3758326383,113926993
.long 168717936,338241895
.long 1188179964,666307205
.long 1546045734,773529912
.long 1522805485,1294757372
.long 2643833823,1396182291
.long 2343527390,1695183700
.long 1014477480,1986661051
.long 1206759142,2177026350
.long 344077627,2456956037
.long 1290863460,2730485921
.long 3158454273,2820302411
.long 3505952657,3259730800
.long 106217008,3345764771
.long 3606008344,3516065817
.long 1432725776,3600352804
.long 1467031594,4094571909
.long 851169720,275423344
.long 3100823752,430227734
.long 1363258195,506948616
.long 3750685593,659060556
.long 3785050280,883997877
.long 3318307427,958139571
.long 3812723403,1322822218
.long 2003034995,1537002063
.long 3602036899,1747873779
.long 1575990012,1955562222
.long 1125592928,2024104815
.long 2716904306,2227730452
.long 442776044,2361852424
.long 593698344,2428436474
.long 3733110249,2756734187
.long 2999351573,3204031479
.long 3815920427,3329325298
.long 3928383900,3391569614
.long 566280711,3515267271
.long 3454069534,3940187606
.long 4000239992,4118630271
.long 1914138554,116418474
.long 2731055270,174292421
.long 3203993006,289380356
.long 320620315,460393269
.long 587496836,685471733
.long 1086792851,852142971
.long 365543100,1017036298
.long 2618297676,1126000580
.long 3409855158,1288033470
.long 4234509866,1501505948
.long 987167468,1607167915
.long 1246189591,1816402316
.size sha512_block_data_order,.-.L_sha512_block_data_order_begin
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
.comm OPENSSL_ia32cap_P,8,4


@@ -0,0 +1,644 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA512 block transform for x86. September 2007.
#
# Performance in clock cycles per processed byte (less is better):
#
# Pentium PIII P4 AMD K8 Core2
# gcc 100 75 116 54 66
# icc 97 77 95 55 57
# x86 asm 61 56 82 36 40
# SSE2 asm - - 38 24 20
# x86_64 asm(*) - - 30 10.0 10.5
#
# (*) x86_64 assembler performance is presented for reference
# purposes.
#
# IALU code-path is optimized for older Pentiums. On vanilla Pentium
# performance improvement over compiler-generated code reaches ~60%,
# while on PIII it is ~35%. On newer µ-archs improvement varies from 15%
# to 50%, but it's less important as they are expected to execute SSE2
# code-path, which is commonly ~2-3x faster [than compiler generated
# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even
# though it does not use 128-bit operations. The latter means that
# SSE2-aware kernel is no longer required to execute the code. Another
# difference is that the new code optimizes the number of writes, at the
# cost of a data cache "footprint" increased by 1/2KB.
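# A minimal reference sketch (illustrative only, never called by this
# generator; assumes 64-bit Perl integers): Sigma1_512 evaluated on a value
# carried as two 32-bit halves, mirroring the LO/HI shift schedules that
# BODY_00_15_x86 below interleaves with the add/adc chains.
sub _ref_Sigma1_512 {
my ($hi,$lo)=@_; # one 64-bit word as hi:lo halves
my $rotr = sub { # 64-bit ROTR on a (hi,lo) pair
my ($h,$l,$n)=@_;
($h,$l,$n)=($l,$h,$n-32) if ($n>=32); # rotation by 32 swaps halves
((($h>>$n)|($l<<(32-$n)))&0xffffffff,
(($l>>$n)|($h<<(32-$n)))&0xffffffff);
};
my ($h14,$l14)=$rotr->($hi,$lo,14);
my ($h18,$l18)=$rotr->($hi,$lo,18);
my ($h41,$l41)=$rotr->($hi,$lo,41);
($h14^$h18^$h41,$l14^$l18^$l41); # ROTR14 ^ ROTR18 ^ ROTR41
}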
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp");
$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp");
$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp");
$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp");
$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp");
$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp");
$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp");
$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp");
$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp");
$K512="ebp";
$Asse2=&QWP(0,"esp");
$Bsse2=&QWP(8,"esp");
$Csse2=&QWP(16,"esp");
$Dsse2=&QWP(24,"esp");
$Esse2=&QWP(32,"esp");
$Fsse2=&QWP(40,"esp");
$Gsse2=&QWP(48,"esp");
$Hsse2=&QWP(56,"esp");
$A="mm0"; # B-D and
$E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and
# mm5-mm7, but it's done on an on-demand basis...
sub BODY_00_15_sse2 {
my $prefetch=shift;
&movq ("mm5",$Fsse2); # load f
&movq ("mm6",$Gsse2); # load g
&movq ("mm7",$Hsse2); # load h
&movq ("mm1",$E); # %mm1 is sliding right
&movq ("mm2",$E); # %mm2 is sliding left
&psrlq ("mm1",14);
&movq ($Esse2,$E); # modulo-scheduled save e
&psllq ("mm2",23);
&movq ("mm3","mm1"); # %mm3 is T1
&psrlq ("mm1",4);
&pxor ("mm3","mm2");
&psllq ("mm2",23);
&pxor ("mm3","mm1");
&psrlq ("mm1",23);
&pxor ("mm3","mm2");
&psllq ("mm2",4);
&pxor ("mm3","mm1");
&paddq ("mm7",QWP(0,$K512)); # h+=K512[i]
&pxor ("mm3","mm2"); # T1=Sigma1_512(e)
&pxor ("mm5","mm6"); # f^=g
&movq ("mm1",$Bsse2); # load b
&pand ("mm5",$E); # f&=e
&movq ("mm2",$Csse2); # load c
&pxor ("mm5","mm6"); # f^=g
&movq ($E,$Dsse2); # e = load d
&paddq ("mm3","mm5"); # T1+=Ch(e,f,g)
&movq (&QWP(0,"esp"),$A); # modulo-scheduled save a
&paddq ("mm3","mm7"); # T1+=h
&movq ("mm5",$A); # %mm5 is sliding right
&movq ("mm6",$A); # %mm6 is sliding left
&paddq ("mm3",&QWP(8*9,"esp")); # T1+=X[0]
&psrlq ("mm5",28);
&paddq ($E,"mm3"); # e += T1
&psllq ("mm6",25);
&movq ("mm7","mm5"); # %mm7 is T2
&psrlq ("mm5",6);
&pxor ("mm7","mm6");
&psllq ("mm6",5);
&pxor ("mm7","mm5");
&psrlq ("mm5",5);
&pxor ("mm7","mm6");
&psllq ("mm6",6);
&pxor ("mm7","mm5");
&sub ("esp",8);
&pxor ("mm7","mm6"); # T2=Sigma0_512(a)
&movq ("mm5",$A); # %mm5=a
&por ($A,"mm2"); # a=a|c
&movq ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch);
&pand ("mm5","mm2"); # %mm5=a&c
&pand ($A,"mm1"); # a=(a|c)&b
&movq ("mm2",&QWP(8*(9+16-1),"esp")) if ($prefetch);
&por ("mm5",$A); # %mm5=(a&c)|((a|c)&b)
&paddq ("mm7","mm5"); # T2+=Maj(a,b,c)
&movq ($A,"mm3"); # a=T1
&mov (&LB("edx"),&BP(0,$K512));
&paddq ($A,"mm7"); # a+=T2
&add ($K512,8);
}
sub BODY_00_15_x86 {
#define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
# LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
# HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
&mov ("ecx",$Elo);
&mov ("edx",$Ehi);
&mov ("esi","ecx");
&shr ("ecx",9); # lo>>9
&mov ("edi","edx");
&shr ("edx",9); # hi>>9
&mov ("ebx","ecx");
&shl ("esi",14); # lo<<14
&mov ("eax","edx");
&shl ("edi",14); # hi<<14
&xor ("ebx","esi");
&shr ("ecx",14-9); # lo>>14
&xor ("eax","edi");
&shr ("edx",14-9); # hi>>14
&xor ("eax","ecx");
&shl ("esi",18-14); # lo<<18
&xor ("ebx","edx");
&shl ("edi",18-14); # hi<<18
&xor ("ebx","esi");
&shr ("ecx",18-14); # lo>>18
&xor ("eax","edi");
&shr ("edx",18-14); # hi>>18
&xor ("eax","ecx");
&shl ("esi",23-18); # lo<<23
&xor ("ebx","edx");
&shl ("edi",23-18); # hi<<23
&xor ("eax","esi");
&xor ("ebx","edi"); # T1 = Sigma1(e)
&mov ("ecx",$Flo);
&mov ("edx",$Fhi);
&mov ("esi",$Glo);
&mov ("edi",$Ghi);
&add ("eax",$Hlo);
&adc ("ebx",$Hhi); # T1 += h
&xor ("ecx","esi");
&xor ("edx","edi");
&and ("ecx",$Elo);
&and ("edx",$Ehi);
&add ("eax",&DWP(8*(9+15)+0,"esp"));
&adc ("ebx",&DWP(8*(9+15)+4,"esp")); # T1 += X[0]
&xor ("ecx","esi");
&xor ("edx","edi"); # Ch(e,f,g) = (f^g)&e)^g
&mov ("esi",&DWP(0,$K512));
&mov ("edi",&DWP(4,$K512)); # K[i]
&add ("eax","ecx");
&adc ("ebx","edx"); # T1 += Ch(e,f,g)
&mov ("ecx",$Dlo);
&mov ("edx",$Dhi);
&add ("eax","esi");
&adc ("ebx","edi"); # T1 += K[i]
&mov ($Tlo,"eax");
&mov ($Thi,"ebx"); # put T1 away
&add ("eax","ecx");
&adc ("ebx","edx"); # d += T1
#define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
# LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
# HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
&mov ("ecx",$Alo);
&mov ("edx",$Ahi);
&mov ($Dlo,"eax");
&mov ($Dhi,"ebx");
&mov ("esi","ecx");
&shr ("ecx",2); # lo>>2
&mov ("edi","edx");
&shr ("edx",2); # hi>>2
&mov ("ebx","ecx");
&shl ("esi",4); # lo<<4
&mov ("eax","edx");
&shl ("edi",4); # hi<<4
&xor ("ebx","esi");
&shr ("ecx",7-2); # lo>>7
&xor ("eax","edi");
&shr ("edx",7-2); # hi>>7
&xor ("ebx","ecx");
&shl ("esi",25-4); # lo<<25
&xor ("eax","edx");
&shl ("edi",25-4); # hi<<25
&xor ("eax","esi");
&shr ("ecx",28-7); # lo>>28
&xor ("ebx","edi");
&shr ("edx",28-7); # hi>>28
&xor ("eax","ecx");
&shl ("esi",30-25); # lo<<30
&xor ("ebx","edx");
&shl ("edi",30-25); # hi<<30
&xor ("eax","esi");
&xor ("ebx","edi"); # Sigma0(a)
&mov ("ecx",$Alo);
&mov ("edx",$Ahi);
&mov ("esi",$Blo);
&mov ("edi",$Bhi);
&add ("eax",$Tlo);
&adc ("ebx",$Thi); # T1 = Sigma0(a)+T1
&or ("ecx","esi");
&or ("edx","edi");
&and ("ecx",$Clo);
&and ("edx",$Chi);
&and ("esi",$Alo);
&and ("edi",$Ahi);
&or ("ecx","esi");
&or ("edx","edi"); # Maj(a,b,c) = ((a|b)&c)|(a&b)
&add ("eax","ecx");
&adc ("ebx","edx"); # T1 += Maj(a,b,c)
&mov ($Tlo,"eax");
&mov ($Thi,"ebx");
&mov (&LB("edx"),&BP(0,$K512)); # pre-fetch LSB of *K
&sub ("esp",8);
&lea ($K512,&DWP(8,$K512)); # K++
}
&function_begin("sha512_block_data_order");
&mov ("esi",wparam(0)); # ctx
&mov ("edi",wparam(1)); # inp
&mov ("eax",wparam(2)); # num
&mov ("ebx","esp"); # saved sp
&call (&label("pic_point")); # make it PIC!
&set_label("pic_point");
&blindpop($K512);
&lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512));
&sub ("esp",16);
&and ("esp",-64);
&shl ("eax",7);
&add ("eax","edi");
&mov (&DWP(0,"esp"),"esi"); # ctx
&mov (&DWP(4,"esp"),"edi"); # inp
&mov (&DWP(8,"esp"),"eax"); # inp+num*128
&mov (&DWP(12,"esp"),"ebx"); # saved sp
if ($sse2) {
&picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
&bt (&DWP(0,"edx"),26);
&jnc (&label("loop_x86"));
# load ctx->h[0-7]
&movq ($A,&QWP(0,"esi"));
&movq ("mm1",&QWP(8,"esi"));
&movq ("mm2",&QWP(16,"esi"));
&movq ("mm3",&QWP(24,"esi"));
&movq ($E,&QWP(32,"esi"));
&movq ("mm5",&QWP(40,"esi"));
&movq ("mm6",&QWP(48,"esi"));
&movq ("mm7",&QWP(56,"esi"));
&sub ("esp",8*10);
&set_label("loop_sse2",16);
# &movq ($Asse2,$A);
&movq ($Bsse2,"mm1");
&movq ($Csse2,"mm2");
&movq ($Dsse2,"mm3");
# &movq ($Esse2,$E);
&movq ($Fsse2,"mm5");
&movq ($Gsse2,"mm6");
&movq ($Hsse2,"mm7");
&mov ("ecx",&DWP(0,"edi"));
&mov ("edx",&DWP(4,"edi"));
&add ("edi",8);
&bswap ("ecx");
&bswap ("edx");
&mov (&DWP(8*9+4,"esp"),"ecx");
&mov (&DWP(8*9+0,"esp"),"edx");
&set_label("00_14_sse2",16);
&mov ("eax",&DWP(0,"edi"));
&mov ("ebx",&DWP(4,"edi"));
&add ("edi",8);
&bswap ("eax");
&bswap ("ebx");
&mov (&DWP(8*8+4,"esp"),"eax");
&mov (&DWP(8*8+0,"esp"),"ebx");
&BODY_00_15_sse2();
&cmp (&LB("edx"),0x35);
&jne (&label("00_14_sse2"));
&BODY_00_15_sse2(1);
&set_label("16_79_sse2",16);
#&movq ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15
#&movq ("mm6",&QWP(8*(9+16-14),"esp"));
&movq ("mm1","mm2");
&psrlq ("mm2",1);
&movq ("mm7","mm6");
&psrlq ("mm6",6);
&movq ("mm3","mm2");
&psrlq ("mm2",7-1);
&movq ("mm5","mm6");
&psrlq ("mm6",19-6);
&pxor ("mm3","mm2");
&psrlq ("mm2",8-7);
&pxor ("mm5","mm6");
&psrlq ("mm6",61-19);
&pxor ("mm3","mm2");
&movq ("mm2",&QWP(8*(9+16),"esp"));
&psllq ("mm1",56);
&pxor ("mm5","mm6");
&psllq ("mm7",3);
&pxor ("mm3","mm1");
&paddq ("mm2",&QWP(8*(9+16-9),"esp"));
&psllq ("mm1",63-56);
&pxor ("mm5","mm7");
&psllq ("mm7",45-3);
&pxor ("mm3","mm1");
&pxor ("mm5","mm7");
&paddq ("mm3","mm5");
&paddq ("mm3","mm2");
&movq (&QWP(8*9,"esp"),"mm3");
&BODY_00_15_sse2(1);
&cmp (&LB("edx"),0x17);
&jne (&label("16_79_sse2"));
# &movq ($A,$Asse2);
&movq ("mm1",$Bsse2);
&movq ("mm2",$Csse2);
&movq ("mm3",$Dsse2);
# &movq ($E,$Esse2);
&movq ("mm5",$Fsse2);
&movq ("mm6",$Gsse2);
&movq ("mm7",$Hsse2);
&paddq ($A,&QWP(0,"esi"));
&paddq ("mm1",&QWP(8,"esi"));
&paddq ("mm2",&QWP(16,"esi"));
&paddq ("mm3",&QWP(24,"esi"));
&paddq ($E,&QWP(32,"esi"));
&paddq ("mm5",&QWP(40,"esi"));
&paddq ("mm6",&QWP(48,"esi"));
&paddq ("mm7",&QWP(56,"esi"));
&movq (&QWP(0,"esi"),$A);
&movq (&QWP(8,"esi"),"mm1");
&movq (&QWP(16,"esi"),"mm2");
&movq (&QWP(24,"esi"),"mm3");
&movq (&QWP(32,"esi"),$E);
&movq (&QWP(40,"esi"),"mm5");
&movq (&QWP(48,"esi"),"mm6");
&movq (&QWP(56,"esi"),"mm7");
&add ("esp",8*80); # destroy frame
&sub ($K512,8*80); # rewind K
&cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet?
&jb (&label("loop_sse2"));
&emms ();
&mov ("esp",&DWP(8*10+12,"esp")); # restore sp
&function_end_A();
}
&set_label("loop_x86",16);
# copy input block to stack reversing byte and qword order
for ($i=0;$i<8;$i++) {
&mov ("eax",&DWP($i*16+0,"edi"));
&mov ("ebx",&DWP($i*16+4,"edi"));
&mov ("ecx",&DWP($i*16+8,"edi"));
&mov ("edx",&DWP($i*16+12,"edi"));
&bswap ("eax");
&bswap ("ebx");
&bswap ("ecx");
&bswap ("edx");
&push ("eax");
&push ("ebx");
&push ("ecx");
&push ("edx");
}
&add ("edi",128);
&sub ("esp",9*8); # place for T,A,B,C,D,E,F,G,H
&mov (&DWP(8*(9+16)+4,"esp"),"edi");
# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
&lea ("edi",&DWP(8,"esp"));
&mov ("ecx",16);
&data_word(0xA5F3F689); # rep movsd
&set_label("00_15_x86",16);
&BODY_00_15_x86();
&cmp (&LB("edx"),0x94);
&jne (&label("00_15_x86"));
&set_label("16_79_x86",16);
#define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
# LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
# HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
&mov ("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
&mov ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
&mov ("esi","ecx");
&shr ("ecx",1); # lo>>1
&mov ("edi","edx");
&shr ("edx",1); # hi>>1
&mov ("eax","ecx");
&shl ("esi",24); # lo<<24
&mov ("ebx","edx");
&shl ("edi",24); # hi<<24
&xor ("ebx","esi");
&shr ("ecx",7-1); # lo>>7
&xor ("eax","edi");
&shr ("edx",7-1); # hi>>7
&xor ("eax","ecx");
&shl ("esi",31-24); # lo<<31
&xor ("ebx","edx");
&shl ("edi",25-24); # hi<<25
&xor ("ebx","esi");
&shr ("ecx",8-7); # lo>>8
&xor ("eax","edi");
&shr ("edx",8-7); # hi>>8
&xor ("eax","ecx");
&shl ("edi",31-25); # hi<<31
&xor ("ebx","edx");
&xor ("eax","edi"); # T1 = sigma0(X[-15])
&mov (&DWP(0,"esp"),"eax");
&mov (&DWP(4,"esp"),"ebx"); # put T1 away
#define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
# LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
# HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
&mov ("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
&mov ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
&mov ("esi","ecx");
&shr ("ecx",6); # lo>>6
&mov ("edi","edx");
&shr ("edx",6); # hi>>6
&mov ("eax","ecx");
&shl ("esi",3); # lo<<3
&mov ("ebx","edx");
&shl ("edi",3); # hi<<3
&xor ("eax","esi");
&shr ("ecx",19-6); # lo>>19
&xor ("ebx","edi");
&shr ("edx",19-6); # hi>>19
&xor ("eax","ecx");
&shl ("esi",13-3); # lo<<13
&xor ("ebx","edx");
&shl ("edi",13-3); # hi<<13
&xor ("ebx","esi");
&shr ("ecx",29-19); # lo>>29
&xor ("eax","edi");
&shr ("edx",29-19); # hi>>29
&xor ("ebx","ecx");
&shl ("edi",26-13); # hi<<26
&xor ("eax","edx");
&xor ("eax","edi"); # sigma1(X[-2])
&mov ("ecx",&DWP(8*(9+15+16)+0,"esp"));
&mov ("edx",&DWP(8*(9+15+16)+4,"esp"));
&add ("eax",&DWP(0,"esp"));
&adc ("ebx",&DWP(4,"esp")); # T1 = sigma1(X[-2])+T1
&mov ("esi",&DWP(8*(9+15+16-9)+0,"esp"));
&mov ("edi",&DWP(8*(9+15+16-9)+4,"esp"));
&add ("eax","ecx");
&adc ("ebx","edx"); # T1 += X[-16]
&add ("eax","esi");
&adc ("ebx","edi"); # T1 += X[-7]
&mov (&DWP(8*(9+15)+0,"esp"),"eax");
&mov (&DWP(8*(9+15)+4,"esp"),"ebx"); # save X[0]
&BODY_00_15_x86();
&cmp (&LB("edx"),0x17);
&jne (&label("16_79_x86"));
&mov ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
&mov ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
for($i=0;$i<4;$i++) {
&mov ("eax",&DWP($i*16+0,"esi"));
&mov ("ebx",&DWP($i*16+4,"esi"));
&mov ("ecx",&DWP($i*16+8,"esi"));
&mov ("edx",&DWP($i*16+12,"esi"));
&add ("eax",&DWP(8+($i*16)+0,"esp"));
&adc ("ebx",&DWP(8+($i*16)+4,"esp"));
&mov (&DWP($i*16+0,"esi"),"eax");
&mov (&DWP($i*16+4,"esi"),"ebx");
&add ("ecx",&DWP(8+($i*16)+8,"esp"));
&adc ("edx",&DWP(8+($i*16)+12,"esp"));
&mov (&DWP($i*16+8,"esi"),"ecx");
&mov (&DWP($i*16+12,"esi"),"edx");
}
&add ("esp",8*(9+16+80)); # destroy frame
&sub ($K512,8*80); # rewind K
&cmp ("edi",&DWP(8,"esp")); # are we done yet?
&jb (&label("loop_x86"));
&mov ("esp",&DWP(12,"esp")); # restore sp
&function_end_A();
&set_label("K512",64); # Yes! I keep it in the code segment!
&data_word(0xd728ae22,0x428a2f98); # u64
&data_word(0x23ef65cd,0x71374491); # u64
&data_word(0xec4d3b2f,0xb5c0fbcf); # u64
&data_word(0x8189dbbc,0xe9b5dba5); # u64
&data_word(0xf348b538,0x3956c25b); # u64
&data_word(0xb605d019,0x59f111f1); # u64
&data_word(0xaf194f9b,0x923f82a4); # u64
&data_word(0xda6d8118,0xab1c5ed5); # u64
&data_word(0xa3030242,0xd807aa98); # u64
&data_word(0x45706fbe,0x12835b01); # u64
&data_word(0x4ee4b28c,0x243185be); # u64
&data_word(0xd5ffb4e2,0x550c7dc3); # u64
&data_word(0xf27b896f,0x72be5d74); # u64
&data_word(0x3b1696b1,0x80deb1fe); # u64
&data_word(0x25c71235,0x9bdc06a7); # u64
&data_word(0xcf692694,0xc19bf174); # u64
&data_word(0x9ef14ad2,0xe49b69c1); # u64
&data_word(0x384f25e3,0xefbe4786); # u64
&data_word(0x8b8cd5b5,0x0fc19dc6); # u64
&data_word(0x77ac9c65,0x240ca1cc); # u64
&data_word(0x592b0275,0x2de92c6f); # u64
&data_word(0x6ea6e483,0x4a7484aa); # u64
&data_word(0xbd41fbd4,0x5cb0a9dc); # u64
&data_word(0x831153b5,0x76f988da); # u64
&data_word(0xee66dfab,0x983e5152); # u64
&data_word(0x2db43210,0xa831c66d); # u64
&data_word(0x98fb213f,0xb00327c8); # u64
&data_word(0xbeef0ee4,0xbf597fc7); # u64
&data_word(0x3da88fc2,0xc6e00bf3); # u64
&data_word(0x930aa725,0xd5a79147); # u64
&data_word(0xe003826f,0x06ca6351); # u64
&data_word(0x0a0e6e70,0x14292967); # u64
&data_word(0x46d22ffc,0x27b70a85); # u64
&data_word(0x5c26c926,0x2e1b2138); # u64
&data_word(0x5ac42aed,0x4d2c6dfc); # u64
&data_word(0x9d95b3df,0x53380d13); # u64
&data_word(0x8baf63de,0x650a7354); # u64
&data_word(0x3c77b2a8,0x766a0abb); # u64
&data_word(0x47edaee6,0x81c2c92e); # u64
&data_word(0x1482353b,0x92722c85); # u64
&data_word(0x4cf10364,0xa2bfe8a1); # u64
&data_word(0xbc423001,0xa81a664b); # u64
&data_word(0xd0f89791,0xc24b8b70); # u64
&data_word(0x0654be30,0xc76c51a3); # u64
&data_word(0xd6ef5218,0xd192e819); # u64
&data_word(0x5565a910,0xd6990624); # u64
&data_word(0x5771202a,0xf40e3585); # u64
&data_word(0x32bbd1b8,0x106aa070); # u64
&data_word(0xb8d2d0c8,0x19a4c116); # u64
&data_word(0x5141ab53,0x1e376c08); # u64
&data_word(0xdf8eeb99,0x2748774c); # u64
&data_word(0xe19b48a8,0x34b0bcb5); # u64
&data_word(0xc5c95a63,0x391c0cb3); # u64
&data_word(0xe3418acb,0x4ed8aa4a); # u64
&data_word(0x7763e373,0x5b9cca4f); # u64
&data_word(0xd6b2b8a3,0x682e6ff3); # u64
&data_word(0x5defb2fc,0x748f82ee); # u64
&data_word(0x43172f60,0x78a5636f); # u64
&data_word(0xa1f0ab72,0x84c87814); # u64
&data_word(0x1a6439ec,0x8cc70208); # u64
&data_word(0x23631e28,0x90befffa); # u64
&data_word(0xde82bde9,0xa4506ceb); # u64
&data_word(0xb2c67915,0xbef9a3f7); # u64
&data_word(0xe372532b,0xc67178f2); # u64
&data_word(0xea26619c,0xca273ece); # u64
&data_word(0x21c0c207,0xd186b8c7); # u64
&data_word(0xcde0eb1e,0xeada7dd6); # u64
&data_word(0xee6ed178,0xf57d4f7f); # u64
&data_word(0x72176fba,0x06f067aa); # u64
&data_word(0xa2c898a6,0x0a637dc5); # u64
&data_word(0xbef90dae,0x113f9804); # u64
&data_word(0x131c471b,0x1b710b35); # u64
&data_word(0x23047d84,0x28db77f5); # u64
&data_word(0x40c72493,0x32caab7b); # u64
&data_word(0x15c9bebc,0x3c9ebe0a); # u64
&data_word(0x9c100d4c,0x431d67c4); # u64
&data_word(0xcb3e42b6,0x4cc5d4be); # u64
&data_word(0xfc657e2a,0x597f299c); # u64
&data_word(0x3ad6faec,0x5fcb6fab); # u64
&data_word(0x4a475817,0x6c44198c); # u64
&function_end_B("sha512_block_data_order");
&asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

File diff suppressed because it is too large

@@ -0,0 +1,583 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.
# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.
# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 25.5 cycles or 47% faster than integer-only code.
# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword* order in
# h[0-7], namely with the most significant dword at the *lower* address,
# which was reflected in the two parameters below as 0 and 4. Now the caller
# is expected to maintain native byte order for whole 64-bit values.
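# For example, the K512 table below is emitted through the WORD64 macro,
# so WORD64(0x428a2f98,0xd728ae22,...) becomes ".word 0xd728ae22,0x428a2f98"
# under __ARMEL__ and ".word 0x428a2f98,0xd728ae22" otherwise; either way
# each constant reads back as one native 64-bit value.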
$hi="HI";
$lo="LO";
# ====================================================================
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0"; # parameter block
$inp="r1";
$len="r2";
$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############ r13 is stack pointer
$Ktbl="r14";
############ r15 is program counter
$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov $t0,$Elo,lsr#14
str $Tlo,[sp,#$Xoff+0]
mov $t1,$Ehi,lsr#14
str $Thi,[sp,#$Xoff+4]
eor $t0,$t0,$Ehi,lsl#18
ldr $t2,[sp,#$Hoff+0] @ h.lo
eor $t1,$t1,$Elo,lsl#18
ldr $t3,[sp,#$Hoff+4] @ h.hi
eor $t0,$t0,$Elo,lsr#18
eor $t1,$t1,$Ehi,lsr#18
eor $t0,$t0,$Ehi,lsl#14
eor $t1,$t1,$Elo,lsl#14
eor $t0,$t0,$Ehi,lsr#9
eor $t1,$t1,$Elo,lsr#9
eor $t0,$t0,$Elo,lsl#23
eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#$Foff+0] @ f.lo
adc $Thi,$Thi,$t1 @ T += Sigma1(e)
ldr $t1,[sp,#$Foff+4] @ f.hi
adds $Tlo,$Tlo,$t2
ldr $t2,[sp,#$Goff+0] @ g.lo
adc $Thi,$Thi,$t3 @ T += h
ldr $t3,[sp,#$Goff+4] @ g.hi
eor $t0,$t0,$t2
str $Elo,[sp,#$Eoff+0]
eor $t1,$t1,$t3
str $Ehi,[sp,#$Eoff+4]
and $t0,$t0,$Elo
str $Alo,[sp,#$Aoff+0]
and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
ldr $t2,[$Ktbl,#$lo] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#$hi] @ K[i].hi
adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
and $t0,$t2,#0xff
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo
ldr $t2,[sp,#$Boff+0] @ b.lo
adc $Ehi,$Ehi,$Thi @ d += T
teq $t0,#$magic
ldr $t3,[sp,#$Coff+0] @ c.lo
orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
mov $t0,$Alo,lsr#28
mov $t1,$Ahi,lsr#28
eor $t0,$t0,$Ahi,lsl#4
eor $t1,$t1,$Alo,lsl#4
eor $t0,$t0,$Ahi,lsr#2
eor $t1,$t1,$Alo,lsr#2
eor $t0,$t0,$Alo,lsl#30
eor $t1,$t1,$Ahi,lsl#30
eor $t0,$t0,$Ahi,lsr#7
eor $t1,$t1,$Alo,lsr#7
eor $t0,$t0,$Alo,lsl#25
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
adds $Tlo,$Tlo,$t0
and $t0,$Alo,$t2
adc $Thi,$Thi,$t1 @ T += Sigma0(a)
ldr $t1,[sp,#$Boff+4] @ b.hi
orr $Alo,$Alo,$t2
ldr $t2,[sp,#$Coff+4] @ c.hi
and $Alo,$Alo,$t3
and $t3,$Ahi,$t1
orr $Ahi,$Ahi,$t1
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $Ahi,$Ahi,$t2
adds $Alo,$Alo,$Tlo
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
sub sp,sp,#8
adc $Ahi,$Ahi,$Thi @ h += T
tst $Ktbl,#1
add $Ktbl,$Ktbl,#8
___
}
$code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#endif
.text
.code 32
.type K512,%object
.align 5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha512_block_data_order
.skip 32-4
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
#if __ARM_ARCH__>=7
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#1
bne .LNEON
#endif
stmdb sp!,{r4-r12,lr}
sub $Ktbl,r3,#672 @ K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
ldr $Ehi,[$ctx,#$Eoff+$hi]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
.Loop:
str $t0, [sp,#$Goff+0]
str $t1, [sp,#$Goff+4]
str $t2, [sp,#$Hoff+0]
str $t3, [sp,#$Hoff+4]
ldr $Alo,[$ctx,#$Aoff+$lo]
ldr $Ahi,[$ctx,#$Aoff+$hi]
ldr $Tlo,[$ctx,#$Boff+$lo]
ldr $Thi,[$ctx,#$Boff+$hi]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
str $Tlo,[sp,#$Boff+0]
str $Thi,[sp,#$Boff+4]
str $t0, [sp,#$Coff+0]
str $t1, [sp,#$Coff+4]
str $t2, [sp,#$Doff+0]
str $t3, [sp,#$Doff+4]
ldr $Tlo,[$ctx,#$Foff+$lo]
ldr $Thi,[$ctx,#$Foff+$hi]
str $Tlo,[sp,#$Foff+0]
str $Thi,[sp,#$Foff+4]
.L00_15:
#if __ARM_ARCH__<7
ldrb $Tlo,[$inp,#7]
ldrb $t0, [$inp,#6]
ldrb $t1, [$inp,#5]
ldrb $t2, [$inp,#4]
ldrb $Thi,[$inp,#3]
ldrb $t3, [$inp,#2]
orr $Tlo,$Tlo,$t0,lsl#8
ldrb $t0, [$inp,#1]
orr $Tlo,$Tlo,$t1,lsl#16
ldrb $t1, [$inp],#8
orr $Tlo,$Tlo,$t2,lsl#24
orr $Thi,$Thi,$t3,lsl#8
orr $Thi,$Thi,$t0,lsl#16
orr $Thi,$Thi,$t1,lsl#24
#else
ldr $Tlo,[$inp,#4]
ldr $Thi,[$inp],#8
#ifdef __ARMEL__
rev $Tlo,$Tlo
rev $Thi,$Thi
#endif
#endif
___
&BODY_00_15(0x94);
$code.=<<___;
tst $Ktbl,#1
beq .L00_15
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
bic $Ktbl,$Ktbl,#1
.L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov $Tlo,$t0,lsr#1
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
mov $Thi,$t1,lsr#1
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
eor $Tlo,$Tlo,$t1,lsl#31
eor $Thi,$Thi,$t0,lsl#31
eor $Tlo,$Tlo,$t0,lsr#8
eor $Thi,$Thi,$t1,lsr#8
eor $Tlo,$Tlo,$t1,lsl#24
eor $Thi,$Thi,$t0,lsl#24
eor $Tlo,$Tlo,$t0,lsr#7
eor $Thi,$Thi,$t1,lsr#7
eor $Tlo,$Tlo,$t1,lsl#25
@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
mov $t0,$t2,lsr#19
mov $t1,$t3,lsr#19
eor $t0,$t0,$t3,lsl#13
eor $t1,$t1,$t2,lsl#13
eor $t0,$t0,$t3,lsr#29
eor $t1,$t1,$t2,lsr#29
eor $t0,$t0,$t2,lsl#3
eor $t1,$t1,$t3,lsl#3
eor $t0,$t0,$t2,lsr#6
eor $t1,$t1,$t3,lsr#6
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
eor $t0,$t0,$t3,lsl#26
ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#`$Xoff+8*16`+0]
adc $Thi,$Thi,$t1
ldr $t1,[sp,#`$Xoff+8*16`+4]
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3
adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1
___
&BODY_00_15(0x17);
$code.=<<___;
ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
bic $Ktbl,$Ktbl,#1
ldr $Tlo,[sp,#$Boff+0]
ldr $Thi,[sp,#$Boff+4]
ldr $t0, [$ctx,#$Aoff+$lo]
ldr $t1, [$ctx,#$Aoff+$hi]
ldr $t2, [$ctx,#$Boff+$lo]
ldr $t3, [$ctx,#$Boff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Aoff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Aoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Boff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Boff+$hi]
ldr $Alo,[sp,#$Coff+0]
ldr $Ahi,[sp,#$Coff+4]
ldr $Tlo,[sp,#$Doff+0]
ldr $Thi,[sp,#$Doff+4]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Coff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Coff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Doff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Doff+$hi]
ldr $Tlo,[sp,#$Foff+0]
ldr $Thi,[sp,#$Foff+4]
ldr $t0, [$ctx,#$Eoff+$lo]
ldr $t1, [$ctx,#$Eoff+$hi]
ldr $t2, [$ctx,#$Foff+$lo]
ldr $t3, [$ctx,#$Foff+$hi]
adds $Elo,$Elo,$t0
str $Elo,[$ctx,#$Eoff+$lo]
adc $Ehi,$Ehi,$t1
str $Ehi,[$ctx,#$Eoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Foff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Foff+$hi]
ldr $Alo,[sp,#$Goff+0]
ldr $Ahi,[sp,#$Goff+4]
ldr $Tlo,[sp,#$Hoff+0]
ldr $Thi,[sp,#$Hoff+4]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Goff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Goff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Hoff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Hoff+$hi]
add sp,sp,#640
sub $Ktbl,$Ktbl,#640
teq $inp,$len
bne .Loop
add sp,sp,#8*9 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
___
{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);
my $Ktbl="r3";
my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
$code.=<<___ if ($i<16 || $i&1);
vshr.u64 $t0,$e,#@Sigma1[0] @ $i
#if $i<16
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
vshr.u64 $t1,$e,#@Sigma1[1]
vshr.u64 $t2,$e,#@Sigma1[2]
___
$code.=<<___;
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
vsli.64 $t0,$e,#`64-@Sigma1[0]`
vsli.64 $t1,$e,#`64-@Sigma1[1]`
vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
vrev64.8 @X[$i],@X[$i]
#endif
vadd.i64 $T1,$K,$h
veor $Ch,$f,$g
veor $t0,$t1
vand $Ch,$e
veor $t0,$t2 @ Sigma1(e)
veor $Ch,$g @ Ch(e,f,g)
vadd.i64 $T1,$t0
vshr.u64 $t0,$a,#@Sigma0[0]
vadd.i64 $T1,$Ch
vshr.u64 $t1,$a,#@Sigma0[1]
vshr.u64 $t2,$a,#@Sigma0[2]
vsli.64 $t0,$a,#`64-@Sigma0[0]`
vsli.64 $t1,$a,#`64-@Sigma0[1]`
vsli.64 $t2,$a,#`64-@Sigma0[2]`
vadd.i64 $T1,@X[$i%16]
vorr $Maj,$a,$c
vand $Ch,$a,$c
veor $h,$t0,$t1
vand $Maj,$b
veor $h,$t2 @ Sigma0(a)
vorr $Maj,$Ch @ Maj(a,b,c)
vadd.i64 $h,$T1
vadd.i64 $d,$T1
vadd.i64 $h,$Maj
___
}
sub NEON_16_79() {
my $i=shift;
if ($i&1) { &NEON_00_15($i,@_); return; }
# 2x-vectorized, therefore runs every 2nd round
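# (the sixteen 64-bit d-registers holding X[] are viewed as eight 128-bit
# q-registers, so one pass computes sigma0/sigma1 for two consecutive
# message words; odd rounds fall through to NEON_00_15 above)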
my @X=map("q$_",(0..7)); # view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
my $e=@_[4]; # $e from NEON_00_15
$i /= 2;
$code.=<<___;
vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
veor $s1,$t0
vshr.u64 $t0,$s0,#@sigma0[0]
veor $s1,$t1 @ sigma1(X[i+14])
vshr.u64 $t1,$s0,#@sigma0[1]
vadd.i64 @X[$i%8],$s1
vshr.u64 $s1,$s0,#@sigma0[2]
vsli.64 $t0,$s0,#`64-@sigma0[0]`
vsli.64 $t1,$s0,#`64-@sigma0[1]`
vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
veor $s1,$t0
vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
vadd.i64 @X[$i%8],$s0
vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
veor $s1,$t1 @ sigma0(X[i+1])
vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
vadd.i64 @X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu neon
.align 4
.LNEON:
dmb @ errata #451034 on early Cortex A8
vstmdb sp!,{d8-d15} @ ABI specification says so
sub $Ktbl,r3,#672 @ K512
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
mov $cnt,#4
.L16_79_neon:
subs $cnt,#1
___
for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bne .L16_79_neon
vldmia $ctx,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vadd.i64 q9,q13
vadd.i64 q10,q14
vadd.i64 q11,q15
vstmia $ctx,{$A-$H} @ save context
teq $inp,$len
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon
vldmia sp!,{d8-d15} @ epilogue
ret @ bx lr
#endif
___
}
$code.=<<___;
.size sha512_block_data_order,.-sha512_block_data_order
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
.comm OPENSSL_armcap_P,4,4
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
print $code;
close STDOUT; # enforce flush

File diff suppressed because it is too large

@@ -0,0 +1,414 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256/512 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
# SHA256-hw SHA256(*) SHA512
# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
# Cortex-A5x n/a n/a n/a
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
# (**) The result is a trade-off: it's possible to improve it by
# 10%, but at the cost of 20% loss on Cortex-A5x.
$flavour=shift;
$output=shift;
open STDOUT,">$output";
if ($output =~ /512/) {
$BITS=512;
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
$reg_t="x";
} else {
$BITS=256;
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$reg_t="w";
}
$func="sha${BITS}_block_data_order";
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
@X=map("$reg_t$_",(3..15,0..2));
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
sub BODY_00_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)&15;
my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
$T0=@X[$i+3] if ($i<11);
$code.=<<___ if ($i<16);
#ifndef __ARMEB__
rev @X[$i],@X[$i] // $i
#endif
___
$code.=<<___ if ($i<13 && ($i&1));
ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
___
$code.=<<___ if ($i==13);
ldp @X[14],@X[15],[$inp]
___
$code.=<<___ if ($i>=14);
ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
___
$code.=<<___ if ($i>0 && $i<16);
add $a,$a,$t1 // h+=Sigma0(a)
___
$code.=<<___ if ($i>=11);
str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
___
# While ARMv8 specifies merged rotate-n-logical operation such as
# 'eor x,y,z,ror#n', it was found to negatively affect performance
# on Apple A7. The reason seems to be that it requires even 'y' to
# be available earlier. This means that such a merged instruction is
# not necessarily the best choice on the critical path... On the other hand
# Cortex-A5x handles merged instructions much better than disjoint
# rotate and logical... See (**) footnote above.
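# Illustrative contrast (registers are arbitrary): the merged form
#	eor	x16,x16,x7,ror#18
# versus the disjoint pair
#	ror	x17,x7,#18
#	eor	x16,x16,x17
# where the rotate no longer has to wait for the first source operand.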
$code.=<<___ if ($i<15);
ror $t0,$e,#$Sigma1[0]
add $h,$h,$t2 // h+=K[i]
eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
and $t1,$f,$e
bic $t2,$g,$e
add $h,$h,@X[$i&15] // h+=X[i]
orr $t1,$t1,$t2 // Ch(e,f,g)
eor $t2,$a,$b // a^b, b^c in next round
eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
ror $T0,$a,#$Sigma0[0]
add $h,$h,$t1 // h+=Ch(e,f,g)
eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
add $h,$h,$t0 // h+=Sigma1(e)
and $t3,$t3,$t2 // (b^c)&=(a^b)
add $d,$d,$h // d+=h
eor $t3,$t3,$b // Maj(a,b,c)
eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
add $h,$h,$t3 // h+=Maj(a,b,c)
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
//add $h,$h,$t1 // h+=Sigma0(a)
___
$code.=<<___ if ($i>=15);
ror $t0,$e,#$Sigma1[0]
add $h,$h,$t2 // h+=K[i]
ror $T1,@X[($j+1)&15],#$sigma0[0]
and $t1,$f,$e
ror $T2,@X[($j+14)&15],#$sigma1[0]
bic $t2,$g,$e
ror $T0,$a,#$Sigma0[0]
add $h,$h,@X[$i&15] // h+=X[i]
eor $t0,$t0,$e,ror#$Sigma1[1]
eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
orr $t1,$t1,$t2 // Ch(e,f,g)
eor $t2,$a,$b // a^b, b^c in next round
eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
eor $T0,$T0,$a,ror#$Sigma0[1]
add $h,$h,$t1 // h+=Ch(e,f,g)
and $t3,$t3,$t2 // (b^c)&=(a^b)
eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
add $h,$h,$t0 // h+=Sigma1(e)
eor $t3,$t3,$b // Maj(a,b,c)
eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
add @X[$j],@X[$j],@X[($j+9)&15]
add $d,$d,$h // d+=h
add $h,$h,$t3 // h+=Maj(a,b,c)
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
add @X[$j],@X[$j],$T1
add $h,$h,$t1 // h+=Sigma0(a)
add @X[$j],@X[$j],$T2
___
($t2,$t3)=($t3,$t2);
}
$code.=<<___;
#include "arm_arch.h"
.text
.globl $func
.type $func,%function
.align 6
$func:
___
$code.=<<___ if ($SZ==4);
ldr x16,.LOPENSSL_armcap_P
adr x17,.LOPENSSL_armcap_P
add x16,x16,x17
ldr w16,[x16]
tst w16,#ARMV8_SHA256
b.ne .Lv8_entry
___
$code.=<<___;
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#4*$SZ
ldp $A,$B,[$ctx] // load context
ldp $C,$D,[$ctx,#2*$SZ]
ldp $E,$F,[$ctx,#4*$SZ]
add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
ldp $G,$H,[$ctx,#6*$SZ]
adr $Ktbl,K$BITS
stp $ctx,$num,[x29,#96]
.Loop:
ldp @X[0],@X[1],[$inp],#2*$SZ
ldr $t2,[$Ktbl],#$SZ // *K++
eor $t3,$B,$C // magic seed
str $inp,[x29,#112]
___
for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=".Loop_16_xx:\n";
for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
cbnz $t2,.Loop_16_xx
ldp $ctx,$num,[x29,#96]
ldr $inp,[x29,#112]
sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
ldp @X[0],@X[1],[$ctx]
ldp @X[2],@X[3],[$ctx,#2*$SZ]
add $inp,$inp,#14*$SZ // advance input pointer
ldp @X[4],@X[5],[$ctx,#4*$SZ]
add $A,$A,@X[0]
ldp @X[6],@X[7],[$ctx,#6*$SZ]
add $B,$B,@X[1]
add $C,$C,@X[2]
add $D,$D,@X[3]
stp $A,$B,[$ctx]
add $E,$E,@X[4]
add $F,$F,@X[5]
stp $C,$D,[$ctx,#2*$SZ]
add $G,$G,@X[6]
add $H,$H,@X[7]
cmp $inp,$num
stp $E,$F,[$ctx,#4*$SZ]
stp $G,$H,[$ctx,#6*$SZ]
b.ne .Loop
ldp x19,x20,[x29,#16]
add sp,sp,#4*$SZ
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
ret
.size $func,.-$func
.align 6
.type K$BITS,%object
K$BITS:
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0 //terminator
___
$code.=<<___;
.size K$BITS,.-K$BITS
.align 3
.LOPENSSL_armcap_P:
.quad OPENSSL_armcap_P-.
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
if ($SZ==4) {
my $Ktbl="x3";
my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
$code.=<<___;
.type sha256_block_armv8,%function
.align 6
sha256_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1.32 {$ABCD,$EFGH},[$ctx]
adr $Ktbl,K256
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
ld1.32 {$W0},[$Ktbl],#16
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
rev32 @MSG[2],@MSG[2]
rev32 @MSG[3],@MSG[3]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
orr $EFGH_SAVE,$EFGH,$EFGH
___
for($i=0;$i<12;$i++) {
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
ld1.32 {$W0},[$Ktbl],#16
add.i32 $W1,$W1,@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
ld1.32 {$W1},[$Ktbl]
add.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
add.i32 $W1,$W1,@MSG[3]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
add.i32 $ABCD,$ABCD,$ABCD_SAVE
add.i32 $EFGH,$EFGH,$EFGH_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD,$EFGH},[$ctx]
ldr x29,[sp],#16
ret
.size sha256_block_armv8,.-sha256_block_armv8
___
}
$code.=<<___;
.comm OPENSSL_armcap_P,4,4
___
{ my %opcode = (
"sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
"sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
sub unsha256 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
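# Hedged usage note for the encoder above (an illustrative call, not
# executed here): unsha256("sha256h","q0,q1,v2.4s") captures register
# numbers 0, 1 and 2, so it returns
#   .inst   0x5e024020  //sha256h q0,q1,v2.4s
# i.e. 0x5e004000 | 0 | (1<<5) | (2<<16); the post-processing loop
# below feeds every sha256* line through it.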
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
s/\.\w?32\b//o and s/\.16b/\.4s/go;
m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
print $_,"\n";
}
close STDOUT;


@@ -0,0 +1,672 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256/512_Transform for Itanium.
#
# sha512_block runs in 1003 cycles on Itanium 2, which is almost 50%
# faster than gcc and >60%(!) faster than code generated by the HP-UX
# compiler (yes, HP-UX generates slower code because, unlike gcc, it
# failed to deploy the "shift right pair" 'shrp' instruction, which
# substitutes for a 64-bit rotate).
#
# The 924-cycle sha256_block outperforms gcc by over a factor of 2(!)
# and the HP-UX compiler by >40% (yes, gcc won sha512_block, but lost
# this one big time). Note that "formally" 924 is about 100 cycles
# too much: it's 64 32-bit rounds vs. 80 virtually identical 64-bit
# ones, and 1003*64/80 gives 802. The extra cycles, 2 per round, are
# spent on extra work to provide for 32-bit rotations. 32-bit
# rotations are still handled by the 'shrp' instruction, and for this
# reason the lower 32 bits are deposited into the upper half of a
# 64-bit register prior to issuing 'shrp'. In order to minimize the
# amount of such operations, X[16] values are *maintained* with copies
# of their lower halves in the upper halves, which is why you'll spot
# such instructions as 'mux2', the "parallel 32-bit add" 'padd4' and
# the "parallel 32-bit unsigned right shift" 'pshr4.u' here.
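#
# A hedged illustration in plain Perl (not part of the generator) of
# the trick just described: once the lower 32 bits are replicated into
# the upper half (what 'mux2 ...,0x44' achieves), a plain 64-bit right
# shift acts as a 32-bit rotate.
sub demo_rotr32 {
    my ($x,$n) = @_;                    # 0 <= $n < 32
    my $wide = (($x<<32) | $x) & 0xffffffffffffffff;
    return ($wide>>$n) & 0xffffffff;    # == 32-bit ror($x,$n)
}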
#
# Rules of engagement.
#
# There is only one integer shifter meaning that if I have two rotate,
# deposit or extract instructions in adjacent bundles, they shall
# split [at run-time if they have to]. But note that variable and
# parallel shifts are performed by the multi-media ALU and *are*
# pairable with rotates [and alike]. On the downside, the MMALU is
# rather slow: it takes 2 extra cycles before the result of an
# integer operation is available *to* the MMALU, and 2(*) extra
# cycles before the result of an MM operation is available "back"
# *to* the integer ALU, not to mention that the MMALU itself has a
# 2-cycle latency. However! I explicitly scheduled these MM
# instructions to avoid MM stalls, so that all these extra latencies
# get "hidden" in instruction-level parallelism.
#
# (*) 2 cycles on Itanium 1 and 1 cycle on Itanium 2. But I schedule
#     for 2 in order to provide for the best *overall* performance,
#     because on Itanium 1 a stall on an MM result is accompanied by
#     a pipeline flush, which takes 6 cycles:-(
#
# Resulting performance numbers for 900MHz Itanium 2 system:
#
# The 'numbers' are in 1000s of bytes per second processed.
# type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes
# sha1(*) 6210.14k 20376.30k 52447.83k 85870.05k 105478.12k
# sha256 7476.45k 20572.05k 41538.34k 56062.29k 62093.18k
# sha512 4996.56k 20026.28k 47597.20k 85278.79k 111501.31k
#
# (*) SHA1 numbers are for the HP-UX compiler and are presented purely
#     for reference purposes. I bet it can be improved too...
#
# To generate code, pass an output file name containing either 256 or
# 512, followed by compiler flags.
$output=shift;
if ($output =~ /512.*\.(s|asm)/) {
$SZ=8;
$BITS=8*$SZ;
$LDW="ld8";
$STW="st8";
$ADD="add";
$SHRU="shr.u";
$TABLE="K512";
$func="sha512_block_data_order";
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
} elsif ($output =~ /256.*\.(s|asm)/) {
$SZ=4;
$BITS=8*$SZ;
$LDW="ld4";
$STW="st4";
$ADD="padd4";
$SHRU="pshr4.u";
$TABLE="K256";
$func="sha256_block_data_order";
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
} else { die "nonsense $output"; }
open STDOUT,">$output" or die "can't open $output: $!";
if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/(\+DD|\-mlp)64/); }
} else { $ADDP="add"; }
for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
$big_endian=0 if (/\-DL_ENDIAN/); }
if (!defined($big_endian))
{ $big_endian=(unpack('L',pack('N',1))==1); }
$code=<<___;
.ident \"$output, version 1.1\"
.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
.explicit
.text
pfssave=r2;
lcsave=r3;
prsave=r14;
K=r15;
A=r16; B=r17; C=r18; D=r19;
E=r20; F=r21; G=r22; H=r23;
T1=r24; T2=r25;
s0=r26; s1=r27; t0=r28; t1=r29;
Ktbl=r30;
ctx=r31; // 1st arg
input=r48; // 2nd arg
num=r49; // 3rd arg
sgm0=r50; sgm1=r51; // small constants
A_=r54; B_=r55; C_=r56; D_=r57;
E_=r58; F_=r59; G_=r60; H_=r61;
// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host])
.global $func#
.proc $func#
.align 32
$func:
.prologue
.save ar.pfs,pfssave
{ .mmi; alloc pfssave=ar.pfs,3,27,0,16
$ADDP ctx=0,r32 // 1st arg
.save ar.lc,lcsave
mov lcsave=ar.lc }
{ .mmi; $ADDP input=0,r33 // 2nd arg
mov num=r34 // 3rd arg
.save pr,prsave
mov prsave=pr };;
.body
{ .mib; add r8=0*$SZ,ctx
add r9=1*$SZ,ctx
brp.loop.imp .L_first16,.L_first16_end-16 }
{ .mib; add r10=2*$SZ,ctx
add r11=3*$SZ,ctx
brp.loop.imp .L_rest,.L_rest_end-16 };;
// load A-H
.Lpic_point:
{ .mmi; $LDW A_=[r8],4*$SZ
$LDW B_=[r9],4*$SZ
mov Ktbl=ip }
{ .mmi; $LDW C_=[r10],4*$SZ
$LDW D_=[r11],4*$SZ
mov sgm0=$sigma0[2] };;
{ .mmi; $LDW E_=[r8]
$LDW F_=[r9]
add Ktbl=($TABLE#-.Lpic_point),Ktbl }
{ .mmi; $LDW G_=[r10]
$LDW H_=[r11]
cmp.ne p0,p16=0,r0 };; // used in sha256_block
___
$code.=<<___ if ($BITS==64);
{ .mii; and r8=7,input
and input=~7,input;;
cmp.eq p9,p0=1,r8 }
{ .mmi; cmp.eq p10,p0=2,r8
cmp.eq p11,p0=3,r8
cmp.eq p12,p0=4,r8 }
{ .mmi; cmp.eq p13,p0=5,r8
cmp.eq p14,p0=6,r8
cmp.eq p15,p0=7,r8 };;
___
$code.=<<___;
.L_outer:
.rotr X[16]
{ .mmi; mov A=A_
mov B=B_
mov ar.lc=14 }
{ .mmi; mov C=C_
mov D=D_
mov E=E_ }
{ .mmi; mov F=F_
mov G=G_
mov ar.ec=2 }
{ .mmi; ld1 X[15]=[input],$SZ // eliminated in 64-bit
mov H=H_
mov sgm1=$sigma1[2] };;
___
$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
.align 32
.L_first16:
{ .mmi; add r9=1-$SZ,input
add r10=2-$SZ,input
add r11=3-$SZ,input };;
{ .mmi; ld1 r9=[r9]
ld1 r10=[r10]
dep.z $t1=E,32,32 }
{ .mmi; $LDW K=[Ktbl],$SZ
ld1 r11=[r11]
zxt4 E=E };;
{ .mii; or $t1=$t1,E
dep X[15]=X[15],r9,8,8
dep r11=r10,r11,8,8 };;
{ .mmi; and T1=F,E
and T2=A,B
dep X[15]=X[15],r11,16,16 }
{ .mmi; andcm r8=G,E
and r9=A,C
mux2 $t0=A,0x44 };; // copy lower half to upper
{ .mmi; (p16) ld1 X[15-1]=[input],$SZ // prefetch
xor T1=T1,r8 // T1=((e & f) ^ (~e & g))
_rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
{ .mib; and r10=B,C
xor T2=T2,r9 };;
___
$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
// in 64-bit mode I load the whole X[16] at once and take care of alignment...
{ .mmi; add r8=1*$SZ,input
add r9=2*$SZ,input
add r10=3*$SZ,input };;
{ .mmb; $LDW X[15]=[input],4*$SZ
$LDW X[14]=[r8],4*$SZ
(p9) br.cond.dpnt.many .L1byte };;
{ .mmb; $LDW X[13]=[r9],4*$SZ
$LDW X[12]=[r10],4*$SZ
(p10) br.cond.dpnt.many .L2byte };;
{ .mmb; $LDW X[11]=[input],4*$SZ
$LDW X[10]=[r8],4*$SZ
(p11) br.cond.dpnt.many .L3byte };;
{ .mmb; $LDW X[ 9]=[r9],4*$SZ
$LDW X[ 8]=[r10],4*$SZ
(p12) br.cond.dpnt.many .L4byte };;
{ .mmb; $LDW X[ 7]=[input],4*$SZ
$LDW X[ 6]=[r8],4*$SZ
(p13) br.cond.dpnt.many .L5byte };;
{ .mmb; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
(p14) br.cond.dpnt.many .L6byte };;
{ .mmb; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
(p15) br.cond.dpnt.many .L7byte };;
{ .mmb; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
br.many .L_first16 };;
.L1byte:
{ .mmi; $LDW X[13]=[r9],4*$SZ
$LDW X[12]=[r10],4*$SZ
shrp X[15]=X[15],X[14],56 };;
{ .mmi; $LDW X[11]=[input],4*$SZ
$LDW X[10]=[r8],4*$SZ
shrp X[14]=X[14],X[13],56 }
{ .mmi; $LDW X[ 9]=[r9],4*$SZ
$LDW X[ 8]=[r10],4*$SZ
shrp X[13]=X[13],X[12],56 };;
{ .mmi; $LDW X[ 7]=[input],4*$SZ
$LDW X[ 6]=[r8],4*$SZ
shrp X[12]=X[12],X[11],56 }
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
shrp X[11]=X[11],X[10],56 };;
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[10]=X[10],X[ 9],56 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[ 9]=X[ 9],X[ 8],56 };;
{ .mii; $LDW T1=[input]
shrp X[ 8]=X[ 8],X[ 7],56
shrp X[ 7]=X[ 7],X[ 6],56 }
{ .mii; shrp X[ 6]=X[ 6],X[ 5],56
shrp X[ 5]=X[ 5],X[ 4],56 };;
{ .mii; shrp X[ 4]=X[ 4],X[ 3],56
shrp X[ 3]=X[ 3],X[ 2],56 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],56
shrp X[ 1]=X[ 1],X[ 0],56 }
{ .mib; shrp X[ 0]=X[ 0],T1,56
br.many .L_first16 };;
.L2byte:
{ .mmi; $LDW X[11]=[input],4*$SZ
$LDW X[10]=[r8],4*$SZ
shrp X[15]=X[15],X[14],48 }
{ .mmi; $LDW X[ 9]=[r9],4*$SZ
$LDW X[ 8]=[r10],4*$SZ
shrp X[14]=X[14],X[13],48 };;
{ .mmi; $LDW X[ 7]=[input],4*$SZ
$LDW X[ 6]=[r8],4*$SZ
shrp X[13]=X[13],X[12],48 }
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
shrp X[12]=X[12],X[11],48 };;
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[11]=X[11],X[10],48 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[10]=X[10],X[ 9],48 };;
{ .mii; $LDW T1=[input]
shrp X[ 9]=X[ 9],X[ 8],48
shrp X[ 8]=X[ 8],X[ 7],48 }
{ .mii; shrp X[ 7]=X[ 7],X[ 6],48
shrp X[ 6]=X[ 6],X[ 5],48 };;
{ .mii; shrp X[ 5]=X[ 5],X[ 4],48
shrp X[ 4]=X[ 4],X[ 3],48 }
{ .mii; shrp X[ 3]=X[ 3],X[ 2],48
shrp X[ 2]=X[ 2],X[ 1],48 }
{ .mii; shrp X[ 1]=X[ 1],X[ 0],48
shrp X[ 0]=X[ 0],T1,48 }
{ .mfb; br.many .L_first16 };;
.L3byte:
{ .mmi; $LDW X[ 9]=[r9],4*$SZ
$LDW X[ 8]=[r10],4*$SZ
shrp X[15]=X[15],X[14],40 };;
{ .mmi; $LDW X[ 7]=[input],4*$SZ
$LDW X[ 6]=[r8],4*$SZ
shrp X[14]=X[14],X[13],40 }
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
shrp X[13]=X[13],X[12],40 };;
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[12]=X[12],X[11],40 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[11]=X[11],X[10],40 };;
{ .mii; $LDW T1=[input]
shrp X[10]=X[10],X[ 9],40
shrp X[ 9]=X[ 9],X[ 8],40 }
{ .mii; shrp X[ 8]=X[ 8],X[ 7],40
shrp X[ 7]=X[ 7],X[ 6],40 };;
{ .mii; shrp X[ 6]=X[ 6],X[ 5],40
shrp X[ 5]=X[ 5],X[ 4],40 }
{ .mii; shrp X[ 4]=X[ 4],X[ 3],40
shrp X[ 3]=X[ 3],X[ 2],40 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],40
shrp X[ 1]=X[ 1],X[ 0],40 }
{ .mib; shrp X[ 0]=X[ 0],T1,40
br.many .L_first16 };;
.L4byte:
{ .mmi; $LDW X[ 7]=[input],4*$SZ
$LDW X[ 6]=[r8],4*$SZ
shrp X[15]=X[15],X[14],32 }
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
shrp X[14]=X[14],X[13],32 };;
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[13]=X[13],X[12],32 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[12]=X[12],X[11],32 };;
{ .mii; $LDW T1=[input]
shrp X[11]=X[11],X[10],32
shrp X[10]=X[10],X[ 9],32 }
{ .mii; shrp X[ 9]=X[ 9],X[ 8],32
shrp X[ 8]=X[ 8],X[ 7],32 };;
{ .mii; shrp X[ 7]=X[ 7],X[ 6],32
shrp X[ 6]=X[ 6],X[ 5],32 }
{ .mii; shrp X[ 5]=X[ 5],X[ 4],32
shrp X[ 4]=X[ 4],X[ 3],32 }
{ .mii; shrp X[ 3]=X[ 3],X[ 2],32
shrp X[ 2]=X[ 2],X[ 1],32 }
{ .mii; shrp X[ 1]=X[ 1],X[ 0],32
shrp X[ 0]=X[ 0],T1,32 }
{ .mfb; br.many .L_first16 };;
.L5byte:
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
shrp X[15]=X[15],X[14],24 };;
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[14]=X[14],X[13],24 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[13]=X[13],X[12],24 };;
{ .mii; $LDW T1=[input]
shrp X[12]=X[12],X[11],24
shrp X[11]=X[11],X[10],24 }
{ .mii; shrp X[10]=X[10],X[ 9],24
shrp X[ 9]=X[ 9],X[ 8],24 };;
{ .mii; shrp X[ 8]=X[ 8],X[ 7],24
shrp X[ 7]=X[ 7],X[ 6],24 }
{ .mii; shrp X[ 6]=X[ 6],X[ 5],24
shrp X[ 5]=X[ 5],X[ 4],24 }
{ .mii; shrp X[ 4]=X[ 4],X[ 3],24
shrp X[ 3]=X[ 3],X[ 2],24 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],24
shrp X[ 1]=X[ 1],X[ 0],24 }
{ .mib; shrp X[ 0]=X[ 0],T1,24
br.many .L_first16 };;
.L6byte:
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[15]=X[15],X[14],16 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[14]=X[14],X[13],16 };;
{ .mii; $LDW T1=[input]
shrp X[13]=X[13],X[12],16
shrp X[12]=X[12],X[11],16 }
{ .mii; shrp X[11]=X[11],X[10],16
shrp X[10]=X[10],X[ 9],16 };;
{ .mii; shrp X[ 9]=X[ 9],X[ 8],16
shrp X[ 8]=X[ 8],X[ 7],16 }
{ .mii; shrp X[ 7]=X[ 7],X[ 6],16
shrp X[ 6]=X[ 6],X[ 5],16 }
{ .mii; shrp X[ 5]=X[ 5],X[ 4],16
shrp X[ 4]=X[ 4],X[ 3],16 }
{ .mii; shrp X[ 3]=X[ 3],X[ 2],16
shrp X[ 2]=X[ 2],X[ 1],16 }
{ .mii; shrp X[ 1]=X[ 1],X[ 0],16
shrp X[ 0]=X[ 0],T1,16 }
{ .mfb; br.many .L_first16 };;
.L7byte:
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[15]=X[15],X[14],8 };;
{ .mii; $LDW T1=[input]
shrp X[14]=X[14],X[13],8
shrp X[13]=X[13],X[12],8 }
{ .mii; shrp X[12]=X[12],X[11],8
shrp X[11]=X[11],X[10],8 };;
{ .mii; shrp X[10]=X[10],X[ 9],8
shrp X[ 9]=X[ 9],X[ 8],8 }
{ .mii; shrp X[ 8]=X[ 8],X[ 7],8
shrp X[ 7]=X[ 7],X[ 6],8 }
{ .mii; shrp X[ 6]=X[ 6],X[ 5],8
shrp X[ 5]=X[ 5],X[ 4],8 }
{ .mii; shrp X[ 4]=X[ 4],X[ 3],8
shrp X[ 3]=X[ 3],X[ 2],8 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],8
shrp X[ 1]=X[ 1],X[ 0],8 }
{ .mib; shrp X[ 0]=X[ 0],T1,8
br.many .L_first16 };;
.align 32
.L_first16:
{ .mmi; $LDW K=[Ktbl],$SZ
and T1=F,E
and T2=A,B }
{ .mmi; //$LDW X[15]=[input],$SZ // X[i]=*input++
andcm r8=G,E
and r9=A,C };;
{ .mmi; xor T1=T1,r8 //T1=((e & f) ^ (~e & g))
and r10=B,C
_rotr r11=$t1,$Sigma1[0] } // ROTR(e,14)
{ .mmi; xor T2=T2,r9
mux1 X[15]=X[15],\@rev };; // eliminated in big-endian
___
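# A hedged plain-Perl model (not used by the generator) of the
# .L1byte-.L7byte realignment above: with big-endian word order an
# unaligned 64-bit value straddling two aligned loads is recovered by
# one funnel shift ('shrp') per word; the regex pass at the bottom of
# this file swaps the operands (and complements the count) for
# little-endian builds.
sub unaligned64_ref {
    my ($w0,$w1,$mis) = @_;             # consecutive words, $mis in 1..7
    return (($w0 << 8*$mis) | ($w1 >> (64-8*$mis)))
           & 0xffffffffffffffff;
}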
$code.=<<___;
{ .mib; add T1=T1,H // T1=Ch(e,f,g)+h
_rotr r8=$t1,$Sigma1[1] } // ROTR(e,18)
{ .mib; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c))
mov H=G };;
{ .mib; xor r11=r8,r11
_rotr r9=$t1,$Sigma1[2] } // ROTR(e,41)
{ .mib; mov G=F
mov F=E };;
{ .mib; xor r9=r9,r11 // r9=Sigma1(e)
_rotr r10=$t0,$Sigma0[0] } // ROTR(a,28)
{ .mib; add T1=T1,K // T1=Ch(e,f,g)+h+K512[i]
mov E=D };;
{ .mib; add T1=T1,r9 // T1+=Sigma1(e)
_rotr r11=$t0,$Sigma0[1] } // ROTR(a,34)
{ .mib; mov D=C
mov C=B };;
{ .mib; add T1=T1,X[15] // T1+=X[i]
_rotr r8=$t0,$Sigma0[2] } // ROTR(a,39)
{ .mib; xor r10=r10,r11
mux2 X[15]=X[15],0x44 };; // eliminated in 64-bit
{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
mov B=A
add A=T1,T2 };;
{ .mib; add E=E,T1
add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
br.ctop.sptk .L_first16 };;
.L_first16_end:
{ .mii; mov ar.lc=$rounds-17
mov ar.ec=1 };;
.align 32
.L_rest:
.rotr X[16]
{ .mib; $LDW K=[Ktbl],$SZ
_rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1)
{ .mib; $ADD X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF]
$SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7
{ .mib; and T1=F,E
_rotr r9=X[15-1],$sigma0[1] } // ROTR(s0,8)
{ .mib; andcm r10=G,E
$SHRU s1=X[15-14],sgm1 };; // s1=X[(i+14)&0xF]>>6
{ .mmi; xor T1=T1,r10 // T1=((e & f) ^ (~e & g))
xor r9=r8,r9
_rotr r10=X[15-14],$sigma1[0] };;// ROTR(s1,19)
{ .mib; and T2=A,B
_rotr r11=X[15-14],$sigma1[1] }// ROTR(s1,61)
{ .mib; and r8=A,C };;
___
$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
// I adhere to mmi; in order to hold Itanium 1 back and avoid a 6-cycle
// pipeline flush in the last bundle. Note that even on Itanium 2 the
// latter stalls for one clock cycle...
{ .mmi; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
dep.z $t1=E,32,32 }
{ .mmi; xor r10=r11,r10
zxt4 E=E };;
{ .mmi; or $t1=$t1,E
xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
mux2 $t0=A,0x44 };; // copy lower half to upper
{ .mmi; xor T2=T2,r8
_rotr r9=$t1,$Sigma1[0] } // ROTR(e,14)
{ .mmi; and r10=B,C
add T1=T1,H // T1=Ch(e,f,g)+h
$ADD X[15]=X[15],s0 };; // X[i&0xF]+=sigma0(X[(i+1)&0xF])
___
$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
{ .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
_rotr r9=$t1,$Sigma1[0] } // ROTR(e,14)
{ .mib; xor r10=r11,r10
xor T2=T2,r8 };;
{ .mib; xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
add T1=T1,H }
{ .mib; and r10=B,C
$ADD X[15]=X[15],s0 };; // X[i&0xF]+=sigma0(X[(i+1)&0xF])
___
$code.=<<___;
{ .mmi; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c))
mov H=G
_rotr r8=$t1,$Sigma1[1] };; // ROTR(e,18)
{ .mmi; xor r11=r8,r9
$ADD X[15]=X[15],s1 // X[i&0xF]+=sigma1(X[(i+14)&0xF])
_rotr r9=$t1,$Sigma1[2] } // ROTR(e,41)
{ .mmi; mov G=F
mov F=E };;
{ .mib; xor r9=r9,r11 // r9=Sigma1(e)
_rotr r10=$t0,$Sigma0[0] } // ROTR(a,28)
{ .mib; add T1=T1,K // T1=Ch(e,f,g)+h+K512[i]
mov E=D };;
{ .mib; add T1=T1,r9 // T1+=Sigma1(e)
_rotr r11=$t0,$Sigma0[1] } // ROTR(a,34)
{ .mib; mov D=C
mov C=B };;
{ .mmi; add T1=T1,X[15] // T1+=X[i]
xor r10=r10,r11
_rotr r8=$t0,$Sigma0[2] };; // ROTR(a,39)
{ .mmi; xor r10=r8,r10 // r10=Sigma0(a)
mov B=A
add A=T1,T2 };;
{ .mib; add E=E,T1
add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a)
br.ctop.sptk .L_rest };;
.L_rest_end:
{ .mmi; add A_=A_,A
add B_=B_,B
add C_=C_,C }
{ .mmi; add D_=D_,D
add E_=E_,E
cmp.ltu p16,p0=1,num };;
{ .mmi; add F_=F_,F
add G_=G_,G
add H_=H_,H }
{ .mmb; add Ktbl=-$SZ*$rounds,Ktbl
(p16) add num=-1,num
(p16) br.dptk.many .L_outer };;
{ .mib; add r8=0*$SZ,ctx
add r9=1*$SZ,ctx }
{ .mib; add r10=2*$SZ,ctx
add r11=3*$SZ,ctx };;
{ .mmi; $STW [r8]=A_,4*$SZ
$STW [r9]=B_,4*$SZ
mov ar.lc=lcsave }
{ .mmi; $STW [r10]=C_,4*$SZ
$STW [r11]=D_,4*$SZ
mov pr=prsave,0x1ffff };;
{ .mmb; $STW [r8]=E_
$STW [r9]=F_ }
{ .mmb; $STW [r10]=G_
$STW [r11]=H_
br.ret.sptk.many b0 };;
.endp $func#
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
if ($BITS==64) {
$code =~ s/mux2(\s+)\S+/nop.i$1 0x0/gm;
$code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian);
$code =~ s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm
if (!$big_endian);
$code =~ s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm;
}
print $code;
print<<___ if ($BITS==32);
.align 64
.type K256#,\@object
K256: data4 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
data4 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
data4 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
data4 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
data4 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
data4 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
data4 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
data4 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
data4 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
data4 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
data4 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
data4 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
data4 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
data4 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
data4 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
data4 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256#,$SZ*$rounds
stringz "SHA256 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___
print<<___ if ($BITS==64);
.align 64
.type K512#,\@object
K512: data8 0x428a2f98d728ae22,0x7137449123ef65cd
data8 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
data8 0x3956c25bf348b538,0x59f111f1b605d019
data8 0x923f82a4af194f9b,0xab1c5ed5da6d8118
data8 0xd807aa98a3030242,0x12835b0145706fbe
data8 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
data8 0x72be5d74f27b896f,0x80deb1fe3b1696b1
data8 0x9bdc06a725c71235,0xc19bf174cf692694
data8 0xe49b69c19ef14ad2,0xefbe4786384f25e3
data8 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
data8 0x2de92c6f592b0275,0x4a7484aa6ea6e483
data8 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
data8 0x983e5152ee66dfab,0xa831c66d2db43210
data8 0xb00327c898fb213f,0xbf597fc7beef0ee4
data8 0xc6e00bf33da88fc2,0xd5a79147930aa725
data8 0x06ca6351e003826f,0x142929670a0e6e70
data8 0x27b70a8546d22ffc,0x2e1b21385c26c926
data8 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
data8 0x650a73548baf63de,0x766a0abb3c77b2a8
data8 0x81c2c92e47edaee6,0x92722c851482353b
data8 0xa2bfe8a14cf10364,0xa81a664bbc423001
data8 0xc24b8b70d0f89791,0xc76c51a30654be30
data8 0xd192e819d6ef5218,0xd69906245565a910
data8 0xf40e35855771202a,0x106aa07032bbd1b8
data8 0x19a4c116b8d2d0c8,0x1e376c085141ab53
data8 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
data8 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
data8 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
data8 0x748f82ee5defb2fc,0x78a5636f43172f60
data8 0x84c87814a1f0ab72,0x8cc702081a6439ec
data8 0x90befffa23631e28,0xa4506cebde82bde9
data8 0xbef9a3f7b2c67915,0xc67178f2e372532b
data8 0xca273eceea26619c,0xd186b8c721c0c207
data8 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
data8 0x06f067aa72176fba,0x0a637dc5a2c898a6
data8 0x113f9804bef90dae,0x1b710b35131c471b
data8 0x28db77f523047d84,0x32caab7b40c72493
data8 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
data8 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
data8 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.size K512#,$SZ*$rounds
stringz "SHA512 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___


@@ -0,0 +1,455 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA2 block procedures for MIPS.
# October 2010.
#
# SHA256 performance improvement on a MIPS R5000 CPU is ~27% over gcc-
# generated code in an o32 build and ~55% in n32/64 builds. SHA512
# [which for now can only be compiled for the MIPS64 ISA] improvement
# is a modest ~17%, but it comes for free, because it's the same
# instruction sequence. Improvement coefficients are for aligned input.
######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. The following coding rules
# facilitate interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
# excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
# old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
$PTR_SUB="dsub"; # incidentally works even on n32
$REG_S="sd";
$REG_L="ld";
$PTR_SLL="dsll"; # incidentally works even on n32
$SZREG=8;
} else {
$PTR_ADD="add";
$PTR_SUB="sub";
$REG_S="sw";
$REG_L="lw";
$PTR_SLL="sll";
$SZREG=4;
}
$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
#
# <appro@openssl.org>
#
######################################################################
$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
open STDOUT,">$output" or die "can't open $output: $!";
if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
if ($output =~ /512/) {
$label="512";
$SZ=8;
$LD="ld"; # load from memory
$ST="sd"; # store to memory
$SLL="dsll"; # shift left logical
$SRL="dsrl"; # shift right logical
$ADDU="daddu";
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=( 7, 1, 8); # right shift first
@sigma1=( 6,19,61); # right shift first
$lastK=0x817;
$rounds=80;
} else {
$label="256";
$SZ=4;
$LD="lw"; # load from memory
$ST="sw"; # store to memory
$SLL="sll"; # shift left logical
$SRL="srl"; # shift right logical
$ADDU="addu";
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 3, 7,18); # right shift first
@sigma1=(10,17,19); # right shift first
$lastK=0x8f2;
$rounds=64;
}
$MSB = $big_endian ? 0 : ($SZ-1);
$LSB = ($SZ-1)&~$MSB;
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
@X=map("\$$_",(8..23));
$ctx=$a0;
$inp=$a1;
$len=$a2; $Ktbl=$len;
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);
$code.=<<___ if ($i<15);
${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp)
${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp)
___
$code.=<<___ if (!$big_endian && $i<16 && $SZ==4);
srl $tmp0,@X[0],24 # byte swap($i)
srl $tmp1,@X[0],8
andi $tmp2,@X[0],0xFF00
sll @X[0],@X[0],24
andi $tmp1,0xFF00
sll $tmp2,$tmp2,8
or @X[0],$tmp0
or $tmp1,$tmp2
or @X[0],$tmp1
___
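# Hedged aside (a standalone helper, never called by the generator):
# plain-Perl equivalent of the shift/mask byte swap emitted above for
# the 32-bit case.
sub bswap32_ref {
    my $x = shift;
    return (($x>>24) & 0x000000ff) |
           (($x>>8)  & 0x0000ff00) |
           (($x<<8)  & 0x00ff0000) |
           (($x<<24) & 0xff000000);
}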
$code.=<<___ if (!$big_endian && $i<16 && $SZ==8);
ori $tmp0,$zero,0xFF
dsll $tmp2,$tmp0,32
or $tmp0,$tmp2 # 0x000000FF000000FF
and $tmp1,@X[0],$tmp0 # byte swap($i)
dsrl $tmp2,@X[0],24
dsll $tmp1,24
and $tmp2,$tmp0
dsll $tmp0,8 # 0x0000FF000000FF00
or $tmp1,$tmp2
and $tmp2,@X[0],$tmp0
dsrl @X[0],8
dsll $tmp2,8
and @X[0],$tmp0
or $tmp1,$tmp2
or @X[0],$tmp1
dsrl $tmp1,@X[0],32
dsll @X[0],32
or @X[0],$tmp1
___
$code.=<<___;
$ADDU $T1,$X[0],$h # $i
$SRL $h,$e,@Sigma1[0]
xor $tmp2,$f,$g
$SLL $tmp1,$e,`$SZ*8-@Sigma1[2]`
and $tmp2,$e
$SRL $tmp0,$e,@Sigma1[1]
xor $h,$tmp1
$SLL $tmp1,$e,`$SZ*8-@Sigma1[1]`
xor $h,$tmp0
$SRL $tmp0,$e,@Sigma1[2]
xor $h,$tmp1
$SLL $tmp1,$e,`$SZ*8-@Sigma1[0]`
xor $h,$tmp0
xor $tmp2,$g # Ch(e,f,g)
xor $tmp0,$tmp1,$h # Sigma1(e)
$SRL $h,$a,@Sigma0[0]
$ADDU $T1,$tmp2
$LD $tmp2,`$i*$SZ`($Ktbl) # K[$i]
$SLL $tmp1,$a,`$SZ*8-@Sigma0[2]`
$ADDU $T1,$tmp0
$SRL $tmp0,$a,@Sigma0[1]
xor $h,$tmp1
$SLL $tmp1,$a,`$SZ*8-@Sigma0[1]`
xor $h,$tmp0
$SRL $tmp0,$a,@Sigma0[2]
xor $h,$tmp1
$SLL $tmp1,$a,`$SZ*8-@Sigma0[0]`
xor $h,$tmp0
$ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer
xor $h,$tmp1 # Sigma0(a)
or $tmp0,$a,$b
and $tmp1,$a,$b
and $tmp0,$c
or $tmp1,$tmp0 # Maj(a,b,c)
$ADDU $T1,$tmp2 # +=K[$i]
$ADDU $h,$tmp1
$ADDU $d,$T1
$ADDU $h,$T1
___
$code.=<<___ if ($i>=13);
$LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer
___
}
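# A hedged reference sketch (defined but never called) of the scalar
# round BODY_00_15 emits, written out for the 32-bit SHA-256 case in
# the standard notation; note the emitted code computes Maj(a,b,c) as
# the equivalent (a&b)|((a|b)&c) and Ch(e,f,g) as ((f^g)&e)^g.
sub round_ref {
    my ($w,$k,$a,$b,$c,$d,$e,$f,$g,$h) = @_;    # 32-bit inputs
    my $rotr = sub { my ($x,$n)=@_;
        (($x>>$n) | ($x<<(32-$n))) & 0xffffffff };
    my $S1  = $rotr->($e,6) ^ $rotr->($e,11) ^ $rotr->($e,25);
    my $ch  = ($e & $f) ^ (~$e & $g);           # Ch(e,f,g)
    my $S0  = $rotr->($a,2) ^ $rotr->($a,13) ^ $rotr->($a,22);
    my $maj = ($a & $b) ^ ($a & $c) ^ ($b & $c);# Maj(a,b,c)
    my $T1  = ($h + $S1 + $ch + $k + $w) & 0xffffffff;
    my $T2  = ($S0 + $maj) & 0xffffffff;
    return (($T1+$T2) & 0xffffffff, $a, $b, $c, # new a..d
            ($d+$T1) & 0xffffffff, $e, $f, $g); # new e..h
}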
sub BODY_16_XX {
my $i=@_[0];
my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);
$code.=<<___;
$SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i)
$ADDU @X[0],@X[9] # +=X[i+9]
$SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]`
$SRL $tmp0,@X[1],@sigma0[1]
xor $tmp2,$tmp1
$SLL $tmp1,`@sigma0[2]-@sigma0[1]`
xor $tmp2,$tmp0
$SRL $tmp0,@X[1],@sigma0[2]
xor $tmp2,$tmp1
$SRL $tmp3,@X[14],@sigma1[0]
xor $tmp2,$tmp0 # sigma0(X[i+1])
$SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]`
$ADDU @X[0],$tmp2
$SRL $tmp0,@X[14],@sigma1[1]
xor $tmp3,$tmp1
$SLL $tmp1,`@sigma1[2]-@sigma1[1]`
xor $tmp3,$tmp0
$SRL $tmp0,@X[14],@sigma1[2]
xor $tmp3,$tmp1
xor $tmp3,$tmp0 # sigma1(X[i+14])
$ADDU @X[0],$tmp3
___
&BODY_00_15(@_);
}
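# Matching hedged sketch of the Xupdate step BODY_16_XX emits, again
# for SHA-256: with @x holding the 16-word ring (so $x[0] is X[i-16]),
# the new word is X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2]).
sub xupdate_ref {
    my @x = @_;                         # 16 most recent schedule words
    my $rotr = sub { my ($v,$n)=@_;
        (($v>>$n) | ($v<<(32-$n))) & 0xffffffff };
    my $s0 = $rotr->($x[1],7)   ^ $rotr->($x[1],18)  ^ ($x[1]>>3);
    my $s1 = $rotr->($x[14],17) ^ $rotr->($x[14],19) ^ ($x[14]>>10);
    return ($x[0] + $s0 + $x[9] + $s1) & 0xffffffff;    # new X[i]
}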
$FRAMESIZE=16*$SZ+16*$SZREG;
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
$code.=<<___;
#ifdef OPENSSL_FIPSCANISTER
# include <openssl/fipssyms.h>
#endif
.text
.set noat
#if !defined(__vxworks) || defined(__pic__)
.option pic2
#endif
.align 5
.globl sha${label}_block_data_order
.ent sha${label}_block_data_order
sha${label}_block_data_order:
.frame $sp,$FRAMESIZE,$ra
.mask $SAVED_REGS_MASK,-$SZREG
.set noreorder
___
$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
.cpload $pf
___
$code.=<<___;
$PTR_SUB $sp,$FRAMESIZE
$REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
$REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
$REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
$REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
$REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
$REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
$REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
$REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
$REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
$REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
$REG_S $s3,$FRAMESIZE-11*$SZREG($sp)
$REG_S $s2,$FRAMESIZE-12*$SZREG($sp)
$REG_S $s1,$FRAMESIZE-13*$SZREG($sp)
$REG_S $s0,$FRAMESIZE-14*$SZREG($sp)
$REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
___
$code.=<<___;
$PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)`
___
$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
.cplocal $Ktbl
.cpsetup $pf,$zero,sha${label}_block_data_order
___
$code.=<<___;
.set reorder
la $Ktbl,K${label} # PIC-ified 'load address'
$LD $A,0*$SZ($ctx) # load context
$LD $B,1*$SZ($ctx)
$LD $C,2*$SZ($ctx)
$LD $D,3*$SZ($ctx)
$LD $E,4*$SZ($ctx)
$LD $F,5*$SZ($ctx)
$LD $G,6*$SZ($ctx)
$LD $H,7*$SZ($ctx)
$PTR_ADD @X[15],$inp # pointer to the end of input
$REG_S @X[15],16*$SZ($sp)
b .Loop
.align 5
.Loop:
${LD}l @X[0],$MSB($inp)
${LD}r @X[0],$LSB($inp)
___
for ($i=0;$i<16;$i++)
{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
$code.=<<___;
b .L16_xx
.align 4
.L16_xx:
___
for (;$i<32;$i++)
{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
$code.=<<___;
and @X[6],0xfff
li @X[7],$lastK
.set noreorder
bne @X[6],@X[7],.L16_xx
$PTR_ADD $Ktbl,16*$SZ # Ktbl+=16
$REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input
$LD @X[0],0*$SZ($ctx)
$LD @X[1],1*$SZ($ctx)
$LD @X[2],2*$SZ($ctx)
$PTR_ADD $inp,16*$SZ
$LD @X[3],3*$SZ($ctx)
$ADDU $A,@X[0]
$LD @X[4],4*$SZ($ctx)
$ADDU $B,@X[1]
$LD @X[5],5*$SZ($ctx)
$ADDU $C,@X[2]
$LD @X[6],6*$SZ($ctx)
$ADDU $D,@X[3]
$LD @X[7],7*$SZ($ctx)
$ADDU $E,@X[4]
$ST $A,0*$SZ($ctx)
$ADDU $F,@X[5]
$ST $B,1*$SZ($ctx)
$ADDU $G,@X[6]
$ST $C,2*$SZ($ctx)
$ADDU $H,@X[7]
$ST $D,3*$SZ($ctx)
$ST $E,4*$SZ($ctx)
$ST $F,5*$SZ($ctx)
$ST $G,6*$SZ($ctx)
$ST $H,7*$SZ($ctx)
bne $inp,@X[15],.Loop
$PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl
$REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
$REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
$REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
$REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
$REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
$REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
$REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
$REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
$REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
$REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_L $s3,$FRAMESIZE-11*$SZREG($sp)
$REG_L $s2,$FRAMESIZE-12*$SZREG($sp)
$REG_L $s1,$FRAMESIZE-13*$SZREG($sp)
$REG_L $s0,$FRAMESIZE-14*$SZREG($sp)
$REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
___
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE
.end sha${label}_block_data_order
.rdata
.align 5
K${label}:
___
if ($SZ==4) {
$code.=<<___;
.word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
.dword 0x428a2f98d728ae22, 0x7137449123ef65cd
.dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
.dword 0x3956c25bf348b538, 0x59f111f1b605d019
.dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
.dword 0xd807aa98a3030242, 0x12835b0145706fbe
.dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
.dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
.dword 0x9bdc06a725c71235, 0xc19bf174cf692694
.dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
.dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
.dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
.dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
.dword 0x983e5152ee66dfab, 0xa831c66d2db43210
.dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4
.dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725
.dword 0x06ca6351e003826f, 0x142929670a0e6e70
.dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926
.dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
.dword 0x650a73548baf63de, 0x766a0abb3c77b2a8
.dword 0x81c2c92e47edaee6, 0x92722c851482353b
.dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001
.dword 0xc24b8b70d0f89791, 0xc76c51a30654be30
.dword 0xd192e819d6ef5218, 0xd69906245565a910
.dword 0xf40e35855771202a, 0x106aa07032bbd1b8
.dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
.dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
.dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
.dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
.dword 0x748f82ee5defb2fc, 0x78a5636f43172f60
.dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec
.dword 0x90befffa23631e28, 0xa4506cebde82bde9
.dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b
.dword 0xca273eceea26619c, 0xd186b8c721c0c207
.dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
.dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6
.dword 0x113f9804bef90dae, 0x1b710b35131c471b
.dword 0x28db77f523047d84, 0x32caab7b40c72493
.dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
.dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
.dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
___
}
$code.=<<___;
.asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;


@@ -0,0 +1,793 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA256/512 block procedure for PA-RISC.
# June 2009.
#
# SHA256 performance is >75% better than gcc 3.2 generated code on
# PA-7100LC. Compared to code generated by the vendor compiler, this
# implementation is almost 70% faster in a 64-bit build, but delivers
# virtually the same performance in a 32-bit build on PA-8600.
#
# SHA512 performance is >2.9x better than gcc 3.2 generated code on
# PA-7100LC, a PA-RISC 1.1 processor. The implementation then detects
# if the code is executed on a PA-RISC 2.0 processor and switches to
# a 64-bit code path delivering adequate performance even in a
# "blended" 32-bit build. The 64-bit code is not any faster than code
# generated by the vendor compiler on PA-8600, though...
#
# Special thanks to polarhome.com for providing HP-UX account.
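#
# A hedged aside on the PA-RISC 1.0 path mentioned above: there the
# 64-bit SHA-512 arithmetic is done on (hi,lo) 32-bit halves with
# add/addc pairs. A plain-Perl model of one such double-word addition
# (sketch only, never called):
sub add64_ref {
    my ($ahi,$alo,$bhi,$blo) = @_;      # 32-bit halves
    my $lo    = ($alo + $blo) & 0xffffffff;
    my $carry = $lo < $alo ? 1 : 0;     # the bit 'addc' consumes
    my $hi    = ($ahi + $bhi + $carry) & 0xffffffff;
    return ($hi,$lo);
}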
$flavour = shift;
$output = shift;
open STDOUT,">$output" or die "can't open $output: $!";
if ($flavour =~ /64/) {
$LEVEL ="2.0W";
$SIZE_T =8;
$FRAME_MARKER =80;
$SAVED_RP =16;
$PUSH ="std";
$PUSHMA ="std,ma";
$POP ="ldd";
$POPMB ="ldd,mb";
} else {
$LEVEL ="1.0";
$SIZE_T =4;
$FRAME_MARKER =48;
$SAVED_RP =20;
$PUSH ="stw";
$PUSHMA ="stwm";
$POP ="ldw";
$POPMB ="ldwm";
}
if ($output =~ /512/) {
$func="sha512_block_data_order";
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
$LAST10BITS=0x017;
$LD="ldd";
$LDM="ldd,ma";
$ST="std";
} else {
$func="sha256_block_data_order";
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$LAST10BITS=0x0f2;
$LD="ldw";
$LDM="ldwm";
$ST="stw";
}
$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
# [+ argument transfer]
$XOFF=16*$SZ+32; # local variables
$FRAME+=$XOFF;
$XOFF+=$FRAME_MARKER; # distance between %sp and local variables
$ctx="%r26"; # zapped by $a0
$inp="%r25"; # zapped by $a1
$num="%r24"; # zapped by $t0
$a0 ="%r26";
$a1 ="%r25";
$t0 ="%r24";
$t1 ="%r29";
$Tbl="%r31";
@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
"%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
_ror $e,$Sigma1[0],$a0
and $f,$e,$t0
_ror $e,$Sigma1[1],$a1
addl $t1,$h,$h
andcm $g,$e,$t1
xor $a1,$a0,$a0
_ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1
or $t0,$t1,$t1 ; Ch(e,f,g)
addl @X[$i%16],$h,$h
xor $a0,$a1,$a1 ; Sigma1(e)
addl $t1,$h,$h
_ror $a,$Sigma0[0],$a0
addl $a1,$h,$h
_ror $a,$Sigma0[1],$a1
and $a,$b,$t0
and $a,$c,$t1
xor $a1,$a0,$a0
_ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1
xor $t1,$t0,$t0
and $b,$c,$t1
xor $a0,$a1,$a1 ; Sigma0(a)
addl $h,$d,$d
xor $t1,$t0,$t0 ; Maj(a,b,c)
`"$LDM $SZ($Tbl),$t1" if ($i<15)`
addl $a1,$h,$h
addl $t0,$h,$h
___
}
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
_ror @X[($i+1)%16],$sigma0[0],$a0
_ror @X[($i+1)%16],$sigma0[1],$a1
addl @X[($i+9)%16],@X[$i],@X[$i]
_ror @X[($i+14)%16],$sigma1[0],$t0
_ror @X[($i+14)%16],$sigma1[1],$t1
xor $a1,$a0,$a0
_shr @X[($i+1)%16],$sigma0[2],$a1
xor $t1,$t0,$t0
_shr @X[($i+14)%16],$sigma1[2],$t1
xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f])
xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f])
$LDM $SZ($Tbl),$t1
addl $a0,@X[$i],@X[$i]
addl $t0,@X[$i],@X[$i]
___
$code.=<<___ if ($i==15);
extru $t1,31,10,$a1
comiclr,<> $LAST10BITS,$a1,%r0
ldo 1($Tbl),$Tbl ; signal end of $Tbl
___
&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.ALIGN 64
L\$table
___
$code.=<<___ if ($SZ==8);
.WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
.WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
.WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
.WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
.WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
.WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
.WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
.WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
.WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
.WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
.WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
.WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
.WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
.WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
.WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
.WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
.WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
.WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
.WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
.WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
.WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
.WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
.WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
.WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
.WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
.WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
.WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
.WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
.WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
.WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
.WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
.WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
.WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
.WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
.WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
.WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
.WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
.WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
.WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
.WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
___
$code.=<<___ if ($SZ==4);
.WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___
$code.=<<___;
.EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
.ALIGN 64
$func
.PROC
.CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
$PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
$PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
$PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
$PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
$PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
$PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
$PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
_shl $num,`log(16*$SZ)/log(2)`,$num
addl $inp,$num,$num ; $num to point at the end of $inp
$PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments
$PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
$PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
blr %r0,$Tbl
ldi 3,$t1
L\$pic
andcm $Tbl,$t1,$Tbl ; wipe privilege level
ldo L\$table-L\$pic($Tbl),$Tbl
___
$code.=<<___ if ($SZ==8 && $SIZE_T==4);
ldi 31,$t1
mtctl $t1,%cr11
extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0
b L\$parisc1
nop
___
$code.=<<___;
$LD `0*$SZ`($ctx),$A ; load context
$LD `1*$SZ`($ctx),$B
$LD `2*$SZ`($ctx),$C
$LD `3*$SZ`($ctx),$D
$LD `4*$SZ`($ctx),$E
$LD `5*$SZ`($ctx),$F
$LD `6*$SZ`($ctx),$G
$LD `7*$SZ`($ctx),$H
extru $inp,31,`log($SZ)/log(2)`,$t0
sh3addl $t0,%r0,$t0
subi `8*$SZ`,$t0,$t0
mtctl $t0,%cr11 ; load %sar with align factor
L\$oop
ldi `$SZ-1`,$t0
$LDM $SZ($Tbl),$t1
andcm $inp,$t0,$t0 ; align $inp
___
for ($i=0;$i<15;$i++) { # load input block
$code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; }
$code.=<<___;
cmpb,*= $inp,$t0,L\$aligned
$LD `$SZ*15`($t0),@X[15]
$LD `$SZ*16`($t0),@X[16]
___
for ($i=0;$i<16;$i++) { # align data
$code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; }
$code.=<<___;
L\$aligned
nop ; otherwise /usr/ccs/bin/as is confused by the .WORD below
___
for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
L\$rounds
nop ; otherwise /usr/ccs/bin/as is confused by the .WORD below
___
for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled?
nop
$POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
$POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
$POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl
$LD `0*$SZ`($ctx),@X[0] ; load context
$LD `1*$SZ`($ctx),@X[1]
$LD `2*$SZ`($ctx),@X[2]
$LD `3*$SZ`($ctx),@X[3]
$LD `4*$SZ`($ctx),@X[4]
$LD `5*$SZ`($ctx),@X[5]
addl @X[0],$A,$A
$LD `6*$SZ`($ctx),@X[6]
addl @X[1],$B,$B
$LD `7*$SZ`($ctx),@X[7]
ldo `16*$SZ`($inp),$inp ; advance $inp
$ST $A,`0*$SZ`($ctx) ; save context
addl @X[2],$C,$C
$ST $B,`1*$SZ`($ctx)
addl @X[3],$D,$D
$ST $C,`2*$SZ`($ctx)
addl @X[4],$E,$E
$ST $D,`3*$SZ`($ctx)
addl @X[5],$F,$F
$ST $E,`4*$SZ`($ctx)
addl @X[6],$G,$G
$ST $F,`5*$SZ`($ctx)
addl @X[7],$H,$H
$ST $G,`6*$SZ`($ctx)
$ST $H,`7*$SZ`($ctx)
cmpb,*<>,n $inp,$num,L\$oop
$PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
___
if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0
{{
$code.=<<___;
b L\$done
nop
.ALIGN 64
L\$parisc1
___
@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
$Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
"%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
$a0 ="%r17";
$a1 ="%r18";
$a2 ="%r19";
$a3 ="%r20";
$t0 ="%r21";
$t1 ="%r22";
$t2 ="%r28";
$t3 ="%r29";
$Tbl="%r31";
@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx
sub ROUND_00_15_pa1 {
my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
$ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
$code.=<<___ if (!$flag);
ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
___
$code.=<<___;
shd $ehi,$elo,$Sigma1[0],$t0
add $Xlo,$hlo,$hlo
shd $elo,$ehi,$Sigma1[0],$t1
addc $Xhi,$hhi,$hhi ; h += X[i]
shd $ehi,$elo,$Sigma1[1],$t2
ldwm 8($Tbl),$Xhi
shd $elo,$ehi,$Sigma1[1],$t3
ldw -4($Tbl),$Xlo ; load K[i]
xor $t2,$t0,$t0
xor $t3,$t1,$t1
and $flo,$elo,$a0
and $fhi,$ehi,$a1
shd $ehi,$elo,$Sigma1[2],$t2
andcm $glo,$elo,$a2
shd $elo,$ehi,$Sigma1[2],$t3
andcm $ghi,$ehi,$a3
xor $t2,$t0,$t0
xor $t3,$t1,$t1 ; Sigma1(e)
add $Xlo,$hlo,$hlo
xor $a2,$a0,$a0
addc $Xhi,$hhi,$hhi ; h += K[i]
xor $a3,$a1,$a1 ; Ch(e,f,g)
add $t0,$hlo,$hlo
shd $ahi,$alo,$Sigma0[0],$t0
addc $t1,$hhi,$hhi ; h += Sigma1(e)
shd $alo,$ahi,$Sigma0[0],$t1
add $a0,$hlo,$hlo
shd $ahi,$alo,$Sigma0[1],$t2
addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
shd $alo,$ahi,$Sigma0[1],$t3
xor $t2,$t0,$t0
xor $t3,$t1,$t1
shd $ahi,$alo,$Sigma0[2],$t2
and $alo,$blo,$a0
shd $alo,$ahi,$Sigma0[2],$t3
and $ahi,$bhi,$a1
xor $t2,$t0,$t0
xor $t3,$t1,$t1 ; Sigma0(a)
and $alo,$clo,$a2
and $ahi,$chi,$a3
xor $a2,$a0,$a0
add $hlo,$dlo,$dlo
xor $a3,$a1,$a1
addc $hhi,$dhi,$dhi ; d += h
and $blo,$clo,$a2
add $t0,$hlo,$hlo
and $bhi,$chi,$a3
addc $t1,$hhi,$hhi ; h += Sigma0(a)
xor $a2,$a0,$a0
add $a0,$hlo,$hlo
xor $a3,$a1,$a1 ; Maj(a,b,c)
addc $a1,$hhi,$hhi ; h += Maj(a,b,c)
___
$code.=<<___ if ($i==15 && $flag);
extru $Xlo,31,10,$Xlo
comiclr,= $LAST10BITS,$Xlo,%r0
b L\$rounds_pa1
nop
___
push(@X,shift(@X)); push(@X,shift(@X));
}
sub ROUND_16_xx_pa1 {
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
my ($i)=shift;
$i-=16;
$code.=<<___;
ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1
ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9]
ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3
ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14]
shd $Xnhi,$Xnlo,$sigma0[0],$t0
shd $Xnlo,$Xnhi,$sigma0[0],$t1
add $a0,$Xlo,$Xlo
shd $Xnhi,$Xnlo,$sigma0[1],$t2
addc $a1,$Xhi,$Xhi
shd $Xnlo,$Xnhi,$sigma0[1],$t3
xor $t2,$t0,$t0
shd $Xnhi,$Xnlo,$sigma0[2],$t2
xor $t3,$t1,$t1
extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
xor $t2,$t0,$t0
shd $a3,$a2,$sigma1[0],$a0
xor $t3,$t1,$t1 ; sigma0(X[(i+1)&0x0f])
shd $a2,$a3,$sigma1[0],$a1
add $t0,$Xlo,$Xlo
shd $a3,$a2,$sigma1[1],$t2
addc $t1,$Xhi,$Xhi
shd $a2,$a3,$sigma1[1],$t3
xor $t2,$a0,$a0
shd $a3,$a2,$sigma1[2],$t2
xor $t3,$a1,$a1
extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
xor $t2,$a0,$a0
xor $t3,$a1,$a1 ; sigma1(X[(i+14)&0x0f])
add $a0,$Xlo,$Xlo
addc $a1,$Xhi,$Xhi
stw $Xhi,`-$XOFF+8*($i%16)`(%sp)
stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp)
___
&ROUND_00_15_pa1($i,@_,1);
}
$code.=<<___;
ldw `0*4`($ctx),$Ahi ; load context
ldw `1*4`($ctx),$Alo
ldw `2*4`($ctx),$Bhi
ldw `3*4`($ctx),$Blo
ldw `4*4`($ctx),$Chi
ldw `5*4`($ctx),$Clo
ldw `6*4`($ctx),$Dhi
ldw `7*4`($ctx),$Dlo
ldw `8*4`($ctx),$Ehi
ldw `9*4`($ctx),$Elo
ldw `10*4`($ctx),$Fhi
ldw `11*4`($ctx),$Flo
ldw `12*4`($ctx),$Ghi
ldw `13*4`($ctx),$Glo
ldw `14*4`($ctx),$Hhi
ldw `15*4`($ctx),$Hlo
extru $inp,31,2,$t0
sh3addl $t0,%r0,$t0
subi 32,$t0,$t0
mtctl $t0,%cr11 ; load %sar with align factor
L\$oop_pa1
extru $inp,31,2,$a3
comib,= 0,$a3,L\$aligned_pa1
sub $inp,$a3,$inp
ldw `0*4`($inp),$X[0]
ldw `1*4`($inp),$X[1]
ldw `2*4`($inp),$t2
ldw `3*4`($inp),$t3
ldw `4*4`($inp),$a0
ldw `5*4`($inp),$a1
ldw `6*4`($inp),$a2
ldw `7*4`($inp),$a3
vshd $X[0],$X[1],$X[0]
vshd $X[1],$t2,$X[1]
stw $X[0],`-$XOFF+0*4`(%sp)
ldw `8*4`($inp),$t0
vshd $t2,$t3,$t2
stw $X[1],`-$XOFF+1*4`(%sp)
ldw `9*4`($inp),$t1
vshd $t3,$a0,$t3
___
{
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
for ($i=2;$i<=(128/4-8);$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
ldw `(8+$i)*4`($inp),$t[0]
vshd $t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
for (;$i<(128/4-1);$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
vshd $t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
$code.=<<___;
b L\$collected_pa1
stw $t[0],`-$XOFF+$i*4`(%sp)
___
}
$code.=<<___;
L\$aligned_pa1
ldw `0*4`($inp),$X[0]
ldw `1*4`($inp),$X[1]
ldw `2*4`($inp),$t2
ldw `3*4`($inp),$t3
ldw `4*4`($inp),$a0
ldw `5*4`($inp),$a1
ldw `6*4`($inp),$a2
ldw `7*4`($inp),$a3
stw $X[0],`-$XOFF+0*4`(%sp)
ldw `8*4`($inp),$t0
stw $X[1],`-$XOFF+1*4`(%sp)
ldw `9*4`($inp),$t1
___
{
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
for ($i=2;$i<(128/4-8);$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
ldw `(8+$i)*4`($inp),$t[0]
___
push(@t,shift(@t));
}
for (;$i<128/4;$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
___
push(@t,shift(@t));
}
$code.="L\$collected_pa1\n";
}
for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
$code.="L\$rounds_pa1\n";
for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
$code.=<<___;
$POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
$POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
$POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl
ldw `0*4`($ctx),$t1 ; update context
ldw `1*4`($ctx),$t0
ldw `2*4`($ctx),$t3
ldw `3*4`($ctx),$t2
ldw `4*4`($ctx),$a1
ldw `5*4`($ctx),$a0
ldw `6*4`($ctx),$a3
add $t0,$Alo,$Alo
ldw `7*4`($ctx),$a2
addc $t1,$Ahi,$Ahi
ldw `8*4`($ctx),$t1
add $t2,$Blo,$Blo
ldw `9*4`($ctx),$t0
addc $t3,$Bhi,$Bhi
ldw `10*4`($ctx),$t3
add $a0,$Clo,$Clo
ldw `11*4`($ctx),$t2
addc $a1,$Chi,$Chi
ldw `12*4`($ctx),$a1
add $a2,$Dlo,$Dlo
ldw `13*4`($ctx),$a0
addc $a3,$Dhi,$Dhi
ldw `14*4`($ctx),$a3
add $t0,$Elo,$Elo
ldw `15*4`($ctx),$a2
addc $t1,$Ehi,$Ehi
stw $Ahi,`0*4`($ctx)
add $t2,$Flo,$Flo
stw $Alo,`1*4`($ctx)
addc $t3,$Fhi,$Fhi
stw $Bhi,`2*4`($ctx)
add $a0,$Glo,$Glo
stw $Blo,`3*4`($ctx)
addc $a1,$Ghi,$Ghi
stw $Chi,`4*4`($ctx)
add $a2,$Hlo,$Hlo
stw $Clo,`5*4`($ctx)
addc $a3,$Hhi,$Hhi
stw $Dhi,`6*4`($ctx)
ldo `16*$SZ`($inp),$inp ; advance $inp
stw $Dlo,`7*4`($ctx)
stw $Ehi,`8*4`($ctx)
stw $Elo,`9*4`($ctx)
stw $Fhi,`10*4`($ctx)
stw $Flo,`11*4`($ctx)
stw $Ghi,`12*4`($ctx)
stw $Glo,`13*4`($ctx)
stw $Hhi,`14*4`($ctx)
comb,= $inp,$num,L\$done
stw $Hlo,`15*4`($ctx)
b L\$oop_pa1
$PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
L\$done
___
}}
$code.=<<___;
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
$POP `-$FRAME+9*$SIZE_T`(%sp),%r12
$POP `-$FRAME+10*$SIZE_T`(%sp),%r13
$POP `-$FRAME+11*$SIZE_T`(%sp),%r14
$POP `-$FRAME+12*$SIZE_T`(%sp),%r15
$POP `-$FRAME+13*$SIZE_T`(%sp),%r16
$POP `-$FRAME+14*$SIZE_T`(%sp),%r17
$POP `-$FRAME+15*$SIZE_T`(%sp),%r18
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this if the GNU assembler understood the
# .ALLOW 2.0 directive...
my $ldd = sub {
my ($mod,$args) = @_;
my $orig = "ldd$mod\t$args";
if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
{ my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
$opcode|=(1<<3) if ($mod =~ /^,m/);
$opcode|=(1<<2) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $std = sub {
my ($mod,$args) = @_;
my $orig = "std$mod\t$args";
if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
{ my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $extrd = sub {
my ($mod,$args) = @_;
my $orig = "extrd$mod\t$args";
# I only have the ",u" completer; it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
{ my $opcode=(0x36<<26)|($1<<21)|($4<<16);
my $len=32-$3;
$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
{ my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
my $len=32-$2;
$opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
$opcode |= (1<<13) if ($mod =~ /,\**=/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $shrpd = sub {
my ($mod,$args) = @_;
my $orig = "shrpd$mod\t$args";
if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
{ my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
my $cpos=63-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
{ sprintf "\t.WORD\t0x%08x\t; %s",
(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
}
else { "\t".$orig; }
};
sub assemble {
my ($mnemonic,$mod,$args)=@_;
my $opcode = eval("\$$mnemonic");
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}
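# A worked example of the format-3 encoder above (operands are illustrative,
# values hand-computed): for "ldd 0(%r4),%r5" the pattern captures disp=0,
# base=4, target=5, so the emitted word is
#	(0x14<<26)|(4<<21)|(5<<16) = 0x50850000
# and &assemble("ldd","","0(%r4),%r5") returns
#	".WORD	0x50850000	; ldd	0(%r4),%r5"
# while a mnemonic without an encoder, e.g. "ldw", is passed through as-is.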
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32
: sprintf("shd\t%$1,%$2,%d",$3)/e or
# translate made-up instructions: _ror, _shr, _align, _shl
s/_ror(\s+)(%r[0-9]+),/
($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or
s/_shr(\s+%r[0-9]+),([0-9]+),/
$SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
: sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or
s/_align(\s+%r[0-9]+,%r[0-9]+),/
($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or
s/_shl(\s+%r[0-9]+),([0-9]+),/
$SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
: sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
s/cmpb,\*/comb,/ if ($SIZE_T==4);
s/\bbv\b/bve/ if ($SIZE_T==8);
print $_,"\n";
}
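# To illustrate the made-up mnemonics above (operands are examples only):
# "_ror %r20,14" becomes "shd %r20,%r20,14" for SHA256 ($SZ==4) and
# "shrpd %r20,%r20,14" for SHA512, while "_shr %r20,7," becomes
# "extru %r20,24,25," resp. "extrd,u %r20,56,57," [31-7=24, 32-7=25 and
# 63-7=56, 64-7=57]; _shl and _align follow the same pattern.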
close STDOUT;


@@ -0,0 +1,460 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# I let hardware handle unaligned input, except on page boundaries
# (see below for details). Otherwise straightforward implementation
# with X vector in register bank. The module is big-endian [which is
# no big deal as there are no little-endian targets left around].
#			sha256		|	sha512
#			-m64	-m32	|	-m64	-m32
# --------------------------------------+-----------------------
# PPC970,gcc-4.0.0	+50%	+38%	|	+40%	+410%(*)
# Power6,xlc-7		+150%	+90%	|	+100%	+430%(*)
#
# (*) 64-bit code in 32-bit application context, which actually is
# on TODO list. It should be noted that for safe deployment in
# 32-bit *multi-threaded* context asynchronous signals should be
# blocked upon entry to SHA512 block routine. This is because
# 32-bit signaling procedure invalidates upper halves of GPRs.
# Context switch procedure preserves them, but not signaling:-(
# Second version is truly multi-thread safe. The trouble with the
# original version was that it used the thread-local-storage pointer
# register. Well, it scrupulously preserved it, but the problem would
# arise the moment an asynchronous signal was delivered and the signal
# handler dereferenced the TLS pointer. While that never happens in the
# openssl application or test suite, we have to respect this scenario
# and not use the TLS pointer register. The alternative would be to
# require the caller to block signals prior to calling this routine.
# For the record, in 32-bit context R2 serves as the TLS pointer, while
# in 64-bit context it is R13.
$flavour=shift;
$output =shift;
if ($flavour =~ /64/) {
$SIZE_T=8;
$LRSAVE=2*$SIZE_T;
$STU="stdu";
$UCMP="cmpld";
$SHL="sldi";
$POP="ld";
$PUSH="std";
} elsif ($flavour =~ /32/) {
$SIZE_T=4;
$LRSAVE=$SIZE_T;
$STU="stwu";
$UCMP="cmplw";
$SHL="slwi";
$POP="lwz";
$PUSH="stw";
} else { die "nonsense $flavour"; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
if ($output =~ /512/) {
$func="sha512_block_data_order";
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
$LD="ld";
$ST="std";
$ROR="rotrdi";
$SHR="srdi";
} else {
$func="sha256_block_data_order";
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$LD="lwz";
$ST="stw";
$ROR="rotrwi";
$SHR="srwi";
}
$FRAME=32*$SIZE_T+16*$SZ;
$LOCALS=6*$SIZE_T;
$sp ="r1";
$toc="r2";
$ctx="r3"; # zapped by $a0
$inp="r4"; # zapped by $a1
$num="r5"; # zapped by $t0
$T ="r0";
$a0 ="r3";
$a1 ="r4";
$t0 ="r5";
$t1 ="r6";
$Tbl="r7";
$A ="r8";
$B ="r9";
$C ="r10";
$D ="r11";
$E ="r12";
$F ="r13"; $F="r2" if ($SIZE_T==8);# reassigned to exempt TLS pointer
$G ="r14";
$H ="r15";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
"r24","r25","r26","r27","r28","r29","r30","r31");
$inp="r31"; # reassigned $inp! aliases with @X[15]
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
$LD $T,`$i*$SZ`($Tbl)
$ROR $a0,$e,$Sigma1[0]
$ROR $a1,$e,$Sigma1[1]
and $t0,$f,$e
andc $t1,$g,$e
add $T,$T,$h
xor $a0,$a0,$a1
$ROR $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
or $t0,$t0,$t1 ; Ch(e,f,g)
add $T,$T,@X[$i]
xor $a0,$a0,$a1 ; Sigma1(e)
add $T,$T,$t0
add $T,$T,$a0
$ROR $a0,$a,$Sigma0[0]
$ROR $a1,$a,$Sigma0[1]
and $t0,$a,$b
and $t1,$a,$c
xor $a0,$a0,$a1
$ROR $a1,$a1,`$Sigma0[2]-$Sigma0[1]`
xor $t0,$t0,$t1
and $t1,$b,$c
xor $a0,$a0,$a1 ; Sigma0(a)
add $d,$d,$T
xor $t0,$t0,$t1 ; Maj(a,b,c)
add $h,$T,$a0
add $h,$h,$t0
___
}
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
$ROR $a0,@X[($i+1)%16],$sigma0[0]
$ROR $a1,@X[($i+1)%16],$sigma0[1]
$ROR $t0,@X[($i+14)%16],$sigma1[0]
$ROR $t1,@X[($i+14)%16],$sigma1[1]
xor $a0,$a0,$a1
$SHR $a1,@X[($i+1)%16],$sigma0[2]
xor $t0,$t0,$t1
$SHR $t1,@X[($i+14)%16],$sigma1[2]
add @X[$i],@X[$i],@X[($i+9)%16]
xor $a0,$a0,$a1 ; sigma0(X[(i+1)&0x0f])
xor $t0,$t0,$t1 ; sigma1(X[(i+14)&0x0f])
add @X[$i],@X[$i],$a0
add @X[$i],@X[$i],$t0
___
&ROUND_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h);
}
$code=<<___;
.machine "any"
.text
.globl $func
.align 6
$func:
$STU $sp,-$FRAME($sp)
mflr r0
$SHL $num,$num,`log(16*$SZ)/log(2)`
$PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
$PUSH r18,`$FRAME-$SIZE_T*14`($sp)
$PUSH r19,`$FRAME-$SIZE_T*13`($sp)
$PUSH r20,`$FRAME-$SIZE_T*12`($sp)
$PUSH r21,`$FRAME-$SIZE_T*11`($sp)
$PUSH r22,`$FRAME-$SIZE_T*10`($sp)
$PUSH r23,`$FRAME-$SIZE_T*9`($sp)
$PUSH r24,`$FRAME-$SIZE_T*8`($sp)
$PUSH r25,`$FRAME-$SIZE_T*7`($sp)
$PUSH r26,`$FRAME-$SIZE_T*6`($sp)
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
$LD $A,`0*$SZ`($ctx)
mr $inp,r4 ; incarnate $inp
$LD $B,`1*$SZ`($ctx)
$LD $C,`2*$SZ`($ctx)
$LD $D,`3*$SZ`($ctx)
$LD $E,`4*$SZ`($ctx)
$LD $F,`5*$SZ`($ctx)
$LD $G,`6*$SZ`($ctx)
$LD $H,`7*$SZ`($ctx)
bl LPICmeup
LPICedup:
andi. r0,$inp,3
bne Lunaligned
Laligned:
add $num,$inp,$num
$PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
b Ldone
; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses a page boundary. The "better
; safe than sorry" principle makes me treat it specially. I don't
; look for the particular offending word, but rather for the input
; block which crosses the boundary. Once found, that block is copied
; to an aligned buffer and hashed separately...
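; For instance [SHA512 flavour, 16*$SZ=128, values illustrative]: with
; $inp ending in 0xf00 the distance computed below is 4096-0xf00=256,
; and 256&0xf80=256!=0, so the next block still fits in the page; with
; $inp ending in 0xfc0 the distance is 64, 64&0xf80=0, and we branch
; to Lcross_page.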
.align 4
Lunaligned:
subfic $t1,$inp,4096
andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary
beq Lcross_page
$UCMP $num,$t1
ble- Laligned ; didn't cross the page boundary
subfc $num,$t1,$num
add $t1,$inp,$t1
$PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num
$PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; intermediate end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
; $inp equals the intermediate end pointer here
$POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real remaining num
Lcross_page:
li $t1,`16*$SZ/4`
mtctr $t1
addi r20,$sp,$LOCALS ; aligned spot below the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
lbz r18,2($inp)
lbz r19,3($inp)
addi $inp,$inp,4
stb r16,0(r20)
stb r17,1(r20)
stb r18,2(r20)
stb r19,3(r20)
addi r20,r20,4
bdnz Lmemcpy
$PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer
addi $inp,$sp,$LOCALS ; fictitious inp pointer
$PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
$PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
$POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp
$POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
addic. $num,$num,`-16*$SZ` ; num--
bne- Lunaligned
Ldone:
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
.align 4
Lsha2_block_private:
___
for($i=0;$i<16;$i++) {
$code.=<<___ if ($SZ==4);
lwz @X[$i],`$i*$SZ`($inp)
___
# 64-bit loads are split into 2x32-bit ones, as the CPU can't handle
# unaligned 64-bit loads, only 32-bit ones...
$code.=<<___ if ($SZ==8);
lwz $t0,`$i*$SZ`($inp)
lwz @X[$i],`$i*$SZ+4`($inp)
insrdi @X[$i],$t0,32,0
___
&ROUND_00_15($i,@V);
unshift(@V,pop(@V));
}
$code.=<<___;
li $T,`$rounds/16-1`
mtctr $T
.align 4
Lrounds:
addi $Tbl,$Tbl,`16*$SZ`
___
for(;$i<32;$i++) {
&ROUND_16_xx($i,@V);
unshift(@V,pop(@V));
}
$code.=<<___;
bdnz- Lrounds
$POP $ctx,`$FRAME-$SIZE_T*22`($sp)
$POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
$POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl
$LD r16,`0*$SZ`($ctx)
$LD r17,`1*$SZ`($ctx)
$LD r18,`2*$SZ`($ctx)
$LD r19,`3*$SZ`($ctx)
$LD r20,`4*$SZ`($ctx)
$LD r21,`5*$SZ`($ctx)
$LD r22,`6*$SZ`($ctx)
addi $inp,$inp,`16*$SZ` ; advance inp
$LD r23,`7*$SZ`($ctx)
add $A,$A,r16
add $B,$B,r17
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp)
add $C,$C,r18
$ST $A,`0*$SZ`($ctx)
add $D,$D,r19
$ST $B,`1*$SZ`($ctx)
add $E,$E,r20
$ST $C,`2*$SZ`($ctx)
add $F,$F,r21
$ST $D,`3*$SZ`($ctx)
add $G,$G,r22
$ST $E,`4*$SZ`($ctx)
add $H,$H,r23
$ST $F,`5*$SZ`($ctx)
$ST $G,`6*$SZ`($ctx)
$UCMP $inp,$num
$ST $H,`7*$SZ`($ctx)
bne Lsha2_block_private
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
___
# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
$code.=<<___;
.align 6
LPICmeup:
mflr r0
bcl 20,31,\$+4
mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
addi $Tbl,$Tbl,`64-8`
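	; (LPICmeup is .align 6, i.e. 64-byte aligned, and the K table
	; starts at LPICmeup+64; the mflr above returns LPICmeup+8,
	; hence the 64-8 adjustment)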
mtlr r0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.space `64-9*4`
___
$code.=<<___ if ($SZ==8);
.long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
.long 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
.long 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
.long 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
.long 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
.long 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
.long 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
.long 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
.long 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
.long 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
.long 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
.long 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
.long 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
.long 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
.long 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
.long 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
.long 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
.long 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
.long 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
.long 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
.long 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
.long 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
.long 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
.long 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
.long 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
.long 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
.long 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
.long 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
.long 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
.long 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
.long 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
.long 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
.long 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
.long 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
.long 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
.long 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
.long 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
.long 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
.long 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
.long 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;


@@ -0,0 +1,322 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA256/512 block procedures for s390x.
# April 2007.
#
# sha256_block_data_order is reportedly >3 times faster than gcc 3.3
# generated code (must be a bug in the compiler, as the improvement is
# "pathologically" high, in particular in comparison to other SHA
# modules). But the real twist is that it detects if hardware support
# for SHA256 is available and in such case utilizes it. Then the
# performance can reach >6.5x that of the assembler version for larger chunks.
#
# sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
# January 2009.
#
# Add support for hardware SHA512 and reschedule instructions to
# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
# than software.
# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's a "z-CPU". The latter implies that the code
# remains z/Architecture specific. On z900 SHA256 was measured to
# perform 2.4x and SHA512 13x better than code generated by gcc 4.3.
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
$t0="%r0";
$t1="%r1";
$ctx="%r2"; $t2="%r2";
$inp="%r3";
$len="%r4"; # used as index in inner loop
$A="%r5";
$B="%r6";
$C="%r7";
$D="%r8";
$E="%r9";
$F="%r10";
$G="%r11";
$H="%r12"; @V=($A,$B,$C,$D,$E,$F,$G,$H);
$tbl="%r13";
$T1="%r14";
$sp="%r15";
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}	# skip flags; the first "name.ext"-looking argument is the output file
open STDOUT,">$output";
if ($output =~ /512/) {
$label="512";
$SZ=8;
$LD="lg"; # load from memory
$ST="stg"; # store to memory
$ADD="alg"; # add with memory operand
$ROT="rllg"; # rotate left
$SHR="srlg"; # logical right shift [see even at the end]
@Sigma0=(25,30,36);
@Sigma1=(23,46,50);
@sigma0=(56,63, 7);
@sigma1=( 3,45, 6);
$rounds=80;
$kimdfunc=3; # 0 means unknown/unsupported/unimplemented/disabled
} else {
$label="256";
$SZ=4;
$LD="llgf"; # load from memory
$ST="st"; # store to memory
$ADD="al"; # add with memory operand
$ROT="rll"; # rotate left
$SHR="srl"; # logical right shift
@Sigma0=(10,19,30);
@Sigma1=( 7,21,26);
@sigma0=(14,25, 3);
@sigma1=(13,15,10);
$rounds=64;
$kimdfunc=2; # magic function code for kimd instruction
}
$Func="sha${label}_block_data_order";
$Table="K${label}";
$stdframe=16*$SIZE_T+4*8;
$frame=$stdframe+16*$SZ;
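# ($stdframe is the ABI register save area: 16*$SIZE_T+4*8 comes to the
# standard 160-byte frame in 64-bit mode and 96 bytes in 31-bit mode;
# the extra 16*$SZ bytes hold the X[] schedule.)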
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
$LD $T1,`$i*$SZ`($inp) ### $i
___
$code.=<<___;
$ROT $t0,$e,$Sigma1[0]
$ROT $t1,$e,$Sigma1[1]
lgr $t2,$f
xgr $t0,$t1
$ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
xgr $t2,$g
$ST $T1,`$stdframe+$SZ*($i%16)`($sp)
xgr $t0,$t1 # Sigma1(e)
algr $T1,$h # T1+=h
ngr $t2,$e
lgr $t1,$a
algr $T1,$t0 # T1+=Sigma1(e)
$ROT $h,$a,$Sigma0[0]
xgr $t2,$g # Ch(e,f,g)
$ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
$ROT $t0,$a,$Sigma0[1]
algr $T1,$t2 # T1+=Ch(e,f,g)
ogr $t1,$b
xgr $h,$t0
lgr $t2,$a
ngr $t1,$c
$ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]`
xgr $h,$t0 # h=Sigma0(a)
ngr $t2,$b
algr $h,$T1 # h+=T1
ogr $t2,$t1 # Maj(a,b,c)
algr $d,$T1 # d+=T1
algr $h,$t2 # h+=Maj(a,b,c)
___
}
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
$LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i
$LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp)
$ROT $t0,$T1,$sigma0[0]
$SHR $T1,$sigma0[2]
$ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
xgr $T1,$t0
$ROT $t0,$t1,$sigma1[0]
xgr $T1,$t2 # sigma0(X[i+1])
$SHR $t1,$sigma1[2]
$ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i]
xgr $t1,$t0
$ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
$ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
xgr $t1,$t0 # sigma1(X[i+14])
algr $T1,$t1 # +=sigma1(X[i+14])
___
&BODY_00_15(@_);
}
$code.=<<___;
.text
.align 64
.type $Table,\@object
$Table:
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
___
$code.=<<___;
.size $Table,.-$Table
.globl $Func
.type $Func,\@function
$Func:
sllg $len,$len,`log(16*$SZ)/log(2)`
___
$code.=<<___ if ($kimdfunc);
larl %r1,OPENSSL_s390xcap_P
lg %r0,0(%r1)
tmhl %r0,0x4000 # check for message-security assist
jz .Lsoftware
lghi %r0,0
la %r1,`2*$SIZE_T`($sp)
.long 0xb93e0002 # kimd %r0,%r2
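	# (function code 0 in %r0 selects the query operation, which
	# stores the 16-byte capability vector at the parameter block
	# addressed by %r1)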
lg %r0,`2*$SIZE_T`($sp)
tmhh %r0,`0x8000>>$kimdfunc`
jz .Lsoftware
lghi %r0,$kimdfunc
lgr %r1,$ctx
lgr %r2,$inp
lgr %r3,$len
.long 0xb93e0002 # kimd %r0,%r2
brc 1,.-4 # pay attention to "partial completion"
br %r14
.align 16
.Lsoftware:
___
$code.=<<___;
lghi %r1,-$frame
la $len,0($len,$inp)
stm${g} $ctx,%r15,`2*$SIZE_T`($sp)
lgr %r0,$sp
la $sp,0(%r1,$sp)
st${g} %r0,0($sp)
larl $tbl,$Table
$LD $A,`0*$SZ`($ctx)
$LD $B,`1*$SZ`($ctx)
$LD $C,`2*$SZ`($ctx)
$LD $D,`3*$SZ`($ctx)
$LD $E,`4*$SZ`($ctx)
$LD $F,`5*$SZ`($ctx)
$LD $G,`6*$SZ`($ctx)
$LD $H,`7*$SZ`($ctx)
.Lloop:
lghi $len,0
___
for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
aghi $len,`16*$SZ`
lghi $t0,`($rounds-16)*$SZ`
clgr $len,$t0
jne .Lrounds_16_xx
l${g} $ctx,`$frame+2*$SIZE_T`($sp)
la $inp,`16*$SZ`($inp)
$ADD $A,`0*$SZ`($ctx)
$ADD $B,`1*$SZ`($ctx)
$ADD $C,`2*$SZ`($ctx)
$ADD $D,`3*$SZ`($ctx)
$ADD $E,`4*$SZ`($ctx)
$ADD $F,`5*$SZ`($ctx)
$ADD $G,`6*$SZ`($ctx)
$ADD $H,`7*$SZ`($ctx)
$ST $A,`0*$SZ`($ctx)
$ST $B,`1*$SZ`($ctx)
$ST $C,`2*$SZ`($ctx)
$ST $D,`3*$SZ`($ctx)
$ST $E,`4*$SZ`($ctx)
$ST $F,`5*$SZ`($ctx)
$ST $G,`6*$SZ`($ctx)
$ST $H,`7*$SZ`($ctx)
cl${g} $inp,`$frame+4*$SIZE_T`($sp)
jne .Lloop
lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
br %r14
.size $Func,.-$Func
.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.comm OPENSSL_s390xcap_P,16,8
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# unlike the 32-bit shift, the 64-bit one takes three arguments
$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
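# e.g. a two-operand "srlg %r7,6" emitted by the code above is rewritten
# here into the genuine three-operand form "srlg %r7,%r7,6" (register and
# count are illustrative).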
print $code;
close STDOUT;


@@ -0,0 +1,594 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA256 performance improvement over compiler-generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in the SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] into 8 64-bit registers.
# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x better than 32-bit code. X[16] resides on stack, but access to it
# is scheduled for L2 latency and staged through 32 least significant
# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
# good [optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4x as many threads as there are physical cores, and that
# it leaves gcc [3.4] behind by an over-4x factor! Compared to SHA256,
# single-thread performance is only 10% better, but overall throughput
# at the maximum thread count for a given CPU exceeds the corresponding
# SHA256 figure by 30% [again, optimal coefficient is 50%].
#
# (*) Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
# in-order, i.e. a load instruction has to complete before the next
# instruction in the given thread is executed, even if the latter
# does not depend on the load result! This means that on T1 two 32-bit
# loads are always slower than one 64-bit load. Once again this
# is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
# 2x32-bit loads can be as fast as 1x64-bit ones.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }
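# ($bias=2047 is the SPARC V9 stack bias: %sp points 2047 bytes below
# the actual frame, so every %sp-relative access below has to add it
# back in.)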
$output=shift;
open STDOUT,">$output";
if ($output =~ /512/) {
$label="512";
$SZ=8;
$LD="ldx"; # load from memory
$ST="stx"; # store to memory
$SLL="sllx"; # shift left logical
$SRL="srlx"; # shift right logical
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=( 7, 1, 8); # right shift first
@sigma1=( 6,19,61); # right shift first
$lastK=0x817;
$rounds=80;
$align=4;
$locals=16*$SZ; # X[16]
$A="%o0";
$B="%o1";
$C="%o2";
$D="%o3";
$E="%o4";
$F="%o5";
$G="%g1";
$H="%o7";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
$label="256";
$SZ=4;
$LD="ld"; # load from memory
$ST="st"; # store to memory
$SLL="sll"; # shift left logical
$SRL="srl"; # shift right logical
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 3, 7,18); # right shift first
@sigma1=(10,17,19); # right shift first
$lastK=0x8f2;
$rounds=64;
$align=8;
$locals=0; # X[16] is register resident
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
$F="%l5";
$G="%l6";
$H="%l7";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";
$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";
$tmp32="%i5";
########### SHA256
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
if ($i==0) {
$code.=<<___;
ldx [$inp+0],@X[0]
ldx [$inp+16],@X[2]
ldx [$inp+32],@X[4]
ldx [$inp+48],@X[6]
ldx [$inp+8],@X[1]
ldx [$inp+24],@X[3]
subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
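	! (the 64-bit shifts use the count mod 64, so -$tmp31 and
	! 64-$tmp31 select the same shift amount)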
ldx [$inp+40],@X[5]
bz,pt %icc,.Laligned
ldx [$inp+56],@X[7]
sllx @X[0],$tmp31,@X[0]
ldx [$inp+64],$T1
___
for($j=0;$j<7;$j++)
{ $code.=<<___;
srlx @X[$j+1],$tmp32,$tmp1
sllx @X[$j+1],$tmp31,@X[$j+1]
or $tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
srlx $T1,$tmp32,$T1
or $T1,@X[7],@X[7]
.Laligned:
___
}
if ($i&1) {
$code.="\tadd @X[$i/2],$h,$T1\n";
} else {
$code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
}
} if ($SZ==4);
########### SHA512
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
$code.=<<___ if ($i==0);
ld [$inp+0],%l0
ld [$inp+4],%l1
ld [$inp+8],%l2
ld [$inp+12],%l3
ld [$inp+16],%l4
ld [$inp+20],%l5
ld [$inp+24],%l6
ld [$inp+28],%l7
___
$code.=<<___ if ($i<15);
sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
add $tmp31,32,$tmp0
sllx @pair[0],$tmp0,$tmp1
`"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
srlx @pair[2],$tmp32,@pair[1]
or $tmp1,$tmp2,$tmp2
or @pair[1],$tmp2,$tmp2
`"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
add $h,$tmp2,$T1
$ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
___
$code.=<<___ if ($i==12);
brnz,a $tmp31,.+8
ld [$inp+128],%l0
___
$code.=<<___ if ($i==15);
ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
add $tmp31,32,$tmp0
ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
sllx @pair[0],$tmp0,$tmp1
ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
srlx @pair[2],$tmp32,@pair[1]
or $tmp1,$tmp2,$tmp2
ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
or @pair[1],$tmp2,$tmp2
ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
add $h,$tmp2,$T1
$ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);
########### common
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
if ($i<16) {
&$Xload(@_);
} else {
$code.="\tadd $h,$T1,$T1\n";
}
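# SPARCv9 has no rotate instruction, so the Sigma computations below
# synthesize each rotation as ($SRL x,n)|($SLL x,$SZ*8-n) and fold the
# partial results together with xors.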
$code.=<<___;
$SRL $e,@Sigma1[0],$h !! $i
xor $f,$g,$tmp2
$SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
and $e,$tmp2,$tmp2
$SRL $e,@Sigma1[1],$tmp0
xor $tmp1,$h,$h
$SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
xor $tmp0,$h,$h
$SRL $e,@Sigma1[2],$tmp0
xor $tmp1,$h,$h
$SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
xor $tmp0,$h,$h
xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
xor $tmp1,$h,$tmp0 ! Sigma1(e)
$SRL $a,@Sigma0[0],$h
add $tmp2,$T1,$T1
$LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
$SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
add $tmp0,$T1,$T1
$SRL $a,@Sigma0[1],$tmp0
xor $tmp1,$h,$h
$SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
xor $tmp0,$h,$h
$SRL $a,@Sigma0[2],$tmp0
xor $tmp1,$h,$h
$SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
xor $tmp0,$h,$h
xor $tmp1,$h,$h ! Sigma0(a)
or $a,$b,$tmp0
and $a,$b,$tmp1
and $c,$tmp0,$tmp0
or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
add $tmp2,$T1,$T1 ! +=K[$i]
add $tmp1,$h,$h
add $T1,$d,$d
add $T1,$h,$h
___
}
########### SHA256
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;
if ($i&1) {
$xi=$tmp32;
$code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
} else {
$xi=@X[(($i+1)/2)%8];
}
$code.=<<___;
srl $xi,@sigma0[0],$T1 !! Xupdate($i)
sll $xi,`32-@sigma0[2]`,$tmp1
srl $xi,@sigma0[1],$tmp0
xor $tmp1,$T1,$T1
sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
xor $tmp0,$T1,$T1
srl $xi,@sigma0[2],$tmp0
xor $tmp1,$T1,$T1
___
if ($i&1) {
$xi=@X[(($i+14)/2)%8];
} else {
$xi=$tmp32;
$code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
}
$code.=<<___;
srl $xi,@sigma1[0],$tmp2
xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
sll $xi,`32-@sigma1[2]`,$tmp1
srl $xi,@sigma1[1],$tmp0
xor $tmp1,$tmp2,$tmp2
sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
xor $tmp0,$tmp2,$tmp2
srl $xi,@sigma1[2],$tmp0
xor $tmp1,$tmp2,$tmp2
___
if ($i&1) {
$xi=@X[($i/2)%8];
$code.=<<___;
srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
srl @X[($i/2)%8],0,$tmp0
add $tmp2,$tmp1,$tmp1
add $xi,$T1,$T1 ! +=X[i]
xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
add $tmp1,$T1,$T1
srl $T1,0,$T1
or $T1,@X[($i/2)%8],@X[($i/2)%8]
___
} else {
$xi=@X[(($i+9)/2)%8];
$code.=<<___;
srlx @X[($i/2)%8],32,$tmp1 ! X[i]
xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
add $xi,$T1,$T1 ! +=X[i+9]
add $tmp2,$tmp1,$tmp1
srl @X[($i/2)%8],0,@X[($i/2)%8]
add $tmp1,$T1,$T1
sllx $T1,32,$tmp0
or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
}
&BODY_00_15(@_);
} if ($SZ==4);
########### SHA512
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
$code.=<<___;
sllx %l2,32,$tmp0 !! Xupdate($i)
or %l3,$tmp0,$tmp0
srlx $tmp0,@sigma0[0],$T1
ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
sllx $tmp0,`64-@sigma0[2]`,$tmp1
ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
srlx $tmp0,@sigma0[1],$tmp0
xor $tmp1,$T1,$T1
sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
xor $tmp0,$T1,$T1
srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
xor $tmp1,$T1,$T1
sllx %l6,32,$tmp2
xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
or %l7,$tmp2,$tmp2
srlx $tmp2,@sigma1[0],$tmp1
ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
sllx $tmp2,`64-@sigma1[2]`,$tmp0
ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
srlx $tmp2,@sigma1[1],$tmp2
xor $tmp0,$tmp1,$tmp1
sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
xor $tmp2,$tmp1,$tmp1
srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
xor $tmp0,$tmp1,$tmp1
sllx %l4,32,$tmp0
xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
or %l5,$tmp0,$tmp0
ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
sllx %l0,32,$tmp2
add $tmp1,$T1,$T1
ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
or %l1,$tmp2,$tmp2
add $tmp0,$T1,$T1 ! +=X[$i+9]
ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
add $tmp2,$T1,$T1 ! +=X[$i]
$ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
___
&BODY_00_15(@_);
} if ($SZ==8);
$code.=<<___ if ($bits==64);
.register %g2,#scratch
.register %g3,#scratch
___
$code.=<<___;
.section ".text",#alloc,#execinstr
.align 64
K${label}:
.type K${label},#object
___
if ($SZ==4) {
$code.=<<___;
.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
.long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
.long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
.long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
.long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
.long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
.long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
.long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
.long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
.long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
.long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
.long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
.long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
.long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
.long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
.long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
.long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
.long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
.long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
.long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
.long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
.long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
.long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
.long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
.long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
.long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
.long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
.long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
.long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
.long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
.long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
.long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
.long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
.long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
.long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
.long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
.long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
.long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
.long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
.long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
.long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
$code.=<<___;
.size K${label},.-K${label}
.globl sha${label}_block_data_order
sha${label}_block_data_order:
save %sp,`-$frame-$locals`,%sp
and $inp,`$align-1`,$tmp31
sllx $len,`log(16*$SZ)/log(2)`,$len
andn $inp,`$align-1`,$inp
sll $tmp31,3,$tmp31
add $inp,$len,$len
___
$code.=<<___ if ($SZ==8); # SHA512
mov 32,$tmp32
sub $tmp32,$tmp31,$tmp32
___
$code.=<<___;
.Lpic: call .+8
add %o7,K${label}-.Lpic,$Ktbl
$LD [$ctx+`0*$SZ`],$A
$LD [$ctx+`1*$SZ`],$B
$LD [$ctx+`2*$SZ`],$C
$LD [$ctx+`3*$SZ`],$D
$LD [$ctx+`4*$SZ`],$E
$LD [$ctx+`5*$SZ`],$F
$LD [$ctx+`6*$SZ`],$G
$LD [$ctx+`7*$SZ`],$H
.Lloop:
___
for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
and $tmp2,0xfff,$tmp2
cmp $tmp2,$lastK
bne .L16_xx
add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
___
$code.=<<___ if ($SZ==4); # SHA256
$LD [$ctx+`0*$SZ`],@X[0]
$LD [$ctx+`1*$SZ`],@X[1]
$LD [$ctx+`2*$SZ`],@X[2]
$LD [$ctx+`3*$SZ`],@X[3]
$LD [$ctx+`4*$SZ`],@X[4]
$LD [$ctx+`5*$SZ`],@X[5]
$LD [$ctx+`6*$SZ`],@X[6]
$LD [$ctx+`7*$SZ`],@X[7]
add $A,@X[0],$A
$ST $A,[$ctx+`0*$SZ`]
add $B,@X[1],$B
$ST $B,[$ctx+`1*$SZ`]
add $C,@X[2],$C
$ST $C,[$ctx+`2*$SZ`]
add $D,@X[3],$D
$ST $D,[$ctx+`3*$SZ`]
add $E,@X[4],$E
$ST $E,[$ctx+`4*$SZ`]
add $F,@X[5],$F
$ST $F,[$ctx+`5*$SZ`]
add $G,@X[6],$G
$ST $G,[$ctx+`6*$SZ`]
add $H,@X[7],$H
$ST $H,[$ctx+`7*$SZ`]
___
$code.=<<___ if ($SZ==8); # SHA512
ld [$ctx+`0*$SZ+0`],%l0
ld [$ctx+`0*$SZ+4`],%l1
ld [$ctx+`1*$SZ+0`],%l2
ld [$ctx+`1*$SZ+4`],%l3
ld [$ctx+`2*$SZ+0`],%l4
ld [$ctx+`2*$SZ+4`],%l5
ld [$ctx+`3*$SZ+0`],%l6
sllx %l0,32,$tmp0
ld [$ctx+`3*$SZ+4`],%l7
sllx %l2,32,$tmp1
or %l1,$tmp0,$tmp0
or %l3,$tmp1,$tmp1
add $tmp0,$A,$A
add $tmp1,$B,$B
$ST $A,[$ctx+`0*$SZ`]
sllx %l4,32,$tmp2
$ST $B,[$ctx+`1*$SZ`]
sllx %l6,32,$T1
or %l5,$tmp2,$tmp2
or %l7,$T1,$T1
add $tmp2,$C,$C
$ST $C,[$ctx+`2*$SZ`]
add $T1,$D,$D
$ST $D,[$ctx+`3*$SZ`]
ld [$ctx+`4*$SZ+0`],%l0
ld [$ctx+`4*$SZ+4`],%l1
ld [$ctx+`5*$SZ+0`],%l2
ld [$ctx+`5*$SZ+4`],%l3
ld [$ctx+`6*$SZ+0`],%l4
ld [$ctx+`6*$SZ+4`],%l5
ld [$ctx+`7*$SZ+0`],%l6
sllx %l0,32,$tmp0
ld [$ctx+`7*$SZ+4`],%l7
sllx %l2,32,$tmp1
or %l1,$tmp0,$tmp0
or %l3,$tmp1,$tmp1
add $tmp0,$E,$E
add $tmp1,$F,$F
$ST $E,[$ctx+`4*$SZ`]
sllx %l4,32,$tmp2
$ST $F,[$ctx+`5*$SZ`]
sllx %l6,32,$T1
or %l5,$tmp2,$tmp2
or %l7,$T1,$T1
add $tmp2,$G,$G
$ST $G,[$ctx+`6*$SZ`]
add $T1,$H,$H
$ST $H,[$ctx+`7*$SZ`]
___
$code.=<<___;
add $inp,`16*$SZ`,$inp ! advance inp
cmp $inp,$len
bne `$bits==64?"%xcc":"%icc"`,.Lloop
sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
ret
restore
.type sha${label}_block_data_order,#function
.size sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

File diff suppressed because it is too large


@@ -0,0 +1,451 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40%. No magical
# tricks, just straight implementation... I really wonder why gcc
# [being armed with inline assembler] fails to generate as fast code.
# The only thing that is cool about this module is that it's the very
# same instruction sequence that is used for both SHA-256 and SHA-512.
# In the former case the instructions operate on 32-bit operands, while
# in the latter on 64-bit ones. All I had to do was get one flavor
# right; the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to IA-64 implementation, which maintains
# X[16] in register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
# issue Opteron pipeline and X[16] maintained in memory. So that *if*
# there is a way to improve it, *then* the only way would be to try to
# offload X[16] updates to SSE unit, but that would require "deeper"
# loop unroll, which in turn would naturally cause size blow-up, not
# to mention increased complexity! And once again, only *if* it's
# actually possible to noticeably improve overall ILP, instruction
# level parallelism, on a given CPU implementation in this case.
#
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
# [currently available] EM64T CPUs apparently are far from it. On the
# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
# sha256_block:-( This is presumably because 64-bit shifts/rotates
# apparently are not atomic instructions, but implemented in microcode.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
if ($output =~ /512/) {
$func="sha512_block_data_order";
$TABLE="K512";
$SZ=8;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
"%r8", "%r9", "%r10","%r11");
($T1,$a0,$a1,$a2)=("%r12","%r13","%r14","%r15");
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
} else {
$func="sha256_block_data_order";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2)=("%r12d","%r13d","%r14d","%r15d");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
}
$ctx="%rdi"; # 1st arg
$round="%rdi"; # zaps $ctx
$inp="%rsi"; # 2nd arg
$Tbl="%rbp";
$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="16*$SZ+3*8(%rsp)";
$framesz="16*$SZ+4*8";
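# (stack layout: 16*$SZ bytes of circular X[] storage at the bottom,
# followed by the four 8-byte spill slots defined above)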
sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
mov $f,$a2
mov $T1,`$SZ*($i&0xf)`(%rsp)
ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
xor $e,$a0
xor $g,$a2 # f^g
ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
add $h,$T1 # T1+=h
xor $a,$a1
add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
and $e,$a2 # (f^g)&e
mov $b,$h
ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
xor $e,$a0
xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
xor $c,$h # b^c
xor $a,$a1
add $a2,$T1 # T1+=Ch(e,f,g)
mov $b,$a2
ror \$$Sigma1[0],$a0 # Sigma1(e)
and $a,$h # h=(b^c)&a
and $c,$a2 # b&c
ror \$$Sigma0[0],$a1 # Sigma0(a)
add $a0,$T1 # T1+=Sigma1(e)
add	$a2,$h			# h+=b&c (completes +=Maj(a,b,c))
add $T1,$d # d+=T1
add $T1,$h # h+=T1
lea 1($round),$round # round++
add $a1,$h # h+=Sigma0(a)
___
}
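# (The boolean functions above are computed in reduced form:
# Ch(e,f,g) = (e&f)^(~e&g) = ((f^g)&e)^g saves a temporary, and
# Maj(a,b,c) = (a&b)^(a&c)^(b&c) = ((b^c)&a)^(b&c); the two terms of
# the latter never have a bit in common [one needs b!=c, the other
# b=c=1], so the final "add $a2,$h" is equivalent to the xor.)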
sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
mov `$SZ*(($i+14)&0xf)`(%rsp),$a1
mov $a0,$T1
mov $a1,$a2
ror \$`$sigma0[1]-$sigma0[0]`,$T1
xor $a0,$T1
shr \$$sigma0[2],$a0
ror \$$sigma0[0],$T1
xor $T1,$a0 # sigma0(X[(i+1)&0xf])
mov `$SZ*(($i+9)&0xf)`(%rsp),$T1
ror \$`$sigma1[1]-$sigma1[0]`,$a2
xor $a1,$a2
shr \$$sigma1[2],$a1
ror \$$sigma1[0],$a2
add $a0,$T1
xor $a2,$a1 # sigma1(X[(i+14)&0xf])
add `$SZ*($i&0xf)`(%rsp),$T1
mov $e,$a0
add $a1,$T1
mov $a,$a1
___
&ROUND_00_15(@_);
}
$code=<<___;
.text
.globl $func
.type $func,\@function,4
.align 16
$func:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
mov %rsp,%r11 # copy %rsp
shl \$4,%rdx # num*16
sub \$$framesz,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
and \$-64,%rsp # align stack frame
mov $ctx,$_ctx # save ctx, 1st arg
mov	$inp,$_inp		# save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
mov %r11,$_rsp # save copy of %rsp
.Lprologue:
lea $TABLE(%rip),$Tbl
mov $SZ*0($ctx),$A
mov $SZ*1($ctx),$B
mov $SZ*2($ctx),$C
mov $SZ*3($ctx),$D
mov $SZ*4($ctx),$E
mov $SZ*5($ctx),$F
mov $SZ*6($ctx),$G
mov $SZ*7($ctx),$H
jmp .Lloop
.align 16
.Lloop:
xor $round,$round
___
for($i=0;$i<16;$i++) {
$code.=" mov $SZ*$i($inp),$T1\n";
$code.=" mov @ROT[4],$a0\n";
$code.=" mov @ROT[0],$a1\n";
$code.=" bswap $T1\n";
&ROUND_00_15($i,@ROT);
unshift(@ROT,pop(@ROT));
}
$code.=<<___;
jmp .Lrounds_16_xx
.align 16
.Lrounds_16_xx:
___
for(;$i<32;$i++) {
&ROUND_16_XX($i,@ROT);
unshift(@ROT,pop(@ROT));
}
$code.=<<___;
cmp \$$rounds,$round
jb .Lrounds_16_xx
mov $_ctx,$ctx
lea 16*$SZ($inp),$inp
add $SZ*0($ctx),$A
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
add $SZ*6($ctx),$G
add $SZ*7($ctx),$H
cmp $_end,$inp
mov $A,$SZ*0($ctx)
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
jb .Lloop
mov $_rsp,%rsi
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
.Lepilogue:
ret
.size $func,.-$func
___
if ($SZ==4) {
$code.=<<___;
.align 64
.type $TABLE,\@object
$TABLE:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___
} else {
$code.=<<___;
.align 64
.type $TABLE,\@object
$TABLE:
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
___
}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
lea .Lprologue(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
lea .Lepilogue(%rip),%r10
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lin_prologue
mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
.section .pdata
.align 4
.rva .LSEH_begin_$func
.rva .LSEH_end_$func
.rva .LSEH_info_$func
.section .xdata
.align 8
.LSEH_info_$func:
.byte 9,0,0,0
.rva se_handler
___
}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;


@@ -0,0 +1,124 @@
/* crypto/sha/sha.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <stdio.h>
#include <stdlib.h>
#include <openssl/sha.h>
#define BUFSIZE 1024*16
void do_fp(FILE *f);
void pt(unsigned char *md);
int read(int, void *, unsigned int);
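/* read(2) is prototyped by hand here, presumably to stay buildable
 * without <unistd.h> on older toolchains. */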
int main(int argc, char **argv)
{
int i,err=0;
FILE *IN;
if (argc == 1)
{
do_fp(stdin);
}
else
{
for (i=1; i<argc; i++)
{
IN=fopen(argv[i],"r");
if (IN == NULL)
{
perror(argv[i]);
err++;
continue;
}
printf("SHA(%s)= ",argv[i]);
do_fp(IN);
fclose(IN);
}
}
exit(err);
}
void do_fp(FILE *f)
{
SHA_CTX c;
unsigned char md[SHA_DIGEST_LENGTH];
int fd;
int i;
unsigned char buf[BUFSIZE];
fd=fileno(f);
SHA_Init(&c);
for (;;)
{
i=read(fd,buf,BUFSIZE);
if (i <= 0) break;
SHA_Update(&c,buf,(unsigned long)i);
}
SHA_Final(&(md[0]),&c);
pt(md);
}
void pt(unsigned char *md)
{
int i;
for (i=0; i<SHA_DIGEST_LENGTH; i++)
printf("%02x",md[i]);
printf("\n");
}


@@ -0,0 +1,214 @@
/* crypto/sha/sha.h */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publicly available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#ifndef HEADER_SHA_H
#define HEADER_SHA_H
#include <openssl/e_os2.h>
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
#if defined(OPENSSL_NO_SHA) || (defined(OPENSSL_NO_SHA0) && defined(OPENSSL_NO_SHA1))
#error SHA is disabled.
#endif
#if defined(OPENSSL_FIPS)
#define FIPS_SHA_SIZE_T size_t
#endif
/*
* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
* ! SHA_LONG has to be at least 32 bits wide. If it's wider, then !
* ! SHA_LONG_LOG2 has to be defined along with it. !
* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
*/
#if defined(__LP32__)
#define SHA_LONG unsigned long
#elif defined(OPENSSL_SYS_CRAY) || defined(__ILP64__)
#define SHA_LONG unsigned long
#define SHA_LONG_LOG2 3
#else
#define SHA_LONG unsigned int
#endif
#define SHA_LBLOCK 16
#define SHA_CBLOCK (SHA_LBLOCK*4) /* SHA treats input data as a
* contiguous array of 32 bit
* wide big-endian values. */
#define SHA_LAST_BLOCK (SHA_CBLOCK-8)
#define SHA_DIGEST_LENGTH 20
typedef struct SHAstate_st
{
SHA_LONG h0,h1,h2,h3,h4;
SHA_LONG Nl,Nh;
SHA_LONG data[SHA_LBLOCK];
unsigned int num;
} SHA_CTX;
#ifndef OPENSSL_NO_SHA0
#ifdef OPENSSL_FIPS
int private_SHA_Init(SHA_CTX *c);
#endif
int SHA_Init(SHA_CTX *c);
int SHA_Update(SHA_CTX *c, const void *data, size_t len);
int SHA_Final(unsigned char *md, SHA_CTX *c);
unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md);
void SHA_Transform(SHA_CTX *c, const unsigned char *data);
#endif
#ifndef OPENSSL_NO_SHA1
#ifdef OPENSSL_FIPS
int private_SHA1_Init(SHA_CTX *c);
#endif
int SHA1_Init(SHA_CTX *c);
int SHA1_Update(SHA_CTX *c, const void *data, size_t len);
int SHA1_Final(unsigned char *md, SHA_CTX *c);
unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md);
void SHA1_Transform(SHA_CTX *c, const unsigned char *data);
#endif
#define SHA256_CBLOCK (SHA_LBLOCK*4) /* SHA-256 treats input data as a
* contiguous array of 32 bit
* wide big-endian values. */
#define SHA224_DIGEST_LENGTH 28
#define SHA256_DIGEST_LENGTH 32
typedef struct SHA256state_st
{
SHA_LONG h[8];
SHA_LONG Nl,Nh;
SHA_LONG data[SHA_LBLOCK];
unsigned int num,md_len;
} SHA256_CTX;
#ifndef OPENSSL_NO_SHA256
#ifdef OPENSSL_FIPS
int private_SHA224_Init(SHA256_CTX *c);
int private_SHA256_Init(SHA256_CTX *c);
#endif
int SHA224_Init(SHA256_CTX *c);
int SHA224_Update(SHA256_CTX *c, const void *data, size_t len);
int SHA224_Final(unsigned char *md, SHA256_CTX *c);
unsigned char *SHA224(const unsigned char *d, size_t n,unsigned char *md);
int SHA256_Init(SHA256_CTX *c);
int SHA256_Update(SHA256_CTX *c, const void *data, size_t len);
int SHA256_Final(unsigned char *md, SHA256_CTX *c);
unsigned char *SHA256(const unsigned char *d, size_t n,unsigned char *md);
void SHA256_Transform(SHA256_CTX *c, const unsigned char *data);
#endif
#define SHA384_DIGEST_LENGTH 48
#define SHA512_DIGEST_LENGTH 64
#ifndef OPENSSL_NO_SHA512
/*
* Unlike 32-bit digest algorithms, SHA-512 *relies* on SHA_LONG64
* being exactly 64-bit wide. See Implementation Notes in sha512.c
* for further details.
*/
#define SHA512_CBLOCK (SHA_LBLOCK*8) /* SHA-512 treats input data as a
* contiguous array of 64 bit
* wide big-endian values. */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
#define SHA_LONG64 unsigned __int64
#define U64(C) C##UI64
#elif defined(__arch64__)
#define SHA_LONG64 unsigned long
#define U64(C) C##UL
#else
#define SHA_LONG64 unsigned long long
#define U64(C) C##ULL
#endif
typedef struct SHA512state_st
{
SHA_LONG64 h[8];
SHA_LONG64 Nl,Nh;
union {
SHA_LONG64 d[SHA_LBLOCK];
unsigned char p[SHA512_CBLOCK];
} u;
unsigned int num,md_len;
} SHA512_CTX;
#endif
#ifndef OPENSSL_NO_SHA512
#ifdef OPENSSL_FIPS
int private_SHA384_Init(SHA512_CTX *c);
int private_SHA512_Init(SHA512_CTX *c);
#endif
int SHA384_Init(SHA512_CTX *c);
int SHA384_Update(SHA512_CTX *c, const void *data, size_t len);
int SHA384_Final(unsigned char *md, SHA512_CTX *c);
unsigned char *SHA384(const unsigned char *d, size_t n,unsigned char *md);
int SHA512_Init(SHA512_CTX *c);
int SHA512_Update(SHA512_CTX *c, const void *data, size_t len);
int SHA512_Final(unsigned char *md, SHA512_CTX *c);
unsigned char *SHA512(const unsigned char *d, size_t n,unsigned char *md);
void SHA512_Transform(SHA512_CTX *c, const unsigned char *data);
#endif
#ifdef __cplusplus
}
#endif
#endif
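
The header above exposes each digest in two flavors: a one-shot convenience function (SHA1(), SHA256(), ...) and the incremental Init/Update/Final triple driving the corresponding *_CTX. A minimal sketch of a caller, assuming nothing beyond the declarations above (the message and output handling are illustrative):

#include <stdio.h>
#include <openssl/sha.h>

int main(void)
{
	static const unsigned char msg[] = "abc";
	unsigned char md[SHA256_DIGEST_LENGTH];
	SHA256_CTX c;
	int i;

	SHA256(msg, 3, md);            /* one-shot */

	SHA256_Init(&c);               /* incremental; produces the same digest */
	SHA256_Update(&c, msg, 3);
	SHA256_Final(md, &c);

	for (i = 0; i < SHA256_DIGEST_LENGTH; i++)
		printf("%02x", md[i]); /* ba7816bf... for "abc" */
	printf("\n");
	return 0;
}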

View File

@@ -0,0 +1,127 @@
/* crypto/sha/sha1.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscape's SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are adhered to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the routines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publicly available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <stdio.h>
#include <stdlib.h>
#include <openssl/sha.h>
#define BUFSIZE (1024*16)
void do_fp(FILE *f);
void pt(unsigned char *md);
#ifndef _OSD_POSIX
int read(int, void *, unsigned int);
#endif
int main(int argc, char **argv)
{
int i,err=0;
FILE *IN;
if (argc == 1)
{
do_fp(stdin);
}
else
{
for (i=1; i<argc; i++)
{
IN=fopen(argv[i],"r");
if (IN == NULL)
{
perror(argv[i]);
err++;
continue;
}
printf("SHA1(%s)= ",argv[i]);
do_fp(IN);
fclose(IN);
}
}
exit(err);
}
void do_fp(FILE *f)
{
SHA_CTX c;
unsigned char md[SHA_DIGEST_LENGTH];
int fd;
int i;
unsigned char buf[BUFSIZE];
fd=fileno(f);
SHA1_Init(&c);
for (;;)
{
i=read(fd,buf,BUFSIZE);
if (i <= 0) break;
SHA1_Update(&c,buf,(unsigned long)i);
}
SHA1_Final(&(md[0]),&c);
pt(md);
}
void pt(unsigned char *md)
{
int i;
for (i=0; i<SHA_DIGEST_LENGTH; i++)
printf("%02x",md[i]);
printf("\n");
}

View File

@@ -0,0 +1,78 @@
/* crypto/sha/sha1_one.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscape's SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are adhered to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the routines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publicly available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <stdio.h>
#include <string.h>
#include <openssl/crypto.h>
#include <openssl/sha.h>
#ifndef OPENSSL_NO_SHA1
unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md)
{
SHA_CTX c;
static unsigned char m[SHA_DIGEST_LENGTH];
if (md == NULL) md=m;
if (!SHA1_Init(&c))
return NULL;
SHA1_Update(&c,d,n);
SHA1_Final(md,&c);
OPENSSL_cleanse(&c,sizeof(c));
return(md);
}
#endif
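
Note that when md is NULL, SHA1() above falls back to a function-local static buffer, so that convenience mode is not re-entrant: a second call overwrites the first result, and concurrent callers race on m. A sketch of the safer pattern, passing a caller-owned buffer (the function name is illustrative):

#include <openssl/sha.h>

void digest_safely(const unsigned char *data, size_t len)
{
	unsigned char md[SHA_DIGEST_LENGTH];   /* caller-owned, no shared state */

	SHA1(data, len, md);                   /* result is private to this call */
	/* ... use md ... */
}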

View File

@@ -0,0 +1,75 @@
/* crypto/sha/sha1dgst.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscape's SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are adhered to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the routines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publicly available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <openssl/crypto.h>
#include <openssl/opensslconf.h>
#if !defined(OPENSSL_NO_SHA1) && !defined(OPENSSL_NO_SHA)
#undef SHA_0
#define SHA_1
#include <openssl/opensslv.h>
const char SHA1_version[]="SHA1" OPENSSL_VERSION_PTEXT;
/* The implementation is in ../md32_common.h */
#include "sha_locl.h"
#endif

View File

@@ -0,0 +1,178 @@
/* crypto/sha/sha1test.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscape's SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are adhered to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the routines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publicly available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "../e_os.h"
#ifdef OPENSSL_NO_SHA
int main(int argc, char *argv[])
{
printf("No SHA support\n");
return(0);
}
#else
#include <openssl/evp.h>
#include <openssl/sha.h>
#ifdef CHARSET_EBCDIC
#include <openssl/ebcdic.h>
#endif
#undef SHA_0 /* FIPS 180 */
#define SHA_1 /* FIPS 180-1 */
static char *test[]={
"abc",
"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
NULL,
};
#ifdef SHA_0
static char *ret[]={
"0164b8a914cd2a5e74c4f7ff082c4d97f1edf880",
"d2516ee1acfa5baf33dfc1c471e438449ef134c8",
};
static char *bigret=
"3232affa48628a26653b5aaa44541fd90d690603";
#endif
#ifdef SHA_1
static char *ret[]={
"a9993e364706816aba3e25717850c26c9cd0d89d",
"84983e441c3bd26ebaae4aa1f95129e5e54670f1",
};
static char *bigret=
"34aa973cd4c4daa4f61eeb2bdbad27316534016f";
#endif
static char *pt(unsigned char *md);
int main(int argc, char *argv[])
{
int i,err=0;
char **P,**R;
static unsigned char buf[1000];
char *p,*r;
EVP_MD_CTX c;
unsigned char md[SHA_DIGEST_LENGTH];
#ifdef CHARSET_EBCDIC
ebcdic2ascii(test[0], test[0], strlen(test[0]));
ebcdic2ascii(test[1], test[1], strlen(test[1]));
#endif
EVP_MD_CTX_init(&c);
P=test;
R=ret;
i=1;
while (*P != NULL)
{
EVP_Digest(*P,strlen((char *)*P),md,NULL,EVP_sha1(), NULL);
p=pt(md);
if (strcmp(p,(char *)*R) != 0)
{
printf("error calculating SHA1 on '%s'\n",*P);
printf("got %s instead of %s\n",p,*R);
err++;
}
else
printf("test %d ok\n",i);
i++;
R++;
P++;
}
memset(buf,'a',1000);
#ifdef CHARSET_EBCDIC
ebcdic2ascii(buf, buf, 1000);
#endif /*CHARSET_EBCDIC*/
EVP_DigestInit_ex(&c,EVP_sha1(), NULL);
for (i=0; i<1000; i++)
EVP_DigestUpdate(&c,buf,1000);
EVP_DigestFinal_ex(&c,md,NULL);
p=pt(md);
r=bigret;
if (strcmp(p,r) != 0)
{
printf("error calculating SHA1 on 'a' * 1000\n");
printf("got %s instead of %s\n",p,r);
err++;
}
else
printf("test 3 ok\n");
#ifdef OPENSSL_SYS_NETWARE
if (err) printf("ERROR: %d\n", err);
#endif
EXIT(err);
EVP_MD_CTX_cleanup(&c);
return(0);
}
static char *pt(unsigned char *md)
{
int i;
static char buf[80];
for (i=0; i<SHA_DIGEST_LENGTH; i++)
sprintf(&(buf[i*2]),"%02x",md[i]);
return(buf);
}
#endif
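
The expected strings in ret[] above are the FIPS 180-1 example digests, so the same vectors can be cross-checked against the low-level SHA1() entry point without going through EVP. A sketch under that assumption:

#include <stdio.h>
#include <string.h>
#include <openssl/sha.h>

int main(void)
{
	unsigned char md[SHA_DIGEST_LENGTH];
	char hex[2*SHA_DIGEST_LENGTH+1];
	int i;

	SHA1((const unsigned char *)"abc", 3, md);
	for (i = 0; i < SHA_DIGEST_LENGTH; i++)
		sprintf(&hex[i*2], "%02x", md[i]);
	/* should match ret[0] above */
	printf("%s\n",
	       strcmp(hex, "a9993e364706816aba3e25717850c26c9cd0d89d") ? "mismatch" : "ok");
	return 0;
}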

View File

@@ -0,0 +1,282 @@
/* crypto/sha/sha256.c */
/* ====================================================================
* Copyright (c) 2004 The OpenSSL Project. All rights reserved
* according to the OpenSSL license [found in ../../LICENSE].
* ====================================================================
*/
#include <openssl/opensslconf.h>
#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA256)
#include <stdlib.h>
#include <string.h>
#include <openssl/crypto.h>
#include <openssl/sha.h>
#include <openssl/opensslv.h>
const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT;
fips_md_init_ctx(SHA224, SHA256)
{
memset (c,0,sizeof(*c));
c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL;
c->h[2]=0x3070dd17UL; c->h[3]=0xf70e5939UL;
c->h[4]=0xffc00b31UL; c->h[5]=0x68581511UL;
c->h[6]=0x64f98fa7UL; c->h[7]=0xbefa4fa4UL;
c->md_len=SHA224_DIGEST_LENGTH;
return 1;
}
fips_md_init(SHA256)
{
memset (c,0,sizeof(*c));
c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL;
c->h[2]=0x3c6ef372UL; c->h[3]=0xa54ff53aUL;
c->h[4]=0x510e527fUL; c->h[5]=0x9b05688cUL;
c->h[6]=0x1f83d9abUL; c->h[7]=0x5be0cd19UL;
c->md_len=SHA256_DIGEST_LENGTH;
return 1;
}
unsigned char *SHA224(const unsigned char *d, size_t n, unsigned char *md)
{
SHA256_CTX c;
static unsigned char m[SHA224_DIGEST_LENGTH];
if (md == NULL) md=m;
SHA224_Init(&c);
SHA256_Update(&c,d,n);
SHA256_Final(md,&c);
OPENSSL_cleanse(&c,sizeof(c));
return(md);
}
unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md)
{
SHA256_CTX c;
static unsigned char m[SHA256_DIGEST_LENGTH];
if (md == NULL) md=m;
SHA256_Init(&c);
SHA256_Update(&c,d,n);
SHA256_Final(md,&c);
OPENSSL_cleanse(&c,sizeof(c));
return(md);
}
int SHA224_Update(SHA256_CTX *c, const void *data, size_t len)
{ return SHA256_Update (c,data,len); }
int SHA224_Final (unsigned char *md, SHA256_CTX *c)
{ return SHA256_Final (md,c); }
#define DATA_ORDER_IS_BIG_ENDIAN
#define HASH_LONG SHA_LONG
#define HASH_CTX SHA256_CTX
#define HASH_CBLOCK SHA_CBLOCK
/*
* Note that FIPS 180-2 discusses "Truncation of the Hash Function Output."
* The default: case below covers for it. It's not clear, however, whether
* it's permitted to truncate to a number of bytes not divisible by 4; I bet
* not, but if it is, the default: case shall be extended. For reference:
* the idea behind the separate cases for pre-defined lengths is to let the
* compiler decide if it's appropriate to unroll small loops.
*/
#define HASH_MAKE_STRING(c,s) do { \
unsigned long ll; \
unsigned int nn; \
switch ((c)->md_len) \
{ case SHA224_DIGEST_LENGTH: \
for (nn=0;nn<SHA224_DIGEST_LENGTH/4;nn++) \
{ ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \
break; \
case SHA256_DIGEST_LENGTH: \
for (nn=0;nn<SHA256_DIGEST_LENGTH/4;nn++) \
{ ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \
break; \
default: \
if ((c)->md_len > SHA256_DIGEST_LENGTH) \
return 0; \
for (nn=0;nn<(c)->md_len/4;nn++) \
{ ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \
break; \
} \
} while (0)
#define HASH_UPDATE SHA256_Update
#define HASH_TRANSFORM SHA256_Transform
#define HASH_FINAL SHA256_Final
#define HASH_BLOCK_DATA_ORDER sha256_block_data_order
#ifndef SHA256_ASM
static
#endif
void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num);
#include "md32_common.h"
#ifndef SHA256_ASM
static const SHA_LONG K256[64] = {
0x428a2f98UL,0x71374491UL,0xb5c0fbcfUL,0xe9b5dba5UL,
0x3956c25bUL,0x59f111f1UL,0x923f82a4UL,0xab1c5ed5UL,
0xd807aa98UL,0x12835b01UL,0x243185beUL,0x550c7dc3UL,
0x72be5d74UL,0x80deb1feUL,0x9bdc06a7UL,0xc19bf174UL,
0xe49b69c1UL,0xefbe4786UL,0x0fc19dc6UL,0x240ca1ccUL,
0x2de92c6fUL,0x4a7484aaUL,0x5cb0a9dcUL,0x76f988daUL,
0x983e5152UL,0xa831c66dUL,0xb00327c8UL,0xbf597fc7UL,
0xc6e00bf3UL,0xd5a79147UL,0x06ca6351UL,0x14292967UL,
0x27b70a85UL,0x2e1b2138UL,0x4d2c6dfcUL,0x53380d13UL,
0x650a7354UL,0x766a0abbUL,0x81c2c92eUL,0x92722c85UL,
0xa2bfe8a1UL,0xa81a664bUL,0xc24b8b70UL,0xc76c51a3UL,
0xd192e819UL,0xd6990624UL,0xf40e3585UL,0x106aa070UL,
0x19a4c116UL,0x1e376c08UL,0x2748774cUL,0x34b0bcb5UL,
0x391c0cb3UL,0x4ed8aa4aUL,0x5b9cca4fUL,0x682e6ff3UL,
0x748f82eeUL,0x78a5636fUL,0x84c87814UL,0x8cc70208UL,
0x90befffaUL,0xa4506cebUL,0xbef9a3f7UL,0xc67178f2UL };
/*
* The FIPS specification refers to right rotations, while our ROTATE macro
* is a left one. This is why the rotation coefficients below differ from
* those in the FIPS document by 32-N...
*/
#define Sigma0(x) (ROTATE((x),30) ^ ROTATE((x),19) ^ ROTATE((x),10))
#define Sigma1(x) (ROTATE((x),26) ^ ROTATE((x),21) ^ ROTATE((x),7))
#define sigma0(x) (ROTATE((x),25) ^ ROTATE((x),14) ^ ((x)>>3))
#define sigma1(x) (ROTATE((x),15) ^ ROTATE((x),13) ^ ((x)>>10))
#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#ifdef OPENSSL_SMALL_FOOTPRINT
static void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num)
{
unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1,T2;
SHA_LONG X[16],l;
int i;
const unsigned char *data=in;
while (num--) {
a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
for (i=0;i<16;i++)
{
HOST_c2l(data,l); T1 = X[i] = l;
T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
T2 = Sigma0(a) + Maj(a,b,c);
h = g; g = f; f = e; e = d + T1;
d = c; c = b; b = a; a = T1 + T2;
}
for (;i<64;i++)
{
s0 = X[(i+1)&0x0f]; s0 = sigma0(s0);
s1 = X[(i+14)&0x0f]; s1 = sigma1(s1);
T1 = X[i&0xf] += s0 + s1 + X[(i+9)&0xf];
T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
T2 = Sigma0(a) + Maj(a,b,c);
h = g; g = f; f = e; e = d + T1;
d = c; c = b; b = a; a = T1 + T2;
}
ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
}
}
#else
#define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i]; \
h = Sigma0(a) + Maj(a,b,c); \
d += T1; h += T1; } while (0)
#define ROUND_16_63(i,a,b,c,d,e,f,g,h,X) do { \
s0 = X[(i+1)&0x0f]; s0 = sigma0(s0); \
s1 = X[(i+14)&0x0f]; s1 = sigma1(s1); \
T1 = X[(i)&0x0f] += s0 + s1 + X[(i+9)&0x0f]; \
ROUND_00_15(i,a,b,c,d,e,f,g,h); } while (0)
static void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num)
{
unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1;
SHA_LONG X[16];
int i;
const unsigned char *data=in;
const union { long one; char little; } is_endian = {1};
while (num--) {
a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
if (!is_endian.little && sizeof(SHA_LONG)==4 && ((size_t)in%4)==0)
{
const SHA_LONG *W=(const SHA_LONG *)data;
T1 = X[0] = W[0]; ROUND_00_15(0,a,b,c,d,e,f,g,h);
T1 = X[1] = W[1]; ROUND_00_15(1,h,a,b,c,d,e,f,g);
T1 = X[2] = W[2]; ROUND_00_15(2,g,h,a,b,c,d,e,f);
T1 = X[3] = W[3]; ROUND_00_15(3,f,g,h,a,b,c,d,e);
T1 = X[4] = W[4]; ROUND_00_15(4,e,f,g,h,a,b,c,d);
T1 = X[5] = W[5]; ROUND_00_15(5,d,e,f,g,h,a,b,c);
T1 = X[6] = W[6]; ROUND_00_15(6,c,d,e,f,g,h,a,b);
T1 = X[7] = W[7]; ROUND_00_15(7,b,c,d,e,f,g,h,a);
T1 = X[8] = W[8]; ROUND_00_15(8,a,b,c,d,e,f,g,h);
T1 = X[9] = W[9]; ROUND_00_15(9,h,a,b,c,d,e,f,g);
T1 = X[10] = W[10]; ROUND_00_15(10,g,h,a,b,c,d,e,f);
T1 = X[11] = W[11]; ROUND_00_15(11,f,g,h,a,b,c,d,e);
T1 = X[12] = W[12]; ROUND_00_15(12,e,f,g,h,a,b,c,d);
T1 = X[13] = W[13]; ROUND_00_15(13,d,e,f,g,h,a,b,c);
T1 = X[14] = W[14]; ROUND_00_15(14,c,d,e,f,g,h,a,b);
T1 = X[15] = W[15]; ROUND_00_15(15,b,c,d,e,f,g,h,a);
data += SHA256_CBLOCK;
}
else
{
SHA_LONG l;
HOST_c2l(data,l); T1 = X[0] = l; ROUND_00_15(0,a,b,c,d,e,f,g,h);
HOST_c2l(data,l); T1 = X[1] = l; ROUND_00_15(1,h,a,b,c,d,e,f,g);
HOST_c2l(data,l); T1 = X[2] = l; ROUND_00_15(2,g,h,a,b,c,d,e,f);
HOST_c2l(data,l); T1 = X[3] = l; ROUND_00_15(3,f,g,h,a,b,c,d,e);
HOST_c2l(data,l); T1 = X[4] = l; ROUND_00_15(4,e,f,g,h,a,b,c,d);
HOST_c2l(data,l); T1 = X[5] = l; ROUND_00_15(5,d,e,f,g,h,a,b,c);
HOST_c2l(data,l); T1 = X[6] = l; ROUND_00_15(6,c,d,e,f,g,h,a,b);
HOST_c2l(data,l); T1 = X[7] = l; ROUND_00_15(7,b,c,d,e,f,g,h,a);
HOST_c2l(data,l); T1 = X[8] = l; ROUND_00_15(8,a,b,c,d,e,f,g,h);
HOST_c2l(data,l); T1 = X[9] = l; ROUND_00_15(9,h,a,b,c,d,e,f,g);
HOST_c2l(data,l); T1 = X[10] = l; ROUND_00_15(10,g,h,a,b,c,d,e,f);
HOST_c2l(data,l); T1 = X[11] = l; ROUND_00_15(11,f,g,h,a,b,c,d,e);
HOST_c2l(data,l); T1 = X[12] = l; ROUND_00_15(12,e,f,g,h,a,b,c,d);
HOST_c2l(data,l); T1 = X[13] = l; ROUND_00_15(13,d,e,f,g,h,a,b,c);
HOST_c2l(data,l); T1 = X[14] = l; ROUND_00_15(14,c,d,e,f,g,h,a,b);
HOST_c2l(data,l); T1 = X[15] = l; ROUND_00_15(15,b,c,d,e,f,g,h,a);
}
for (i=16;i<64;i+=8)
{
ROUND_16_63(i+0,a,b,c,d,e,f,g,h,X);
ROUND_16_63(i+1,h,a,b,c,d,e,f,g,X);
ROUND_16_63(i+2,g,h,a,b,c,d,e,f,X);
ROUND_16_63(i+3,f,g,h,a,b,c,d,e,X);
ROUND_16_63(i+4,e,f,g,h,a,b,c,d,X);
ROUND_16_63(i+5,d,e,f,g,h,a,b,c,X);
ROUND_16_63(i+6,c,d,e,f,g,h,a,b,X);
ROUND_16_63(i+7,b,c,d,e,f,g,h,a,X);
}
ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
}
}
#endif
#endif /* SHA256_ASM */
#endif /* OPENSSL_NO_SHA256 */
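
The comment before the Sigma/sigma macros above notes that the rotation coefficients differ from the FIPS 180-2 ones by 32-N because ROTATE is a left rotation. A standalone sketch of that identity, assuming a 32-bit unsigned int (no OpenSSL needed; helper names are illustrative):

#include <stdio.h>

static unsigned int rotl32(unsigned int x, int n) { return (x << n) | (x >> (32 - n)); }
static unsigned int rotr32(unsigned int x, int n) { return (x >> n) | (x << (32 - n)); }

int main(void)
{
	unsigned int x = 0xdeadbeefU;
	int n;

	/* rotl(x, 32-n) == rotr(x, n) for 0 < n < 32; e.g. ROTATE((x),30)
	 * in Sigma0 stands in for the specification's ROTR(x,2). */
	for (n = 1; n < 32; n++)
		if (rotl32(x, 32 - n) != rotr32(x, n))
		{
			printf("mismatch at n=%d\n", n);
			return 1;
		}
	printf("all rotations match\n");
	return 0;
}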

View File

@@ -0,0 +1,147 @@
/* crypto/sha/sha256t.c */
/* ====================================================================
* Copyright (c) 2004 The OpenSSL Project. All rights reserved.
* ====================================================================
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <openssl/sha.h>
#include <openssl/evp.h>
#if defined(OPENSSL_NO_SHA) || defined(OPENSSL_NO_SHA256)
int main(int argc, char *argv[])
{
printf("No SHA256 support\n");
return(0);
}
#else
unsigned char app_b1[SHA256_DIGEST_LENGTH] = {
0xba,0x78,0x16,0xbf,0x8f,0x01,0xcf,0xea,
0x41,0x41,0x40,0xde,0x5d,0xae,0x22,0x23,
0xb0,0x03,0x61,0xa3,0x96,0x17,0x7a,0x9c,
0xb4,0x10,0xff,0x61,0xf2,0x00,0x15,0xad };
unsigned char app_b2[SHA256_DIGEST_LENGTH] = {
0x24,0x8d,0x6a,0x61,0xd2,0x06,0x38,0xb8,
0xe5,0xc0,0x26,0x93,0x0c,0x3e,0x60,0x39,
0xa3,0x3c,0xe4,0x59,0x64,0xff,0x21,0x67,
0xf6,0xec,0xed,0xd4,0x19,0xdb,0x06,0xc1 };
unsigned char app_b3[SHA256_DIGEST_LENGTH] = {
0xcd,0xc7,0x6e,0x5c,0x99,0x14,0xfb,0x92,
0x81,0xa1,0xc7,0xe2,0x84,0xd7,0x3e,0x67,
0xf1,0x80,0x9a,0x48,0xa4,0x97,0x20,0x0e,
0x04,0x6d,0x39,0xcc,0xc7,0x11,0x2c,0xd0 };
unsigned char addenum_1[SHA224_DIGEST_LENGTH] = {
0x23,0x09,0x7d,0x22,0x34,0x05,0xd8,0x22,
0x86,0x42,0xa4,0x77,0xbd,0xa2,0x55,0xb3,
0x2a,0xad,0xbc,0xe4,0xbd,0xa0,0xb3,0xf7,
0xe3,0x6c,0x9d,0xa7 };
unsigned char addenum_2[SHA224_DIGEST_LENGTH] = {
0x75,0x38,0x8b,0x16,0x51,0x27,0x76,0xcc,
0x5d,0xba,0x5d,0xa1,0xfd,0x89,0x01,0x50,
0xb0,0xc6,0x45,0x5c,0xb4,0xf5,0x8b,0x19,
0x52,0x52,0x25,0x25 };
unsigned char addenum_3[SHA224_DIGEST_LENGTH] = {
0x20,0x79,0x46,0x55,0x98,0x0c,0x91,0xd8,
0xbb,0xb4,0xc1,0xea,0x97,0x61,0x8a,0x4b,
0xf0,0x3f,0x42,0x58,0x19,0x48,0xb2,0xee,
0x4e,0xe7,0xad,0x67 };
int main (int argc,char **argv)
{ unsigned char md[SHA256_DIGEST_LENGTH];
int i;
EVP_MD_CTX evp;
fprintf(stdout,"Testing SHA-256 ");
EVP_Digest ("abc",3,md,NULL,EVP_sha256(),NULL);
if (memcmp(md,app_b1,sizeof(app_b1)))
{ fflush(stdout);
fprintf(stderr,"\nTEST 1 of 3 failed.\n");
return 1;
}
else
fprintf(stdout,"."); fflush(stdout);
EVP_Digest ("abcdbcde""cdefdefg""efghfghi""ghijhijk"
"ijkljklm""klmnlmno""mnopnopq",56,md,NULL,EVP_sha256(),NULL);
if (memcmp(md,app_b2,sizeof(app_b2)))
{ fflush(stdout);
fprintf(stderr,"\nTEST 2 of 3 failed.\n");
return 1;
}
else
fprintf(stdout,"."); fflush(stdout);
EVP_MD_CTX_init (&evp);
EVP_DigestInit_ex (&evp,EVP_sha256(),NULL);
for (i=0;i<1000000;i+=160)
EVP_DigestUpdate (&evp, "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
"aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
"aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
"aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
"aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa",
(1000000-i)<160?1000000-i:160);
EVP_DigestFinal_ex (&evp,md,NULL);
EVP_MD_CTX_cleanup (&evp);
if (memcmp(md,app_b3,sizeof(app_b3)))
{ fflush(stdout);
fprintf(stderr,"\nTEST 3 of 3 failed.\n");
return 1;
}
else
fprintf(stdout,"."); fflush(stdout);
fprintf(stdout," passed.\n"); fflush(stdout);
fprintf(stdout,"Testing SHA-224 ");
EVP_Digest ("abc",3,md,NULL,EVP_sha224(),NULL);
if (memcmp(md,addenum_1,sizeof(addenum_1)))
{ fflush(stdout);
fprintf(stderr,"\nTEST 1 of 3 failed.\n");
return 1;
}
else
fprintf(stdout,"."); fflush(stdout);
EVP_Digest ("abcdbcde""cdefdefg""efghfghi""ghijhijk"
"ijkljklm""klmnlmno""mnopnopq",56,md,NULL,EVP_sha224(),NULL);
if (memcmp(md,addenum_2,sizeof(addenum_2)))
{ fflush(stdout);
fprintf(stderr,"\nTEST 2 of 3 failed.\n");
return 1;
}
else
fprintf(stdout,"."); fflush(stdout);
EVP_MD_CTX_init (&evp);
EVP_DigestInit_ex (&evp,EVP_sha224(),NULL);
for (i=0;i<1000000;i+=64)
EVP_DigestUpdate (&evp, "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
"aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa",
(1000000-i)<64?1000000-i:64);
EVP_DigestFinal_ex (&evp,md,NULL);
EVP_MD_CTX_cleanup (&evp);
if (memcmp(md,addenum_3,sizeof(addenum_3)))
{ fflush(stdout);
fprintf(stderr,"\nTEST 3 of 3 failed.\n");
return 1;
}
else
fprintf(stdout,"."); fflush(stdout);
fprintf(stdout," passed.\n"); fflush(stdout);
return 0;
}
#endif
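
The million-'a' tests above feed the digest in fixed-size chunks, clamping the final chunk with the (1000000-i)<chunk?...:chunk expression. The same min() idiom isolated into a helper, as a sketch (buffer size and names are illustrative; assumes chunk <= sizeof(buf)):

#include <string.h>
#include <openssl/evp.h>

/* Hash `total` bytes of 'a' in chunks of at most `chunk` bytes,
 * mirroring the update loops in the tests above. */
static void update_repeated_a(EVP_MD_CTX *ctx, size_t total, size_t chunk)
{
	char buf[160];
	size_t i, n;

	memset(buf, 'a', sizeof(buf));
	for (i = 0; i < total; i += n)
	{
		n = (total - i) < chunk ? (total - i) : chunk;  /* clamp last chunk */
		EVP_DigestUpdate(ctx, buf, n);
	}
}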

View File

@@ -0,0 +1,604 @@
/* crypto/sha/sha512.c */
/* ====================================================================
* Copyright (c) 2004 The OpenSSL Project. All rights reserved
* according to the OpenSSL license [found in ../../LICENSE].
* ====================================================================
*/
#include <openssl/opensslconf.h>
#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA512)
/*
* IMPLEMENTATION NOTES.
*
* As you might have noticed, the 32-bit hash algorithms:
*
* - permit SHA_LONG to be wider than 32-bit (case on CRAY);
* - optimized versions implement two transform functions: one operating
* on [aligned] data in host byte order and one - on data in input
* stream byte order;
* - share common byte-order neutral collector and padding function
* implementations, ../md32_common.h;
*
* None of the above applies to this SHA-512 implementation. Reasons
* [in reverse order] are:
*
* - it's the only 64-bit hash algorithm at the moment of this writing,
* there is no need for common collector/padding implementation [yet];
* - by supporting only one transform function [which operates on
* *aligned* data in input stream byte order, big-endian in this case]
* we minimize burden of maintenance in two ways: a) collector/padding
* function is simpler; b) only one transform function to stare at;
* - SHA_LONG64 is required to be exactly 64-bit in order to be able to
* apply a number of optimizations to mitigate potential performance
* penalties caused by previous design decision;
*
* Caveat lector.
*
* Implementation relies on the fact that "long long" is 64-bit on
* both 32- and 64-bit platforms. If some compiler vendor comes up
* with 128-bit long long, adjustment to sha.h would be required.
* As this implementation relies on a 64-bit integer type, it's totally
* inappropriate for platforms which don't support it, most notably
* 16-bit platforms.
* <appro@fy.chalmers.se>
*/
#include <stdlib.h>
#include <string.h>
#include <openssl/crypto.h>
#include <openssl/sha.h>
#include <openssl/opensslv.h>
#include "cryptlib.h"
const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT;
#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64) || \
defined(__s390__) || defined(__s390x__) || \
defined(SHA512_ASM)
#define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
#endif
fips_md_init_ctx(SHA384, SHA512)
{
c->h[0]=U64(0xcbbb9d5dc1059ed8);
c->h[1]=U64(0x629a292a367cd507);
c->h[2]=U64(0x9159015a3070dd17);
c->h[3]=U64(0x152fecd8f70e5939);
c->h[4]=U64(0x67332667ffc00b31);
c->h[5]=U64(0x8eb44a8768581511);
c->h[6]=U64(0xdb0c2e0d64f98fa7);
c->h[7]=U64(0x47b5481dbefa4fa4);
c->Nl=0; c->Nh=0;
c->num=0; c->md_len=SHA384_DIGEST_LENGTH;
return 1;
}
fips_md_init(SHA512)
{
c->h[0]=U64(0x6a09e667f3bcc908);
c->h[1]=U64(0xbb67ae8584caa73b);
c->h[2]=U64(0x3c6ef372fe94f82b);
c->h[3]=U64(0xa54ff53a5f1d36f1);
c->h[4]=U64(0x510e527fade682d1);
c->h[5]=U64(0x9b05688c2b3e6c1f);
c->h[6]=U64(0x1f83d9abfb41bd6b);
c->h[7]=U64(0x5be0cd19137e2179);
c->Nl=0; c->Nh=0;
c->num=0; c->md_len=SHA512_DIGEST_LENGTH;
return 1;
}
#ifndef SHA512_ASM
static
#endif
void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num);
int SHA512_Final (unsigned char *md, SHA512_CTX *c)
{
unsigned char *p=(unsigned char *)c->u.p;
size_t n=c->num;
p[n]=0x80; /* There is always room for one */
n++;
if (n > (sizeof(c->u)-16))
memset (p+n,0,sizeof(c->u)-n), n=0,
sha512_block_data_order (c,p,1);
memset (p+n,0,sizeof(c->u)-16-n);
#ifdef B_ENDIAN
c->u.d[SHA_LBLOCK-2] = c->Nh;
c->u.d[SHA_LBLOCK-1] = c->Nl;
#else
p[sizeof(c->u)-1] = (unsigned char)(c->Nl);
p[sizeof(c->u)-2] = (unsigned char)(c->Nl>>8);
p[sizeof(c->u)-3] = (unsigned char)(c->Nl>>16);
p[sizeof(c->u)-4] = (unsigned char)(c->Nl>>24);
p[sizeof(c->u)-5] = (unsigned char)(c->Nl>>32);
p[sizeof(c->u)-6] = (unsigned char)(c->Nl>>40);
p[sizeof(c->u)-7] = (unsigned char)(c->Nl>>48);
p[sizeof(c->u)-8] = (unsigned char)(c->Nl>>56);
p[sizeof(c->u)-9] = (unsigned char)(c->Nh);
p[sizeof(c->u)-10] = (unsigned char)(c->Nh>>8);
p[sizeof(c->u)-11] = (unsigned char)(c->Nh>>16);
p[sizeof(c->u)-12] = (unsigned char)(c->Nh>>24);
p[sizeof(c->u)-13] = (unsigned char)(c->Nh>>32);
p[sizeof(c->u)-14] = (unsigned char)(c->Nh>>40);
p[sizeof(c->u)-15] = (unsigned char)(c->Nh>>48);
p[sizeof(c->u)-16] = (unsigned char)(c->Nh>>56);
#endif
sha512_block_data_order (c,p,1);
if (md==0) return 0;
switch (c->md_len)
{
/* Let compiler decide if it's appropriate to unroll... */
case SHA384_DIGEST_LENGTH:
for (n=0;n<SHA384_DIGEST_LENGTH/8;n++)
{
SHA_LONG64 t = c->h[n];
*(md++) = (unsigned char)(t>>56);
*(md++) = (unsigned char)(t>>48);
*(md++) = (unsigned char)(t>>40);
*(md++) = (unsigned char)(t>>32);
*(md++) = (unsigned char)(t>>24);
*(md++) = (unsigned char)(t>>16);
*(md++) = (unsigned char)(t>>8);
*(md++) = (unsigned char)(t);
}
break;
case SHA512_DIGEST_LENGTH:
for (n=0;n<SHA512_DIGEST_LENGTH/8;n++)
{
SHA_LONG64 t = c->h[n];
*(md++) = (unsigned char)(t>>56);
*(md++) = (unsigned char)(t>>48);
*(md++) = (unsigned char)(t>>40);
*(md++) = (unsigned char)(t>>32);
*(md++) = (unsigned char)(t>>24);
*(md++) = (unsigned char)(t>>16);
*(md++) = (unsigned char)(t>>8);
*(md++) = (unsigned char)(t);
}
break;
/* ... as well as make sure md_len is not abused. */
default: return 0;
}
return 1;
}
int SHA384_Final (unsigned char *md,SHA512_CTX *c)
{ return SHA512_Final (md,c); }
int SHA512_Update (SHA512_CTX *c, const void *_data, size_t len)
{
SHA_LONG64 l;
unsigned char *p=c->u.p;
const unsigned char *data=(const unsigned char *)_data;
if (len==0) return 1;
l = (c->Nl+(((SHA_LONG64)len)<<3))&U64(0xffffffffffffffff);
if (l < c->Nl) c->Nh++;
if (sizeof(len)>=8) c->Nh+=(((SHA_LONG64)len)>>61);
c->Nl=l;
if (c->num != 0)
{
size_t n = sizeof(c->u) - c->num;
if (len < n)
{
memcpy (p+c->num,data,len), c->num += (unsigned int)len;
return 1;
}
else {
memcpy (p+c->num,data,n), c->num = 0;
len-=n, data+=n;
sha512_block_data_order (c,p,1);
}
}
if (len >= sizeof(c->u))
{
#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
if ((size_t)data%sizeof(c->u.d[0]) != 0)
while (len >= sizeof(c->u))
memcpy (p,data,sizeof(c->u)),
sha512_block_data_order (c,p,1),
len -= sizeof(c->u),
data += sizeof(c->u);
else
#endif
sha512_block_data_order (c,data,len/sizeof(c->u)),
data += len,
len %= sizeof(c->u),
data -= len;
}
if (len != 0) memcpy (p,data,len), c->num = (int)len;
return 1;
}
int SHA384_Update (SHA512_CTX *c, const void *data, size_t len)
{ return SHA512_Update (c,data,len); }
void SHA512_Transform (SHA512_CTX *c, const unsigned char *data)
{
#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
if ((size_t)data%sizeof(c->u.d[0]) != 0)
memcpy(c->u.p,data,sizeof(c->u.p)),
data = c->u.p;
#endif
sha512_block_data_order (c,data,1);
}
unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md)
{
SHA512_CTX c;
static unsigned char m[SHA384_DIGEST_LENGTH];
if (md == NULL) md=m;
SHA384_Init(&c);
SHA512_Update(&c,d,n);
SHA512_Final(md,&c);
OPENSSL_cleanse(&c,sizeof(c));
return(md);
}
unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md)
{
SHA512_CTX c;
static unsigned char m[SHA512_DIGEST_LENGTH];
if (md == NULL) md=m;
SHA512_Init(&c);
SHA512_Update(&c,d,n);
SHA512_Final(md,&c);
OPENSSL_cleanse(&c,sizeof(c));
return(md);
}
#ifndef SHA512_ASM
static const SHA_LONG64 K512[80] = {
U64(0x428a2f98d728ae22),U64(0x7137449123ef65cd),
U64(0xb5c0fbcfec4d3b2f),U64(0xe9b5dba58189dbbc),
U64(0x3956c25bf348b538),U64(0x59f111f1b605d019),
U64(0x923f82a4af194f9b),U64(0xab1c5ed5da6d8118),
U64(0xd807aa98a3030242),U64(0x12835b0145706fbe),
U64(0x243185be4ee4b28c),U64(0x550c7dc3d5ffb4e2),
U64(0x72be5d74f27b896f),U64(0x80deb1fe3b1696b1),
U64(0x9bdc06a725c71235),U64(0xc19bf174cf692694),
U64(0xe49b69c19ef14ad2),U64(0xefbe4786384f25e3),
U64(0x0fc19dc68b8cd5b5),U64(0x240ca1cc77ac9c65),
U64(0x2de92c6f592b0275),U64(0x4a7484aa6ea6e483),
U64(0x5cb0a9dcbd41fbd4),U64(0x76f988da831153b5),
U64(0x983e5152ee66dfab),U64(0xa831c66d2db43210),
U64(0xb00327c898fb213f),U64(0xbf597fc7beef0ee4),
U64(0xc6e00bf33da88fc2),U64(0xd5a79147930aa725),
U64(0x06ca6351e003826f),U64(0x142929670a0e6e70),
U64(0x27b70a8546d22ffc),U64(0x2e1b21385c26c926),
U64(0x4d2c6dfc5ac42aed),U64(0x53380d139d95b3df),
U64(0x650a73548baf63de),U64(0x766a0abb3c77b2a8),
U64(0x81c2c92e47edaee6),U64(0x92722c851482353b),
U64(0xa2bfe8a14cf10364),U64(0xa81a664bbc423001),
U64(0xc24b8b70d0f89791),U64(0xc76c51a30654be30),
U64(0xd192e819d6ef5218),U64(0xd69906245565a910),
U64(0xf40e35855771202a),U64(0x106aa07032bbd1b8),
U64(0x19a4c116b8d2d0c8),U64(0x1e376c085141ab53),
U64(0x2748774cdf8eeb99),U64(0x34b0bcb5e19b48a8),
U64(0x391c0cb3c5c95a63),U64(0x4ed8aa4ae3418acb),
U64(0x5b9cca4f7763e373),U64(0x682e6ff3d6b2b8a3),
U64(0x748f82ee5defb2fc),U64(0x78a5636f43172f60),
U64(0x84c87814a1f0ab72),U64(0x8cc702081a6439ec),
U64(0x90befffa23631e28),U64(0xa4506cebde82bde9),
U64(0xbef9a3f7b2c67915),U64(0xc67178f2e372532b),
U64(0xca273eceea26619c),U64(0xd186b8c721c0c207),
U64(0xeada7dd6cde0eb1e),U64(0xf57d4f7fee6ed178),
U64(0x06f067aa72176fba),U64(0x0a637dc5a2c898a6),
U64(0x113f9804bef90dae),U64(0x1b710b35131c471b),
U64(0x28db77f523047d84),U64(0x32caab7b40c72493),
U64(0x3c9ebe0a15c9bebc),U64(0x431d67c49c100d4c),
U64(0x4cc5d4becb3e42b6),U64(0x597f299cfc657e2a),
U64(0x5fcb6fab3ad6faec),U64(0x6c44198c4a475817) };
#ifndef PEDANTIC
# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
# if defined(__x86_64) || defined(__x86_64__)
# define ROTR(a,n) ({ SHA_LONG64 ret; \
asm ("rorq %1,%0" \
: "=r"(ret) \
: "J"(n),"0"(a) \
: "cc"); ret; })
# if !defined(B_ENDIAN)
# define PULL64(x) ({ SHA_LONG64 ret=*((const SHA_LONG64 *)(&(x))); \
asm ("bswapq %0" \
: "=r"(ret) \
: "0"(ret)); ret; })
# endif
# elif (defined(__i386) || defined(__i386__)) && !defined(B_ENDIAN)
# if defined(I386_ONLY)
# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
unsigned int hi=p[0],lo=p[1]; \
asm("xchgb %%ah,%%al;xchgb %%dh,%%dl;"\
"roll $16,%%eax; roll $16,%%edx; "\
"xchgb %%ah,%%al;xchgb %%dh,%%dl;" \
: "=a"(lo),"=d"(hi) \
: "0"(lo),"1"(hi) : "cc"); \
((SHA_LONG64)hi)<<32|lo; })
# else
# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
unsigned int hi=p[0],lo=p[1]; \
asm ("bswapl %0; bswapl %1;" \
: "=r"(lo),"=r"(hi) \
: "0"(lo),"1"(hi)); \
((SHA_LONG64)hi)<<32|lo; })
# endif
# elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64)
# define ROTR(a,n) ({ SHA_LONG64 ret; \
asm ("rotrdi %0,%1,%2" \
: "=r"(ret) \
: "r"(a),"K"(n)); ret; })
# endif
# elif defined(_MSC_VER)
# if defined(_WIN64) /* applies to both IA-64 and AMD64 */
# pragma intrinsic(_rotr64)
# define ROTR(a,n) _rotr64((a),n)
# endif
# if defined(_M_IX86) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
# if defined(I386_ONLY)
static SHA_LONG64 __fastcall __pull64be(const void *x)
{ _asm mov edx, [ecx + 0]
_asm mov eax, [ecx + 4]
_asm xchg dh,dl
_asm xchg ah,al
_asm rol edx,16
_asm rol eax,16
_asm xchg dh,dl
_asm xchg ah,al
}
# else
static SHA_LONG64 __fastcall __pull64be(const void *x)
{ _asm mov edx, [ecx + 0]
_asm mov eax, [ecx + 4]
_asm bswap edx
_asm bswap eax
}
# endif
# define PULL64(x) __pull64be(&(x))
# if _MSC_VER<=1200
# pragma inline_depth(0)
# endif
# endif
# endif
#endif
#ifndef PULL64
#define B(x,j) (((SHA_LONG64)(*(((const unsigned char *)(&x))+j)))<<((7-j)*8))
#define PULL64(x) (B(x,0)|B(x,1)|B(x,2)|B(x,3)|B(x,4)|B(x,5)|B(x,6)|B(x,7))
#endif
#ifndef ROTR
#define ROTR(x,s) (((x)>>s) | (x)<<(64-s))
#endif
#define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
#define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
#define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
#define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
/*
* This code should give better results on a 32-bit CPU with fewer than
* ~24 registers, both size- and performance-wise...
*/
static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
{
const SHA_LONG64 *W=in;
SHA_LONG64 A,E,T;
SHA_LONG64 X[9+80],*F;
int i;
while (num--) {
F = X+80;
A = ctx->h[0]; F[1] = ctx->h[1];
F[2] = ctx->h[2]; F[3] = ctx->h[3];
E = ctx->h[4]; F[5] = ctx->h[5];
F[6] = ctx->h[6]; F[7] = ctx->h[7];
for (i=0;i<16;i++,F--)
{
#ifdef B_ENDIAN
T = W[i];
#else
T = PULL64(W[i]);
#endif
F[0] = A;
F[4] = E;
F[8] = T;
T += F[7] + Sigma1(E) + Ch(E,F[5],F[6]) + K512[i];
E = F[3] + T;
A = T + Sigma0(A) + Maj(A,F[1],F[2]);
}
for (;i<80;i++,F--)
{
T = sigma0(F[8+16-1]);
T += sigma1(F[8+16-14]);
T += F[8+16] + F[8+16-9];
F[0] = A;
F[4] = E;
F[8] = T;
T += F[7] + Sigma1(E) + Ch(E,F[5],F[6]) + K512[i];
E = F[3] + T;
A = T + Sigma0(A) + Maj(A,F[1],F[2]);
}
ctx->h[0] += A; ctx->h[1] += F[1];
ctx->h[2] += F[2]; ctx->h[3] += F[3];
ctx->h[4] += E; ctx->h[5] += F[5];
ctx->h[6] += F[6]; ctx->h[7] += F[7];
W+=SHA_LBLOCK;
}
}
#elif defined(OPENSSL_SMALL_FOOTPRINT)
static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
{
const SHA_LONG64 *W=in;
SHA_LONG64 a,b,c,d,e,f,g,h,s0,s1,T1,T2;
SHA_LONG64 X[16];
int i;
while (num--) {
a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
for (i=0;i<16;i++)
{
#ifdef B_ENDIAN
T1 = X[i] = W[i];
#else
T1 = X[i] = PULL64(W[i]);
#endif
T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i];
T2 = Sigma0(a) + Maj(a,b,c);
h = g; g = f; f = e; e = d + T1;
d = c; c = b; b = a; a = T1 + T2;
}
for (;i<80;i++)
{
s0 = X[(i+1)&0x0f]; s0 = sigma0(s0);
s1 = X[(i+14)&0x0f]; s1 = sigma1(s1);
T1 = X[i&0xf] += s0 + s1 + X[(i+9)&0xf];
T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i];
T2 = Sigma0(a) + Maj(a,b,c);
h = g; g = f; f = e; e = d + T1;
d = c; c = b; b = a; a = T1 + T2;
}
ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
W+=SHA_LBLOCK;
}
}
#else
#define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i]; \
h = Sigma0(a) + Maj(a,b,c); \
d += T1; h += T1; } while (0)
#define ROUND_16_80(i,j,a,b,c,d,e,f,g,h,X) do { \
s0 = X[(j+1)&0x0f]; s0 = sigma0(s0); \
s1 = X[(j+14)&0x0f]; s1 = sigma1(s1); \
T1 = X[(j)&0x0f] += s0 + s1 + X[(j+9)&0x0f]; \
ROUND_00_15(i+j,a,b,c,d,e,f,g,h); } while (0)
static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
{
const SHA_LONG64 *W=in;
SHA_LONG64 a,b,c,d,e,f,g,h,s0,s1,T1;
SHA_LONG64 X[16];
int i;
while (num--) {
a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3];
e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7];
#ifdef B_ENDIAN
T1 = X[0] = W[0]; ROUND_00_15(0,a,b,c,d,e,f,g,h);
T1 = X[1] = W[1]; ROUND_00_15(1,h,a,b,c,d,e,f,g);
T1 = X[2] = W[2]; ROUND_00_15(2,g,h,a,b,c,d,e,f);
T1 = X[3] = W[3]; ROUND_00_15(3,f,g,h,a,b,c,d,e);
T1 = X[4] = W[4]; ROUND_00_15(4,e,f,g,h,a,b,c,d);
T1 = X[5] = W[5]; ROUND_00_15(5,d,e,f,g,h,a,b,c);
T1 = X[6] = W[6]; ROUND_00_15(6,c,d,e,f,g,h,a,b);
T1 = X[7] = W[7]; ROUND_00_15(7,b,c,d,e,f,g,h,a);
T1 = X[8] = W[8]; ROUND_00_15(8,a,b,c,d,e,f,g,h);
T1 = X[9] = W[9]; ROUND_00_15(9,h,a,b,c,d,e,f,g);
T1 = X[10] = W[10]; ROUND_00_15(10,g,h,a,b,c,d,e,f);
T1 = X[11] = W[11]; ROUND_00_15(11,f,g,h,a,b,c,d,e);
T1 = X[12] = W[12]; ROUND_00_15(12,e,f,g,h,a,b,c,d);
T1 = X[13] = W[13]; ROUND_00_15(13,d,e,f,g,h,a,b,c);
T1 = X[14] = W[14]; ROUND_00_15(14,c,d,e,f,g,h,a,b);
T1 = X[15] = W[15]; ROUND_00_15(15,b,c,d,e,f,g,h,a);
#else
T1 = X[0] = PULL64(W[0]); ROUND_00_15(0,a,b,c,d,e,f,g,h);
T1 = X[1] = PULL64(W[1]); ROUND_00_15(1,h,a,b,c,d,e,f,g);
T1 = X[2] = PULL64(W[2]); ROUND_00_15(2,g,h,a,b,c,d,e,f);
T1 = X[3] = PULL64(W[3]); ROUND_00_15(3,f,g,h,a,b,c,d,e);
T1 = X[4] = PULL64(W[4]); ROUND_00_15(4,e,f,g,h,a,b,c,d);
T1 = X[5] = PULL64(W[5]); ROUND_00_15(5,d,e,f,g,h,a,b,c);
T1 = X[6] = PULL64(W[6]); ROUND_00_15(6,c,d,e,f,g,h,a,b);
T1 = X[7] = PULL64(W[7]); ROUND_00_15(7,b,c,d,e,f,g,h,a);
T1 = X[8] = PULL64(W[8]); ROUND_00_15(8,a,b,c,d,e,f,g,h);
T1 = X[9] = PULL64(W[9]); ROUND_00_15(9,h,a,b,c,d,e,f,g);
T1 = X[10] = PULL64(W[10]); ROUND_00_15(10,g,h,a,b,c,d,e,f);
T1 = X[11] = PULL64(W[11]); ROUND_00_15(11,f,g,h,a,b,c,d,e);
T1 = X[12] = PULL64(W[12]); ROUND_00_15(12,e,f,g,h,a,b,c,d);
T1 = X[13] = PULL64(W[13]); ROUND_00_15(13,d,e,f,g,h,a,b,c);
T1 = X[14] = PULL64(W[14]); ROUND_00_15(14,c,d,e,f,g,h,a,b);
T1 = X[15] = PULL64(W[15]); ROUND_00_15(15,b,c,d,e,f,g,h,a);
#endif
for (i=16;i<80;i+=16)
{
ROUND_16_80(i, 0,a,b,c,d,e,f,g,h,X);
ROUND_16_80(i, 1,h,a,b,c,d,e,f,g,X);
ROUND_16_80(i, 2,g,h,a,b,c,d,e,f,X);
ROUND_16_80(i, 3,f,g,h,a,b,c,d,e,X);
ROUND_16_80(i, 4,e,f,g,h,a,b,c,d,X);
ROUND_16_80(i, 5,d,e,f,g,h,a,b,c,X);
ROUND_16_80(i, 6,c,d,e,f,g,h,a,b,X);
ROUND_16_80(i, 7,b,c,d,e,f,g,h,a,X);
ROUND_16_80(i, 8,a,b,c,d,e,f,g,h,X);
ROUND_16_80(i, 9,h,a,b,c,d,e,f,g,X);
ROUND_16_80(i,10,g,h,a,b,c,d,e,f,X);
ROUND_16_80(i,11,f,g,h,a,b,c,d,e,X);
ROUND_16_80(i,12,e,f,g,h,a,b,c,d,X);
ROUND_16_80(i,13,d,e,f,g,h,a,b,c,X);
ROUND_16_80(i,14,c,d,e,f,g,h,a,b,X);
ROUND_16_80(i,15,b,c,d,e,f,g,h,a,X);
}
ctx->h[0] += a; ctx->h[1] += b; ctx->h[2] += c; ctx->h[3] += d;
ctx->h[4] += e; ctx->h[5] += f; ctx->h[6] += g; ctx->h[7] += h;
W+=SHA_LBLOCK;
}
}
#endif
#endif /* SHA512_ASM */
#else /* !OPENSSL_NO_SHA512 */
#if defined(PEDANTIC) || defined(__DECC) || defined(OPENSSL_SYS_MACOSX)
static void *dummy=&dummy;
#endif
#endif /* !OPENSSL_NO_SHA512 */
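
When neither the inline-assembler nor the intrinsic paths apply, PULL64 above degrades to the B(x,j) byte-gather. The same big-endian 64-bit load written as a plain function, as a portability sketch (names are illustrative):

#include <stdio.h>

/* Assemble a big-endian 64-bit value byte by byte, independent of
 * host endianness: equivalent to the generic PULL64 fallback. */
static unsigned long long pull64be(const unsigned char *p)
{
	unsigned long long r = 0;
	int j;

	for (j = 0; j < 8; j++)
		r = (r << 8) | p[j];
	return r;
}

int main(void)
{
	static const unsigned char b[8] = {0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef};

	printf("%016llx\n", pull64be(b));   /* prints 0123456789abcdef */
	return 0;
}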

View File

@@ -0,0 +1,184 @@
/* crypto/sha/sha512t.c */
/* ====================================================================
* Copyright (c) 2004 The OpenSSL Project. All rights reserved.
* ====================================================================
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <openssl/sha.h>
#include <openssl/evp.h>
#include <openssl/crypto.h>
#if defined(OPENSSL_NO_SHA) || defined(OPENSSL_NO_SHA512)
int main(int argc, char *argv[])
{
printf("No SHA512 support\n");
return(0);
}
#else
unsigned char app_c1[SHA512_DIGEST_LENGTH] = {
0xdd,0xaf,0x35,0xa1,0x93,0x61,0x7a,0xba,
0xcc,0x41,0x73,0x49,0xae,0x20,0x41,0x31,
0x12,0xe6,0xfa,0x4e,0x89,0xa9,0x7e,0xa2,
0x0a,0x9e,0xee,0xe6,0x4b,0x55,0xd3,0x9a,
0x21,0x92,0x99,0x2a,0x27,0x4f,0xc1,0xa8,
0x36,0xba,0x3c,0x23,0xa3,0xfe,0xeb,0xbd,
0x45,0x4d,0x44,0x23,0x64,0x3c,0xe8,0x0e,
0x2a,0x9a,0xc9,0x4f,0xa5,0x4c,0xa4,0x9f };
unsigned char app_c2[SHA512_DIGEST_LENGTH] = {
0x8e,0x95,0x9b,0x75,0xda,0xe3,0x13,0xda,
0x8c,0xf4,0xf7,0x28,0x14,0xfc,0x14,0x3f,
0x8f,0x77,0x79,0xc6,0xeb,0x9f,0x7f,0xa1,
0x72,0x99,0xae,0xad,0xb6,0x88,0x90,0x18,
0x50,0x1d,0x28,0x9e,0x49,0x00,0xf7,0xe4,
0x33,0x1b,0x99,0xde,0xc4,0xb5,0x43,0x3a,
0xc7,0xd3,0x29,0xee,0xb6,0xdd,0x26,0x54,
0x5e,0x96,0xe5,0x5b,0x87,0x4b,0xe9,0x09 };
unsigned char app_c3[SHA512_DIGEST_LENGTH] = {
0xe7,0x18,0x48,0x3d,0x0c,0xe7,0x69,0x64,
0x4e,0x2e,0x42,0xc7,0xbc,0x15,0xb4,0x63,
0x8e,0x1f,0x98,0xb1,0x3b,0x20,0x44,0x28,
0x56,0x32,0xa8,0x03,0xaf,0xa9,0x73,0xeb,
0xde,0x0f,0xf2,0x44,0x87,0x7e,0xa6,0x0a,
0x4c,0xb0,0x43,0x2c,0xe5,0x77,0xc3,0x1b,
0xeb,0x00,0x9c,0x5c,0x2c,0x49,0xaa,0x2e,
0x4e,0xad,0xb2,0x17,0xad,0x8c,0xc0,0x9b };
unsigned char app_d1[SHA384_DIGEST_LENGTH] = {
0xcb,0x00,0x75,0x3f,0x45,0xa3,0x5e,0x8b,
0xb5,0xa0,0x3d,0x69,0x9a,0xc6,0x50,0x07,
0x27,0x2c,0x32,0xab,0x0e,0xde,0xd1,0x63,
0x1a,0x8b,0x60,0x5a,0x43,0xff,0x5b,0xed,
0x80,0x86,0x07,0x2b,0xa1,0xe7,0xcc,0x23,
0x58,0xba,0xec,0xa1,0x34,0xc8,0x25,0xa7 };
unsigned char app_d2[SHA384_DIGEST_LENGTH] = {
0x09,0x33,0x0c,0x33,0xf7,0x11,0x47,0xe8,
0x3d,0x19,0x2f,0xc7,0x82,0xcd,0x1b,0x47,
0x53,0x11,0x1b,0x17,0x3b,0x3b,0x05,0xd2,
0x2f,0xa0,0x80,0x86,0xe3,0xb0,0xf7,0x12,
0xfc,0xc7,0xc7,0x1a,0x55,0x7e,0x2d,0xb9,
0x66,0xc3,0xe9,0xfa,0x91,0x74,0x60,0x39 };
unsigned char app_d3[SHA384_DIGEST_LENGTH] = {
0x9d,0x0e,0x18,0x09,0x71,0x64,0x74,0xcb,
0x08,0x6e,0x83,0x4e,0x31,0x0a,0x4a,0x1c,
0xed,0x14,0x9e,0x9c,0x00,0xf2,0x48,0x52,
0x79,0x72,0xce,0xc5,0x70,0x4c,0x2a,0x5b,
0x07,0xb8,0xb3,0xdc,0x38,0xec,0xc4,0xeb,
0xae,0x97,0xdd,0xd8,0x7f,0x3d,0x89,0x85 };
int main (int argc,char **argv)
{ unsigned char md[SHA512_DIGEST_LENGTH];
int i;
EVP_MD_CTX evp;
#ifdef OPENSSL_IA32_SSE2
/* An alternative to this is to call OpenSSL_add_all_algorithms...
* The code below is retained exclusively for debugging purposes. */
{ char *env;
if ((env=getenv("OPENSSL_ia32cap")))
OPENSSL_ia32cap = strtoul (env,NULL,0);
}
#endif
fprintf(stdout,"Testing SHA-512 ");
EVP_Digest ("abc",3,md,NULL,EVP_sha512(),NULL);
if (memcmp(md,app_c1,sizeof(app_c1)))
{ fflush(stdout);
fprintf(stderr,"\nTEST 1 of 3 failed.\n");
return 1;
}
else
fprintf(stdout,"."); fflush(stdout);
EVP_Digest ("abcdefgh""bcdefghi""cdefghij""defghijk"
"efghijkl""fghijklm""ghijklmn""hijklmno"
"ijklmnop""jklmnopq""klmnopqr""lmnopqrs"
"mnopqrst""nopqrstu",112,md,NULL,EVP_sha512(),NULL);
if (memcmp(md,app_c2,sizeof(app_c2)))
{ fflush(stdout);
fprintf(stderr,"\nTEST 2 of 3 failed.\n");
return 1;
}
else
fprintf(stdout,"."); fflush(stdout);
EVP_MD_CTX_init (&evp);
EVP_DigestInit_ex (&evp,EVP_sha512(),NULL);
for (i=0;i<1000000;i+=288)
EVP_DigestUpdate (&evp, "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
"aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
"aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
"aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
"aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
"aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
"aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
"aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
"aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa",
(1000000-i)<288?1000000-i:288);
EVP_DigestFinal_ex (&evp,md,NULL);
EVP_MD_CTX_cleanup (&evp);
if (memcmp(md,app_c3,sizeof(app_c3)))
{ fflush(stdout);
fprintf(stderr,"\nTEST 3 of 3 failed.\n");
return 1;
}
else
fprintf(stdout,"."); fflush(stdout);
fprintf(stdout," passed.\n"); fflush(stdout);
fprintf(stdout,"Testing SHA-384 ");
EVP_Digest ("abc",3,md,NULL,EVP_sha384(),NULL);
if (memcmp(md,app_d1,sizeof(app_d1)))
{ fflush(stdout);
fprintf(stderr,"\nTEST 1 of 3 failed.\n");
return 1;
}
else
fprintf(stdout,"."); fflush(stdout);
EVP_Digest ("abcdefgh""bcdefghi""cdefghij""defghijk"
"efghijkl""fghijklm""ghijklmn""hijklmno"
"ijklmnop""jklmnopq""klmnopqr""lmnopqrs"
"mnopqrst""nopqrstu",112,md,NULL,EVP_sha384(),NULL);
if (memcmp(md,app_d2,sizeof(app_d2)))
{ fflush(stdout);
fprintf(stderr,"\nTEST 2 of 3 failed.\n");
return 1;
}
else
fprintf(stdout,"."); fflush(stdout);
EVP_MD_CTX_init (&evp);
EVP_DigestInit_ex (&evp,EVP_sha384(),NULL);
for (i=0;i<1000000;i+=64)
EVP_DigestUpdate (&evp, "aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa"
"aaaaaaaa""aaaaaaaa""aaaaaaaa""aaaaaaaa",
(1000000-i)<64?1000000-i:64);
EVP_DigestFinal_ex (&evp,md,NULL);
EVP_MD_CTX_cleanup (&evp);
if (memcmp(md,app_d3,sizeof(app_d3)))
{ fflush(stdout);
fprintf(stderr,"\nTEST 3 of 3 failed.\n");
return 1;
}
else
fprintf(stdout,"."); fflush(stdout);
fprintf(stdout," passed.\n"); fflush(stdout);
return 0;
}
#endif

View File

@@ -0,0 +1,75 @@
/* crypto/sha/sha1dgst.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscape's SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are adhered to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <openssl/crypto.h>
#include <openssl/opensslconf.h>
#if !defined(OPENSSL_NO_SHA0) && !defined(OPENSSL_NO_SHA)
#undef SHA_1
#define SHA_0
#include <openssl/opensslv.h>
const char SHA_version[]="SHA" OPENSSL_VERSION_PTEXT;
/* The implementation is in ../md32_common.h */
#include "sha_locl.h"
#endif

View File

@@ -0,0 +1,441 @@
/* crypto/sha/sha_locl.h */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <stdlib.h>
#include <string.h>
#include <openssl/opensslconf.h>
#include <openssl/sha.h>
#define DATA_ORDER_IS_BIG_ENDIAN
#define HASH_LONG SHA_LONG
#define HASH_CTX SHA_CTX
#define HASH_CBLOCK SHA_CBLOCK
#define HASH_MAKE_STRING(c,s) do { \
unsigned long ll; \
ll=(c)->h0; (void)HOST_l2c(ll,(s)); \
ll=(c)->h1; (void)HOST_l2c(ll,(s)); \
ll=(c)->h2; (void)HOST_l2c(ll,(s)); \
ll=(c)->h3; (void)HOST_l2c(ll,(s)); \
ll=(c)->h4; (void)HOST_l2c(ll,(s)); \
} while (0)
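/* HASH_MAKE_STRING serializes the five 32-bit chaining values h0..h4
* most-significant-byte first (HOST_l2c is md32_common.h's big-endian
* store, selected by DATA_ORDER_IS_BIG_ENDIAN above), producing the
* 20-byte SHA digest. */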
#if defined(SHA_0)
# define HASH_UPDATE SHA_Update
# define HASH_TRANSFORM SHA_Transform
# define HASH_FINAL SHA_Final
# define HASH_INIT SHA_Init
# define HASH_BLOCK_DATA_ORDER sha_block_data_order
# define Xupdate(a,ix,ia,ib,ic,id) (ix=(a)=(ia^ib^ic^id))
static void sha_block_data_order (SHA_CTX *c, const void *p,size_t num);
#elif defined(SHA_1)
# define HASH_UPDATE SHA1_Update
# define HASH_TRANSFORM SHA1_Transform
# define HASH_FINAL SHA1_Final
# define HASH_INIT SHA1_Init
# define HASH_BLOCK_DATA_ORDER sha1_block_data_order
# if defined(__MWERKS__) && defined(__MC68K__)
/* Metrowerks for Motorola fails otherwise:-( <appro@fy.chalmers.se> */
# define Xupdate(a,ix,ia,ib,ic,id) do { (a)=(ia^ib^ic^id); \
ix=(a)=ROTATE((a),1); \
} while (0)
# else
# define Xupdate(a,ix,ia,ib,ic,id) ( (a)=(ia^ib^ic^id), \
ix=(a)=ROTATE((a),1) \
)
# endif
#ifndef SHA1_ASM
static
#endif
void sha1_block_data_order (SHA_CTX *c, const void *p,size_t num);
#else
# error "Either SHA_0 or SHA_1 must be defined."
#endif
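/* The only algorithmic difference between SHA-0 and SHA-1 is right here:
* SHA-1's Xupdate rotates the freshly mixed message word left by one bit,
* while SHA-0 stores it unrotated. Everything else below is shared. */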
#include "md32_common.h"
#define INIT_DATA_h0 0x67452301UL
#define INIT_DATA_h1 0xefcdab89UL
#define INIT_DATA_h2 0x98badcfeUL
#define INIT_DATA_h3 0x10325476UL
#define INIT_DATA_h4 0xc3d2e1f0UL
#ifdef SHA_0
fips_md_init(SHA)
#else
fips_md_init_ctx(SHA1, SHA)
#endif
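/* In a non-FIPS build these macros (from crypto.h) expand to plain
* function headers, e.g. fips_md_init_ctx(SHA1, SHA) should become
* "int SHA1_Init(SHA_CTX *c)"; the body below is that init function. */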
{
memset (c,0,sizeof(*c));
c->h0=INIT_DATA_h0;
c->h1=INIT_DATA_h1;
c->h2=INIT_DATA_h2;
c->h3=INIT_DATA_h3;
c->h4=INIT_DATA_h4;
return 1;
}
#define K_00_19 0x5a827999UL
#define K_20_39 0x6ed9eba1UL
#define K_40_59 0x8f1bbcdcUL
#define K_60_79 0xca62c1d6UL
/* As pointed out by Wei Dai <weidai@eskimo.com>, F() below can be
* simplified to the code in F_00_19. Wei attributes these optimisations
* to Peter Gutmann's SHS code, and Peter in turn attributes them to
* Rich Schroeppel.
* #define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
* Another tweak, again due to Wei Dai, is applied in F_40_59:
* (x&a)|(y&a) -> (x|y)&a
*/
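/* Quick sanity check of the F_00_19 rewrite: with F(b,c,d) = (b&c)|(~b&d),
* b=1 yields c and b=0 yields d; and ((c^d)&b)^d yields (c^d)^d = c for
* b=1 and 0^d = d for b=0, so the two forms agree while the latter needs
* one operation fewer. Likewise (b&c)|(b&d)|(c&d) = (b&c)|((b|c)&d),
* which is the F_40_59 majority function below. */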
#define F_00_19(b,c,d) ((((c) ^ (d)) & (b)) ^ (d))
#define F_20_39(b,c,d) ((b) ^ (c) ^ (d))
#define F_40_59(b,c,d) (((b) & (c)) | (((b)|(c)) & (d)))
#define F_60_79(b,c,d) F_20_39(b,c,d)
#ifndef OPENSSL_SMALL_FOOTPRINT
#define BODY_00_15(i,a,b,c,d,e,f,xi) \
(f)=xi+(e)+K_00_19+ROTATE((a),5)+F_00_19((b),(c),(d)); \
(b)=ROTATE((b),30);
#define BODY_16_19(i,a,b,c,d,e,f,xi,xa,xb,xc,xd) \
Xupdate(f,xi,xa,xb,xc,xd); \
(f)+=(e)+K_00_19+ROTATE((a),5)+F_00_19((b),(c),(d)); \
(b)=ROTATE((b),30);
#define BODY_20_31(i,a,b,c,d,e,f,xi,xa,xb,xc,xd) \
Xupdate(f,xi,xa,xb,xc,xd); \
(f)+=(e)+K_20_39+ROTATE((a),5)+F_20_39((b),(c),(d)); \
(b)=ROTATE((b),30);
#define BODY_32_39(i,a,b,c,d,e,f,xa,xb,xc,xd) \
Xupdate(f,xa,xa,xb,xc,xd); \
(f)+=(e)+K_20_39+ROTATE((a),5)+F_20_39((b),(c),(d)); \
(b)=ROTATE((b),30);
#define BODY_40_59(i,a,b,c,d,e,f,xa,xb,xc,xd) \
Xupdate(f,xa,xa,xb,xc,xd); \
(f)+=(e)+K_40_59+ROTATE((a),5)+F_40_59((b),(c),(d)); \
(b)=ROTATE((b),30);
#define BODY_60_79(i,a,b,c,d,e,f,xa,xb,xc,xd) \
Xupdate(f,xa,xa,xb,xc,xd); \
(f)=xa+(e)+K_60_79+ROTATE((a),5)+F_60_79((b),(c),(d)); \
(b)=ROTATE((b),30);
#ifdef X
#undef X
#endif
#ifndef MD32_XARRAY
/*
* Originally X was an array. As it's automatic, it's natural to
* expect a RISC compiler to accommodate at least part of it in the
* register bank. Unfortunately not all compilers "find" this
* expectation reasonable:-( In order to make such compilers generate
* better code, X[] is replaced with a bunch of scalars X0, X1, etc.
* See the function body below...
* <appro@fy.chalmers.se>
*/
# define X(i) XX##i
#else
/*
* However! Some compilers (most notably HP C) get overwhelmed by
* that many local variables, so we have to provide a way to fall
* back to the original array-based behavior.
*/
# define X(i) XX[i]
#endif
#if !defined(SHA_1) || !defined(SHA1_ASM)
static void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
{
const unsigned char *data=p;
register unsigned MD32_REG_T A,B,C,D,E,T,l;
#ifndef MD32_XARRAY
unsigned MD32_REG_T XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7,
XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15;
#else
SHA_LONG XX[16];
#endif
A=c->h0;
B=c->h1;
C=c->h2;
D=c->h3;
E=c->h4;
for (;;)
{
const union { long one; char little; } is_endian = {1};
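/* The union trick detects endianness at run time; any reasonable
* compiler constant-folds it, so on big-endian machines with 4-byte
* aligned input the byte-by-byte HOST_c2l loads in the else branch
* are skipped in favor of direct SHA_LONG word reads. */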
if (!is_endian.little && sizeof(SHA_LONG)==4 && ((size_t)p%4)==0)
{
const SHA_LONG *W=(const SHA_LONG *)data;
X( 0) = W[0]; X( 1) = W[ 1];
BODY_00_15( 0,A,B,C,D,E,T,X( 0)); X( 2) = W[ 2];
BODY_00_15( 1,T,A,B,C,D,E,X( 1)); X( 3) = W[ 3];
BODY_00_15( 2,E,T,A,B,C,D,X( 2)); X( 4) = W[ 4];
BODY_00_15( 3,D,E,T,A,B,C,X( 3)); X( 5) = W[ 5];
BODY_00_15( 4,C,D,E,T,A,B,X( 4)); X( 6) = W[ 6];
BODY_00_15( 5,B,C,D,E,T,A,X( 5)); X( 7) = W[ 7];
BODY_00_15( 6,A,B,C,D,E,T,X( 6)); X( 8) = W[ 8];
BODY_00_15( 7,T,A,B,C,D,E,X( 7)); X( 9) = W[ 9];
BODY_00_15( 8,E,T,A,B,C,D,X( 8)); X(10) = W[10];
BODY_00_15( 9,D,E,T,A,B,C,X( 9)); X(11) = W[11];
BODY_00_15(10,C,D,E,T,A,B,X(10)); X(12) = W[12];
BODY_00_15(11,B,C,D,E,T,A,X(11)); X(13) = W[13];
BODY_00_15(12,A,B,C,D,E,T,X(12)); X(14) = W[14];
BODY_00_15(13,T,A,B,C,D,E,X(13)); X(15) = W[15];
BODY_00_15(14,E,T,A,B,C,D,X(14));
BODY_00_15(15,D,E,T,A,B,C,X(15));
data += SHA_CBLOCK;
}
else
{
(void)HOST_c2l(data,l); X( 0)=l; (void)HOST_c2l(data,l); X( 1)=l;
BODY_00_15( 0,A,B,C,D,E,T,X( 0)); (void)HOST_c2l(data,l); X( 2)=l;
BODY_00_15( 1,T,A,B,C,D,E,X( 1)); (void)HOST_c2l(data,l); X( 3)=l;
BODY_00_15( 2,E,T,A,B,C,D,X( 2)); (void)HOST_c2l(data,l); X( 4)=l;
BODY_00_15( 3,D,E,T,A,B,C,X( 3)); (void)HOST_c2l(data,l); X( 5)=l;
BODY_00_15( 4,C,D,E,T,A,B,X( 4)); (void)HOST_c2l(data,l); X( 6)=l;
BODY_00_15( 5,B,C,D,E,T,A,X( 5)); (void)HOST_c2l(data,l); X( 7)=l;
BODY_00_15( 6,A,B,C,D,E,T,X( 6)); (void)HOST_c2l(data,l); X( 8)=l;
BODY_00_15( 7,T,A,B,C,D,E,X( 7)); (void)HOST_c2l(data,l); X( 9)=l;
BODY_00_15( 8,E,T,A,B,C,D,X( 8)); (void)HOST_c2l(data,l); X(10)=l;
BODY_00_15( 9,D,E,T,A,B,C,X( 9)); (void)HOST_c2l(data,l); X(11)=l;
BODY_00_15(10,C,D,E,T,A,B,X(10)); (void)HOST_c2l(data,l); X(12)=l;
BODY_00_15(11,B,C,D,E,T,A,X(11)); (void)HOST_c2l(data,l); X(13)=l;
BODY_00_15(12,A,B,C,D,E,T,X(12)); (void)HOST_c2l(data,l); X(14)=l;
BODY_00_15(13,T,A,B,C,D,E,X(13)); (void)HOST_c2l(data,l); X(15)=l;
BODY_00_15(14,E,T,A,B,C,D,X(14));
BODY_00_15(15,D,E,T,A,B,C,X(15));
}
BODY_16_19(16,C,D,E,T,A,B,X( 0),X( 0),X( 2),X( 8),X(13));
BODY_16_19(17,B,C,D,E,T,A,X( 1),X( 1),X( 3),X( 9),X(14));
BODY_16_19(18,A,B,C,D,E,T,X( 2),X( 2),X( 4),X(10),X(15));
BODY_16_19(19,T,A,B,C,D,E,X( 3),X( 3),X( 5),X(11),X( 0));
BODY_20_31(20,E,T,A,B,C,D,X( 4),X( 4),X( 6),X(12),X( 1));
BODY_20_31(21,D,E,T,A,B,C,X( 5),X( 5),X( 7),X(13),X( 2));
BODY_20_31(22,C,D,E,T,A,B,X( 6),X( 6),X( 8),X(14),X( 3));
BODY_20_31(23,B,C,D,E,T,A,X( 7),X( 7),X( 9),X(15),X( 4));
BODY_20_31(24,A,B,C,D,E,T,X( 8),X( 8),X(10),X( 0),X( 5));
BODY_20_31(25,T,A,B,C,D,E,X( 9),X( 9),X(11),X( 1),X( 6));
BODY_20_31(26,E,T,A,B,C,D,X(10),X(10),X(12),X( 2),X( 7));
BODY_20_31(27,D,E,T,A,B,C,X(11),X(11),X(13),X( 3),X( 8));
BODY_20_31(28,C,D,E,T,A,B,X(12),X(12),X(14),X( 4),X( 9));
BODY_20_31(29,B,C,D,E,T,A,X(13),X(13),X(15),X( 5),X(10));
BODY_20_31(30,A,B,C,D,E,T,X(14),X(14),X( 0),X( 6),X(11));
BODY_20_31(31,T,A,B,C,D,E,X(15),X(15),X( 1),X( 7),X(12));
BODY_32_39(32,E,T,A,B,C,D,X( 0),X( 2),X( 8),X(13));
BODY_32_39(33,D,E,T,A,B,C,X( 1),X( 3),X( 9),X(14));
BODY_32_39(34,C,D,E,T,A,B,X( 2),X( 4),X(10),X(15));
BODY_32_39(35,B,C,D,E,T,A,X( 3),X( 5),X(11),X( 0));
BODY_32_39(36,A,B,C,D,E,T,X( 4),X( 6),X(12),X( 1));
BODY_32_39(37,T,A,B,C,D,E,X( 5),X( 7),X(13),X( 2));
BODY_32_39(38,E,T,A,B,C,D,X( 6),X( 8),X(14),X( 3));
BODY_32_39(39,D,E,T,A,B,C,X( 7),X( 9),X(15),X( 4));
BODY_40_59(40,C,D,E,T,A,B,X( 8),X(10),X( 0),X( 5));
BODY_40_59(41,B,C,D,E,T,A,X( 9),X(11),X( 1),X( 6));
BODY_40_59(42,A,B,C,D,E,T,X(10),X(12),X( 2),X( 7));
BODY_40_59(43,T,A,B,C,D,E,X(11),X(13),X( 3),X( 8));
BODY_40_59(44,E,T,A,B,C,D,X(12),X(14),X( 4),X( 9));
BODY_40_59(45,D,E,T,A,B,C,X(13),X(15),X( 5),X(10));
BODY_40_59(46,C,D,E,T,A,B,X(14),X( 0),X( 6),X(11));
BODY_40_59(47,B,C,D,E,T,A,X(15),X( 1),X( 7),X(12));
BODY_40_59(48,A,B,C,D,E,T,X( 0),X( 2),X( 8),X(13));
BODY_40_59(49,T,A,B,C,D,E,X( 1),X( 3),X( 9),X(14));
BODY_40_59(50,E,T,A,B,C,D,X( 2),X( 4),X(10),X(15));
BODY_40_59(51,D,E,T,A,B,C,X( 3),X( 5),X(11),X( 0));
BODY_40_59(52,C,D,E,T,A,B,X( 4),X( 6),X(12),X( 1));
BODY_40_59(53,B,C,D,E,T,A,X( 5),X( 7),X(13),X( 2));
BODY_40_59(54,A,B,C,D,E,T,X( 6),X( 8),X(14),X( 3));
BODY_40_59(55,T,A,B,C,D,E,X( 7),X( 9),X(15),X( 4));
BODY_40_59(56,E,T,A,B,C,D,X( 8),X(10),X( 0),X( 5));
BODY_40_59(57,D,E,T,A,B,C,X( 9),X(11),X( 1),X( 6));
BODY_40_59(58,C,D,E,T,A,B,X(10),X(12),X( 2),X( 7));
BODY_40_59(59,B,C,D,E,T,A,X(11),X(13),X( 3),X( 8));
BODY_60_79(60,A,B,C,D,E,T,X(12),X(14),X( 4),X( 9));
BODY_60_79(61,T,A,B,C,D,E,X(13),X(15),X( 5),X(10));
BODY_60_79(62,E,T,A,B,C,D,X(14),X( 0),X( 6),X(11));
BODY_60_79(63,D,E,T,A,B,C,X(15),X( 1),X( 7),X(12));
BODY_60_79(64,C,D,E,T,A,B,X( 0),X( 2),X( 8),X(13));
BODY_60_79(65,B,C,D,E,T,A,X( 1),X( 3),X( 9),X(14));
BODY_60_79(66,A,B,C,D,E,T,X( 2),X( 4),X(10),X(15));
BODY_60_79(67,T,A,B,C,D,E,X( 3),X( 5),X(11),X( 0));
BODY_60_79(68,E,T,A,B,C,D,X( 4),X( 6),X(12),X( 1));
BODY_60_79(69,D,E,T,A,B,C,X( 5),X( 7),X(13),X( 2));
BODY_60_79(70,C,D,E,T,A,B,X( 6),X( 8),X(14),X( 3));
BODY_60_79(71,B,C,D,E,T,A,X( 7),X( 9),X(15),X( 4));
BODY_60_79(72,A,B,C,D,E,T,X( 8),X(10),X( 0),X( 5));
BODY_60_79(73,T,A,B,C,D,E,X( 9),X(11),X( 1),X( 6));
BODY_60_79(74,E,T,A,B,C,D,X(10),X(12),X( 2),X( 7));
BODY_60_79(75,D,E,T,A,B,C,X(11),X(13),X( 3),X( 8));
BODY_60_79(76,C,D,E,T,A,B,X(12),X(14),X( 4),X( 9));
BODY_60_79(77,B,C,D,E,T,A,X(13),X(15),X( 5),X(10));
BODY_60_79(78,A,B,C,D,E,T,X(14),X( 0),X( 6),X(11));
BODY_60_79(79,T,A,B,C,D,E,X(15),X( 1),X( 7),X(12));
c->h0=(c->h0+E)&0xffffffffL;
c->h1=(c->h1+T)&0xffffffffL;
c->h2=(c->h2+A)&0xffffffffL;
c->h3=(c->h3+B)&0xffffffffL;
c->h4=(c->h4+C)&0xffffffffL;
if (--num == 0) break;
A=c->h0;
B=c->h1;
C=c->h2;
D=c->h3;
E=c->h4;
}
}
#endif
#else /* OPENSSL_SMALL_FOOTPRINT */
#define BODY_00_15(xi) do { \
T=E+K_00_19+F_00_19(B,C,D); \
E=D, D=C, C=ROTATE(B,30), B=A; \
A=ROTATE(A,5)+T+xi; } while(0)
#define BODY_16_19(xa,xb,xc,xd) do { \
Xupdate(T,xa,xa,xb,xc,xd); \
T+=E+K_00_19+F_00_19(B,C,D); \
E=D, D=C, C=ROTATE(B,30), B=A; \
A=ROTATE(A,5)+T; } while(0)
#define BODY_20_39(xa,xb,xc,xd) do { \
Xupdate(T,xa,xa,xb,xc,xd); \
T+=E+K_20_39+F_20_39(B,C,D); \
E=D, D=C, C=ROTATE(B,30), B=A; \
A=ROTATE(A,5)+T; } while(0)
#define BODY_40_59(xa,xb,xc,xd) do { \
Xupdate(T,xa,xa,xb,xc,xd); \
T+=E+K_40_59+F_40_59(B,C,D); \
E=D, D=C, C=ROTATE(B,30), B=A; \
A=ROTATE(A,5)+T; } while(0)
#define BODY_60_79(xa,xb,xc,xd) do { \
Xupdate(T,xa,xa,xb,xc,xd); \
T=E+K_60_79+F_60_79(B,C,D); \
E=D, D=C, C=ROTATE(B,30), B=A; \
A=ROTATE(A,5)+T+xa; } while(0)
#if !defined(SHA_1) || !defined(SHA1_ASM)
static void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
{
const unsigned char *data=p;
register unsigned MD32_REG_T A,B,C,D,E,T,l;
int i;
SHA_LONG X[16];
A=c->h0;
B=c->h1;
C=c->h2;
D=c->h3;
E=c->h4;
for (;;)
{
for (i=0;i<16;i++)
{ HOST_c2l(data,l); X[i]=l; BODY_00_15(X[i]); }
for (i=0;i<4;i++)
{ BODY_16_19(X[i], X[i+2], X[i+8], X[(i+13)&15]); }
for (;i<24;i++)
{ BODY_20_39(X[i&15], X[(i+2)&15], X[(i+8)&15],X[(i+13)&15]); }
for (i=0;i<20;i++)
{ BODY_40_59(X[(i+8)&15],X[(i+10)&15],X[i&15], X[(i+5)&15]); }
for (i=4;i<24;i++)
{ BODY_60_79(X[(i+8)&15],X[(i+10)&15],X[i&15], X[(i+5)&15]); }
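/* Loop counters vs. SHA-1 rounds: the first two loops cover rounds
* 0-15 and 16-19; the "for (;i<24;...)" continues i=4..23 for rounds
* 20-39; i=0..19 with shifted X indices gives rounds 40-59; and
* i=4..23 again gives rounds 60-79. */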
c->h0=(c->h0+A)&0xffffffffL;
c->h1=(c->h1+B)&0xffffffffL;
c->h2=(c->h2+C)&0xffffffffL;
c->h3=(c->h3+D)&0xffffffffL;
c->h4=(c->h4+E)&0xffffffffL;
if (--num == 0) break;
A=c->h0;
B=c->h1;
C=c->h2;
D=c->h3;
E=c->h4;
}
}
#endif
#endif

View File

@@ -0,0 +1,178 @@
/* crypto/sha/shatest.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "../e_os.h"
#if defined(OPENSSL_NO_SHA) || defined(OPENSSL_NO_SHA0)
int main(int argc, char *argv[])
{
printf("No SHA0 support\n");
return(0);
}
#else
#include <openssl/evp.h>
#include <openssl/sha.h>
#ifdef CHARSET_EBCDIC
#include <openssl/ebcdic.h>
#endif
#define SHA_0 /* FIPS 180 */
#undef SHA_1 /* FIPS 180-1 */
static char *test[]={
"abc",
"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
NULL,
};
#ifdef SHA_0
static char *ret[]={
"0164b8a914cd2a5e74c4f7ff082c4d97f1edf880",
"d2516ee1acfa5baf33dfc1c471e438449ef134c8",
};
static char *bigret=
"3232affa48628a26653b5aaa44541fd90d690603";
#endif
#ifdef SHA_1
static char *ret[]={
"a9993e364706816aba3e25717850c26c9cd0d89d",
"84983e441c3bd26ebaae4aa1f95129e5e54670f1",
};
static char *bigret=
"34aa973cd4c4daa4f61eeb2bdbad27316534016f";
#endif
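/* Note how the SHA-0 and SHA-1 digests of the very same inputs differ
* completely (e.g. "abc" -> 0164b8a9... vs. a9993e36...): the single
* 1-bit rotate added to the message schedule in FIPS 180-1 flips each
* output bit with roughly 50% probability. */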
static char *pt(unsigned char *md);
int main(int argc, char *argv[])
{
int i,err=0;
char **P,**R;
static unsigned char buf[1000];
char *p,*r;
EVP_MD_CTX c;
unsigned char md[SHA_DIGEST_LENGTH];
#ifdef CHARSET_EBCDIC
ebcdic2ascii(test[0], test[0], strlen(test[0]));
ebcdic2ascii(test[1], test[1], strlen(test[1]));
#endif
EVP_MD_CTX_init(&c);
P=test;
R=ret;
i=1;
while (*P != NULL)
{
EVP_Digest(*P,strlen(*P),md,NULL,EVP_sha(), NULL);
p=pt(md);
if (strcmp(p,*R) != 0)
{
printf("error calculating SHA on '%s'\n",*P);
printf("got %s instead of %s\n",p,*R);
err++;
}
else
printf("test %d ok\n",i);
i++;
R++;
P++;
}
memset(buf,'a',1000);
#ifdef CHARSET_EBCDIC
ebcdic2ascii(buf, buf, 1000);
#endif /*CHARSET_EBCDIC*/
EVP_DigestInit_ex(&c,EVP_sha(), NULL);
for (i=0; i<1000; i++)
EVP_DigestUpdate(&c,buf,1000);
EVP_DigestFinal_ex(&c,md,NULL);
p=pt(md);
r=bigret;
if (strcmp(p,r) != 0)
{
printf("error calculating SHA on '%s'\n",p);
printf("got %s instead of %s\n",p,r);
err++;
}
else
printf("test 3 ok\n");
#ifdef OPENSSL_SYS_NETWARE
if (err) printf("ERROR: %d\n", err);
#endif
EVP_MD_CTX_cleanup(&c);
EXIT(err);
return(0);
}
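/* pt() formats a digest as lowercase hex into a static buffer, so the
* result is overwritten by the next call and the helper is not
* reentrant -- fine for this single-threaded test. */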
static char *pt(unsigned char *md)
{
int i;
static char buf[80];
for (i=0; i<SHA_DIGEST_LENGTH; i++)
sprintf(&(buf[i*2]),"%02x",md[i]);
return(buf);
}
#endif