Support for Signal calls.

Merge in RedPhone

// FREEBIE
Moxie Marlinspike
2015-09-09 13:54:29 -07:00
parent 3d4ae60d81
commit d83a3d71bc
2585 changed files with 803492 additions and 45 deletions


@@ -0,0 +1,410 @@
#!/usr/bin/env perl
# ====================================================================
# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# At some point it became apparent that the original SSLeay RC4
# assembler implementation performs suboptimally on the latest IA-32
# microarchitectures. After re-tuning, performance has changed as
# follows:
#
# Pentium -10%
# Pentium III +12%
# AMD +50%(*)
# P4 +250%(**)
#
# (*) This number is actually a trade-off:-) It's possible to
# achieve +72%, but at the cost of -48% off PIII performance.
# In other words code performing further 13% faster on AMD
# would perform almost 2 times slower on Intel PIII...
# For reference! This code delivers ~80% of rc4-amd64.pl
# performance on the same Opteron machine.
# (**) This number requires compressed key schedule set up by
# RC4_set_key [see commentary below for further details].
#
# <appro@fy.chalmers.se>
# May 2011
#
# Optimize for Core2 and Westmere [and incidentally Opteron]. Current
# performance in cycles per processed byte (less is better) and
# improvement relative to previous version of this module is:
#
# Pentium 10.2 # original numbers
# Pentium III 7.8(*)
# Intel P4 7.5
#
# Opteron 6.1/+20% # new MMX numbers
# Core2 5.3/+67%(**)
# Westmere 5.1/+94%(**)
# Sandy Bridge 5.0/+8%
# Atom 12.6/+6%
#
# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
# but this specific code performs poorly on Core2. And vice
# versa, below MMX/SSE code delivering 5.8/7.1 on Core2 performs
# poorly on PIII, at 8.0/14.5:-( As PIII is not a "hot" CPU
# [anymore], I chose to discard PIII-specific code path and opt
# for original IALU-only code, which is why MMX/SSE code path
# is guarded by SSE2 bit (see below), not MMX/SSE.
# (**) Performance vs. block size on Core2 and Westmere had a maximum
# at ... 64 bytes block size. And it was quite a maximum, 40-60%
# in comparison to largest 8KB block size. Above improvement
# coefficients are for the largest block size.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"rc4-586.pl");
$xx="eax";
$yy="ebx";
$tx="ecx";
$ty="edx";
$inp="esi";
$out="ebp";
$dat="edi";
sub RC4_loop {
my $i=shift;
my $func = ($i==0)?*mov:*or;
&add (&LB($yy),&LB($tx));
&mov ($ty,&DWP(0,$dat,$yy,4));
&mov (&DWP(0,$dat,$yy,4),$tx);
&mov (&DWP(0,$dat,$xx,4),$ty);
&add ($ty,$tx);
&inc (&LB($xx));
&and ($ty,0xff);
&ror ($out,8) if ($i!=0);
if ($i<3) {
&mov ($tx,&DWP(0,$dat,$xx,4));
} else {
&mov ($tx,&wparam(3)); # reload [re-biased] out
}
&$func ($out,&DWP(0,$dat,$ty,4));
}
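# Note: the "=" in the "if" below is a deliberate assignment, not a typo
# for "==": it sets $alt to 0 and thereby selects the second code path
# (the Core2/Westmere-friendly one); change it to 1 to generate the
# Atom/Sandy Bridge-oriented variant commented on below.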
if ($alt=0) {
# >20% faster on Atom and Sandy Bridge[!], 8% faster on Opteron,
# but ~40% slower on Core2 and Westmere... Attempt to add movz
# brings down Opteron by 25%, Atom and Sandy Bridge by 15%, yet
# on Core2 with movz it's almost 20% slower than below alternative
# code... Yes, it's a total mess...
my @XX=($xx,$out);
$RC4_loop_mmx = sub { # SSE actually...
my $i=shift;
my $j=$i<=0?0:$i>>1;
my $mm=$i<=0?"mm0":"mm".($i&1);
&add (&LB($yy),&LB($tx));
&lea (@XX[1],&DWP(1,@XX[0]));
&pxor ("mm2","mm0") if ($i==0);
&psllq ("mm1",8) if ($i==0);
&and (@XX[1],0xff);
&pxor ("mm0","mm0") if ($i<=0);
&mov ($ty,&DWP(0,$dat,$yy,4));
&mov (&DWP(0,$dat,$yy,4),$tx);
&pxor ("mm1","mm2") if ($i==0);
&mov (&DWP(0,$dat,$XX[0],4),$ty);
&add (&LB($ty),&LB($tx));
&movd (@XX[0],"mm7") if ($i==0);
&mov ($tx,&DWP(0,$dat,@XX[1],4));
&pxor ("mm1","mm1") if ($i==1);
&movq ("mm2",&QWP(0,$inp)) if ($i==1);
&movq (&QWP(-8,(@XX[0],$inp)),"mm1") if ($i==0);
&pinsrw ($mm,&DWP(0,$dat,$ty,4),$j);
push (@XX,shift(@XX)) if ($i>=0);
}
} else {
# Using pinsrw here improves performance on Intel CPUs by 2-3%, but
# brings down AMD by 7%...
$RC4_loop_mmx = sub {
my $i=shift;
&add (&LB($yy),&LB($tx));
&psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1);
&mov ($ty,&DWP(0,$dat,$yy,4));
&mov (&DWP(0,$dat,$yy,4),$tx);
&mov (&DWP(0,$dat,$xx,4),$ty);
&inc ($xx);
&add ($ty,$tx);
&movz ($xx,&LB($xx)); # (*)
&movz ($ty,&LB($ty)); # (*)
&pxor ("mm2",$i==1?"mm0":"mm1") if ($i>=0);
&movq ("mm0",&QWP(0,$inp)) if ($i<=0);
&movq (&QWP(-8,($out,$inp)),"mm2") if ($i==0);
&mov ($tx,&DWP(0,$dat,$xx,4));
&movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4));
# (*) This is the key to Core2 and Westmere performance.
# Without movz, the out-of-order execution logic confuses
# itself and fails to reorder loads and stores. The problem
# appears to be fixed in Sandy Bridge...
}
}
&external_label("OPENSSL_ia32cap_P");
# void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
&function_begin("RC4");
&mov ($dat,&wparam(0)); # load key schedule pointer
&mov ($ty, &wparam(1)); # load len
&mov ($inp,&wparam(2)); # load inp
&mov ($out,&wparam(3)); # load out
&xor ($xx,$xx); # avoid partial register stalls
&xor ($yy,$yy);
&cmp ($ty,0); # safety net
&je (&label("abort"));
&mov (&LB($xx),&BP(0,$dat)); # load key->x
&mov (&LB($yy),&BP(4,$dat)); # load key->y
&add ($dat,8);
&lea ($tx,&DWP(0,$inp,$ty));
&sub ($out,$inp); # re-bias out
&mov (&wparam(1),$tx); # save input+len
&inc (&LB($xx));
# detect compressed key schedule...
&cmp (&DWP(256,$dat),-1);
&je (&label("RC4_CHAR"));
&mov ($tx,&DWP(0,$dat,$xx,4));
&and ($ty,-4); # how many 4-byte chunks?
&jz (&label("loop1"));
&test ($ty,-8);
&mov (&wparam(3),$out); # $out as accumulator in these loops
&jz (&label("go4loop4"));
&picmeup($out,"OPENSSL_ia32cap_P");
&bt (&DWP(0,$out),26); # check SSE2 bit [could have been MMX]
&jnc (&label("go4loop4"));
&mov ($out,&wparam(3)) if (!$alt);
&movd ("mm7",&wparam(3)) if ($alt);
&and ($ty,-8);
&lea ($ty,&DWP(-8,$inp,$ty));
&mov (&DWP(-4,$dat),$ty); # save input+(len/8)*8-8
&$RC4_loop_mmx(-1);
&jmp(&label("loop_mmx_enter"));
&set_label("loop_mmx",16);
&$RC4_loop_mmx(0);
&set_label("loop_mmx_enter");
for ($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); }
&mov ($ty,$yy);
&xor ($yy,$yy); # this is second key to Core2
&mov (&LB($yy),&LB($ty)); # and Westmere performance...
&cmp ($inp,&DWP(-4,$dat));
&lea ($inp,&DWP(8,$inp));
&jb (&label("loop_mmx"));
if ($alt) {
&movd ($out,"mm7");
&pxor ("mm2","mm0");
&psllq ("mm1",8);
&pxor ("mm1","mm2");
&movq (&QWP(-8,$out,$inp),"mm1");
} else {
&psllq ("mm1",56);
&pxor ("mm2","mm1");
&movq (&QWP(-8,$out,$inp),"mm2");
}
&emms ();
&cmp ($inp,&wparam(1)); # compare to input+len
&je (&label("done"));
&jmp (&label("loop1"));
&set_label("go4loop4",16);
&lea ($ty,&DWP(-4,$inp,$ty));
&mov (&wparam(2),$ty); # save input+(len/4)*4-4
&set_label("loop4");
for ($i=0;$i<4;$i++) { RC4_loop($i); }
&ror ($out,8);
&xor ($out,&DWP(0,$inp));
&cmp ($inp,&wparam(2)); # compare to input+(len/4)*4-4
&mov (&DWP(0,$tx,$inp),$out);# $tx holds re-biased out here
&lea ($inp,&DWP(4,$inp));
&mov ($tx,&DWP(0,$dat,$xx,4));
&jb (&label("loop4"));
&cmp ($inp,&wparam(1)); # compare to input+len
&je (&label("done"));
&mov ($out,&wparam(3)); # restore $out
&set_label("loop1",16);
&add (&LB($yy),&LB($tx));
&mov ($ty,&DWP(0,$dat,$yy,4));
&mov (&DWP(0,$dat,$yy,4),$tx);
&mov (&DWP(0,$dat,$xx,4),$ty);
&add ($ty,$tx);
&inc (&LB($xx));
&and ($ty,0xff);
&mov ($ty,&DWP(0,$dat,$ty,4));
&xor (&LB($ty),&BP(0,$inp));
&lea ($inp,&DWP(1,$inp));
&mov ($tx,&DWP(0,$dat,$xx,4));
&cmp ($inp,&wparam(1)); # compare to input+len
&mov (&BP(-1,$out,$inp),&LB($ty));
&jb (&label("loop1"));
&jmp (&label("done"));
# this is essentially Intel P4 specific codepath...
&set_label("RC4_CHAR",16);
&movz ($tx,&BP(0,$dat,$xx));
# strangely enough unrolled loop performs over 20% slower...
&set_label("cloop1");
&add (&LB($yy),&LB($tx));
&movz ($ty,&BP(0,$dat,$yy));
&mov (&BP(0,$dat,$yy),&LB($tx));
&mov (&BP(0,$dat,$xx),&LB($ty));
&add (&LB($ty),&LB($tx));
&movz ($ty,&BP(0,$dat,$ty));
&add (&LB($xx),1);
&xor (&LB($ty),&BP(0,$inp));
&lea ($inp,&DWP(1,$inp));
&movz ($tx,&BP(0,$dat,$xx));
&cmp ($inp,&wparam(1));
&mov (&BP(-1,$out,$inp),&LB($ty));
&jb (&label("cloop1"));
&set_label("done");
&dec (&LB($xx));
&mov (&DWP(-4,$dat),$yy); # save key->y
&mov (&BP(-8,$dat),&LB($xx)); # save key->x
&set_label("abort");
&function_end("RC4");
########################################################################
$inp="esi";
$out="edi";
$idi="ebp";
$ido="ecx";
$idx="edx";
# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
&function_begin("private_RC4_set_key");
&mov ($out,&wparam(0)); # load key
&mov ($idi,&wparam(1)); # load len
&mov ($inp,&wparam(2)); # load data
&picmeup($idx,"OPENSSL_ia32cap_P");
&lea ($out,&DWP(2*4,$out)); # &key->data
&lea ($inp,&DWP(0,$inp,$idi)); # $inp to point at the end
&neg ($idi);
&xor ("eax","eax");
&mov (&DWP(-4,$out),$idi); # borrow key->y
&bt (&DWP(0,$idx),20); # check for bit#20
&jc (&label("c1stloop"));
&set_label("w1stloop",16);
&mov (&DWP(0,$out,"eax",4),"eax"); # key->data[i]=i;
&add (&LB("eax"),1); # i++;
&jnc (&label("w1stloop"));
&xor ($ido,$ido);
&xor ($idx,$idx);
&set_label("w2ndloop",16);
&mov ("eax",&DWP(0,$out,$ido,4));
&add (&LB($idx),&BP(0,$inp,$idi));
&add (&LB($idx),&LB("eax"));
&add ($idi,1);
&mov ("ebx",&DWP(0,$out,$idx,4));
&jnz (&label("wnowrap"));
&mov ($idi,&DWP(-4,$out));
&set_label("wnowrap");
&mov (&DWP(0,$out,$idx,4),"eax");
&mov (&DWP(0,$out,$ido,4),"ebx");
&add (&LB($ido),1);
&jnc (&label("w2ndloop"));
&jmp (&label("exit"));
# Unlike all other x86 [and x86_64] implementations, Intel P4 core
# [including EM64T] was found to perform poorly with above "32-bit" key
# schedule, a.k.a. RC4_INT. Performance improvement for IA-32 hand-coded
# assembler turned out to be 3.5x if re-coded for compressed 8-bit one,
# a.k.a. RC4_CHAR! It's however inappropriate to just switch to 8-bit
# schedule for x86[_64], because non-P4 implementations suffer from
# significant performance losses then, e.g. PIII exhibits >2x
# deterioration, and so does Opteron. In order to assure optimal
# all-round performance, we detect P4 at run-time and set up compressed
# key schedule, which is recognized by RC4 procedure.
&set_label("c1stloop",16);
&mov (&BP(0,$out,"eax"),&LB("eax")); # key->data[i]=i;
&add (&LB("eax"),1); # i++;
&jnc (&label("c1stloop"));
&xor ($ido,$ido);
&xor ($idx,$idx);
&xor ("ebx","ebx");
&set_label("c2ndloop",16);
&mov (&LB("eax"),&BP(0,$out,$ido));
&add (&LB($idx),&BP(0,$inp,$idi));
&add (&LB($idx),&LB("eax"));
&add ($idi,1);
&mov (&LB("ebx"),&BP(0,$out,$idx));
&jnz (&label("cnowrap"));
&mov ($idi,&DWP(-4,$out));
&set_label("cnowrap");
&mov (&BP(0,$out,$idx),&LB("eax"));
&mov (&BP(0,$out,$ido),&LB("ebx"));
&add (&LB($ido),1);
&jnc (&label("c2ndloop"));
&mov (&DWP(256,$out),-1); # mark schedule as compressed
&set_label("exit");
&xor ("eax","eax");
&mov (&DWP(-8,$out),"eax"); # key->x=0;
&mov (&DWP(-4,$out),"eax"); # key->y=0;
&function_end("private_RC4_set_key");
# const char *RC4_options(void);
&function_begin_B("RC4_options");
&call (&label("pic_point"));
&set_label("pic_point");
&blindpop("eax");
&lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
&picmeup("edx","OPENSSL_ia32cap_P");
&mov ("edx",&DWP(0,"edx"));
&bt ("edx",20);
&jc (&label("1xchar"));
&bt ("edx",26);
&jnc (&label("ret"));
&add ("eax",25);
&ret ();
&set_label("1xchar");
&add ("eax",12);
&set_label("ret");
&ret ();
&set_label("opts",64);
&asciz ("rc4(4x,int)");
&asciz ("rc4(1x,char)");
&asciz ("rc4(8x,mmx)");
&asciz ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
&align (64);
&function_end_B("RC4_options");
&asm_finish();
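#
# For reference, a minimal C sketch (an illustration, not part of this
# module) of the key-schedule convention described in the commentary
# above: the normal 32-bit RC4_INT schedule only ever holds values
# 0..255, while the P4 setup path stores a compressed 8-bit schedule
# and writes -1 into the dword at byte offset 256 of key->data, which
# is exactly what the &cmp(&DWP(256,$dat),-1) check in RC4 looks for.
# The struct below is a hypothetical stand-in for OpenSSL's RC4_KEY.
#
#     #include <string.h>
#     #include <stdint.h>
#
#     typedef struct {
#         uint32_t x, y;
#         uint32_t data[256];   /* on P4: 256 bytes plus the -1 marker */
#     } RC4_KEY_SKETCH;
#
#     static int schedule_is_compressed(const RC4_KEY_SKETCH *key)
#     {
#         uint32_t marker;
#         /* dword at byte offset 256 from key->data: a permutation
#          * value (0..255) for the 32-bit schedule, 0xffffffff for
#          * the compressed one */
#         memcpy(&marker, (const unsigned char *)key->data + 256,
#                sizeof(marker));
#         return marker == 0xffffffffu;
#     }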


@@ -0,0 +1,755 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by David Mosberger <David.Mosberger@acm.org> based on the
# Itanium optimized Crypto code which was released by HP Labs at
# http://www.hpl.hp.com/research/linux/crypto/.
#
# Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
# This is a little helper program which generates a software-pipelined
# loop for RC4 encryption. The basic algorithm looks like this:
#
# for (counter = 0; counter < len; ++counter)
# {
# in = inp[counter];
# SI = S[I];
# J = (SI + J) & 0xff;
# SJ = S[J];
# T = (SI + SJ) & 0xff;
# S[I] = SJ, S[J] = SI;
# ST = S[T];
# outp[counter] = in ^ ST;
# I = (I + 1) & 0xff;
# }
#
# Pipelining this loop isn't easy, because the stores to the S[] array
# need to be observed in the right order. The loop generated by the
# code below has the following pipeline diagram:
#
#         cycle
#        | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
#  iter
#    1:   xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
#    2:               xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
#    3:                           xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
#
# where:
# LDI = load of S[I]
# LDJ = load of S[J]
# SWP = swap of S[I] and S[J]
# LDT = load of S[T]
#
# Note that in the above diagram, the major trouble-spot is that LDI
# of the 2nd iteration is performed BEFORE the SWP of the first
# iteration. Fortunately, this is easy to detect (I of the 1st
# iteration will be equal to J of the 2nd iteration) and when this
# happens, we simply forward the proper value from the 1st iteration
# to the 2nd one. The proper value in this case is simply the value
# of S[I] from the first iteration (thanks to the fact that SWP
# simply swaps the contents of S[I] and S[J]).
#
# Another potential trouble-spot is in cycle 7, where SWP of the 1st
# iteration issues at the same time as the LDI of the 3rd iteration.
# However, thanks to IA-64 execution semantics, this can be taken
# care of simply by placing LDI later in the instruction-group than
# SWP. IA-64 CPUs will automatically forward the value if they
# detect that the SWP and LDI are accessing the same memory-location.
# The core-loop that can be pipelined then looks like this (annotated
# with McKinley/Madison issue port & latency numbers, assuming L1
# cache hits for the most part):
# operation:            instruction:                        issue-ports:   latency
# ------------------    -----------------------------       -------------  -------
# Data = *inp++         ld1    data = [inp], 1              M0-M1          1 cyc    c0
#                       shladd Iptr = I, KeyTable, 3        M0-M3, I0, I1  1 cyc
# I = (I + 1) & 0xff    padd1  nextI = I, one               M0-M3, I0, I1  3 cyc
# ;;
# SI = S[I]             ld8    SI = [Iptr]                  M0-M1          1 cyc    c1   * after SWAP!
# ;;
#                       cmp.eq.unc pBypass = I, J                                        * after J is valid!
# J = SI + J            add    J = J, SI                    M0-M3, I0, I1  1 cyc    c2
# (pBypass)             br.cond.spnt Bypass
# ;;
# ---------------------------------------------------------------------------------------
# J = J & 0xff          zxt1   J = J                        I0, I1         1 cyc    c3
# ;;
#                       shladd Jptr = J, KeyTable, 3        M0-M3, I0, I1  1 cyc    c4
# ;;
# SJ = S[J]             ld8    SJ = [Jptr]                  M0-M1          1 cyc    c5
# ;;
# ---------------------------------------------------------------------------------------
# T = (SI + SJ)         add    T = SI, SJ                   M0-M3, I0, I1  1 cyc    c6
# ;;
# T = T & 0xff          zxt1   T = T                        I0, I1         1 cyc
# S[I] = SJ             st8    [Iptr] = SJ                  M2-M3                   c7
# S[J] = SI             st8    [Jptr] = SI                  M2-M3
# ;;
#                       shladd Tptr = T, KeyTable, 3        M0-M3, I0, I1  1 cyc    c8
# ;;
# ---------------------------------------------------------------------------------------
# T = S[T]              ld8    T = [Tptr]                   M0-M1          1 cyc    c9
# ;;
# data ^= T             xor    data = data, T               M0-M3, I0, I1  1 cyc    c10
# ;;
# *out++ = Data ^ T     dep    word = word, data, 8, POS    I0, I1         1 cyc    c11
# ;;
# ---------------------------------------------------------------------------------------
# There are several points worth making here:
# - Note that due to the bypass/forwarding-path, the first two
# phases of the loop are strangely mingled together. In
# particular, note that the first stage of the pipeline is
# using the value of "J", as calculated by the second stage.
# - Each bundle-pair will have exactly 6 instructions.
# - Pipelined, the loop can execute in 3 cycles/iteration and
# 4 stages. However, McKinley/Madison can issue "st1" to
# the same bank at a rate of at most one per 4 cycles. Thus,
# instead of storing each byte, we accumulate them in a word
# and then write them back at once with a single "st8" (this
# implies that the setup code needs to ensure that the output
# buffer is properly aligned, if need be, by encoding the
# first few bytes separately).
# - There is no space for a "br.ctop" instruction. For this
# reason we can't use modulo-loop support in IA-64 and have
# to do a traditional, purely software-pipelined loop.
# - We can't replace any of the remaining "add/zxt1" pairs with
# "padd1" because the latency for that instruction is too high
# and would push the loop to the point where more bypasses
# would be needed, which we don't have space for.
# - The above loop runs at around 3.26 cycles/byte, or roughly
# 440 MByte/sec on a 1.5GHz Madison. This is well below the
# system bus bandwidth and hence with judicious use of
# "lfetch" this loop can run at (almost) peak speed even when
# the input and output data reside in memory. The
# max. latency that can be tolerated is (PREFETCH_DISTANCE *
# L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
# least) 1-ahead prefetching of 128 byte cache-lines. Note
# that we do NOT prefetch into L1, since that would only
# interfere with the S[] table values stored there. This is
# acceptable because there is a 10 cycle latency between
# load and first use of the input data.
# - We use a branch to out-of-line bypass-code because of cycle-pressure:
# we calculate the next J, check for the need to activate the
# bypass path, and activate the bypass path ALL IN THE SAME
# CYCLE. If we didn't have these constraints, we could do
# the bypass with a simple conditional move instruction.
# Fortunately, the bypass paths get activated relatively
# infrequently, so the extra branches don't cost all that much
# (about 0.04 cycles/byte, measured on a 16396 byte file with
# random input data).
#
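#
# To make the bypass concrete, the following is a hedged C sketch (an
# illustration only, not something this script generates) of two
# consecutive iterations of the loop above, with the second iteration's
# S[I] load hoisted above the first iteration's swap and the forwarding
# applied when that hoisted load hit the slot the swap overwrites:
#
#     #include <stdint.h>
#
#     static void rc4_two_bytes(uint8_t S[256], uint8_t *Ip, uint8_t *Jp,
#                               const uint8_t in[2], uint8_t out[2])
#     {
#         uint8_t I1 = *Ip, J0 = *Jp;
#         uint8_t I2 = (uint8_t)(I1 + 1);
#
#         uint8_t SI1 = S[I1];        /* LDI, iteration 1 */
#         uint8_t SI2 = S[I2];        /* LDI, iteration 2: issued before SWP 1 */
#
#         /* iteration 1 */
#         uint8_t J1  = (uint8_t)(J0 + SI1);
#         uint8_t SJ1 = S[J1];
#         S[I1] = SJ1;  S[J1] = SI1;  /* SWP, iteration 1 */
#         out[0] = in[0] ^ S[(uint8_t)(SI1 + SJ1)];
#
#         if (I2 == J1)               /* hoisted load read a stale slot:     */
#             SI2 = SI1;              /* forward S[I] from the 1st iteration */
#
#         /* iteration 2 */
#         uint8_t J2  = (uint8_t)(J1 + SI2);
#         uint8_t SJ2 = S[J2];
#         S[I2] = SJ2;  S[J2] = SI2;
#         out[1] = in[1] ^ S[(uint8_t)(SI2 + SJ2)];
#
#         *Ip = (uint8_t)(I2 + 1);
#         *Jp = J2;
#     }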
$phases = 4; # number of stages/phases in the pipelined-loop
$unroll_count = 6; # number of times we unrolled it
$pComI = (1 << 0);
$pComJ = (1 << 1);
$pComT = (1 << 2);
$pOut = (1 << 3);
$NData = 4;
$NIP = 3;
$NJP = 2;
$NI = 2;
$NSI = 3;
$NSJ = 2;
$NT = 2;
$NOutWord = 2;
#
# $threshold is the minimum length before we attempt to use the
# big software-pipelined loop. It MUST be greater-or-equal
# to:
# PHASES * (UNROLL_COUNT + 1) + 7
#
# The "+ 7" comes from the fact we may have to encode up to
# 7 bytes separately before the output pointer is aligned.
#
$threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
sub I {
local *code = shift;
local $format = shift;
$code .= sprintf ("\t\t".$format."\n", @_);
}
sub P {
local *code = shift;
local $format = shift;
$code .= sprintf ($format."\n", @_);
}
sub STOP {
local *code = shift;
$code .=<<___;
;;
___
}
sub emit_body {
local *c = shift;
local *bypass = shift;
local ($iteration, $p) = @_;
local $i0 = $iteration;
local $i1 = $iteration - 1;
local $i2 = $iteration - 2;
local $i3 = $iteration - 3;
local $iw0 = ($iteration - 3) / 8;
local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
local $byte_num = ($iteration - 3) % 8;
local $label = $iteration + 1;
local $pAny = ($p & 0xf) == 0xf;
local $pByp = (($p & $pComI) && ($iteration > 0));
$c.=<<___;
//////////////////////////////////////////////////
___
if (($p & 0xf) == 0) {
$c.="#ifdef HOST_IS_BIG_ENDIAN\n";
&I(\$c,"shr.u OutWord[%u] = OutWord[%u], 32;;",
$iw1 % $NOutWord, $iw1 % $NOutWord);
$c.="#endif\n";
&I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
return;
}
# Cycle 0
&I(\$c, "{ .mmi") if ($pAny);
&I(\$c, "ld1 Data[%u] = [InPtr], 1", $i0 % $NData) if ($p & $pComI);
&I(\$c, "padd1 I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);
&I(\$c, "zxt1 J = J") if ($p & $pComJ);
&I(\$c, "}") if ($pAny);
&I(\$c, "{ .mmi") if ($pAny);
&I(\$c, "LKEY T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT) if ($p & $pOut);
&I(\$c, "add T[%u] = SI[%u], SJ[%u]",
$i0 % $NT, $i2 % $NSI, $i1 % $NSJ) if ($p & $pComT);
&I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);
&I(\$c, "}") if ($pAny);
&STOP(\$c);
# Cycle 1
&I(\$c, "{ .mmi") if ($pAny);
&I(\$c, "SKEY [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);
&I(\$c, "SKEY [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);
&I(\$c, "zxt1 T[%u] = T[%u]", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
&I(\$c, "}") if ($pAny);
&I(\$c, "{ .mmi") if ($pAny);
&I(\$c, "LKEY SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);
&I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP) if ($p & $pComJ);
&I(\$c, "xor Data[%u] = Data[%u], T[%u]",
$i3 % $NData, $i3 % $NData, $i1 % $NT) if ($p & $pOut);
&I(\$c, "}") if ($pAny);
&STOP(\$c);
# Cycle 2
&I(\$c, "{ .mmi") if ($pAny);
&I(\$c, "LKEY SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);
&I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI) if ($pByp);
&I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
$iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);
&I(\$c, "}") if ($pAny);
&I(\$c, "{ .mmb") if ($pAny);
&I(\$c, "add J = J, SI[%u]", $i0 % $NSI) if ($p & $pComI);
&I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
&P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);
&I(\$c, "}") if ($pAny);
&STOP(\$c);
&P(\$c, ".rc4Resume%u:", $label) if ($pByp);
if ($byte_num == 0 && $iteration >= $phases) {
&I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
$iw1 % $NOutWord) if ($p & $pOut);
if ($iteration == (1 + $unroll_count) * $phases - 1) {
if ($unroll_count == 6) {
&I(\$c, "mov OutWord[%u] = OutWord[%u]",
$iw1 % $NOutWord, $iw0 % $NOutWord);
}
&I(\$c, "lfetch.nt1 [InPrefetch], %u",
$unroll_count * $phases);
&I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
$unroll_count * $phases);
&I(\$c, "br.cloop.sptk.few .rc4Loop");
}
}
if ($pByp) {
&P(\$bypass, ".rc4Bypass%u:", $label);
&I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
&I(\$bypass, "nop 0");
&I(\$bypass, "nop 0");
&I(\$bypass, ";;");
&I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
&I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
&I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
&I(\$bypass, ";;");
}
}
$code=<<___;
.ident \"rc4-ia64.s, version 3.0\"
.ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"
#define LCSave r8
#define PRSave r9
/* Inputs become invalid once rotation begins! */
#define StateTable in0
#define DataLen in1
#define InputBuffer in2
#define OutputBuffer in3
#define KTable r14
#define J r15
#define InPtr r16
#define OutPtr r17
#define InPrefetch r18
#define OutPrefetch r19
#define One r20
#define LoopCount r21
#define Remainder r22
#define IFinal r23
#define EndPtr r24
#define tmp0 r25
#define tmp1 r26
#define pBypass p6
#define pDone p7
#define pSmall p8
#define pAligned p9
#define pUnaligned p10
#define pComputeI pPhase[0]
#define pComputeJ pPhase[1]
#define pComputeT pPhase[2]
#define pOutput pPhase[3]
#define RetVal r8
#define L_OK p7
#define L_NOK p8
#define _NINPUTS 4
#define _NOUTPUT 0
#define _NROTATE 24
#define _NLOCALS (_NROTATE - _NINPUTS - _NOUTPUT)
#ifndef SZ
# define SZ 4 // this must be set to sizeof(RC4_INT)
#endif
#if SZ == 1
# define LKEY ld1
# define SKEY st1
# define KEYADDR(dst, i) add dst = i, KTable
#elif SZ == 2
# define LKEY ld2
# define SKEY st2
# define KEYADDR(dst, i) shladd dst = i, 1, KTable
#elif SZ == 4
# define LKEY ld4
# define SKEY st4
# define KEYADDR(dst, i) shladd dst = i, 2, KTable
#else
# define LKEY ld8
# define SKEY st8
# define KEYADDR(dst, i) shladd dst = i, 3, KTable
#endif
#if defined(_HPUX_SOURCE) && !defined(_LP64)
# define ADDP addp4
#else
# define ADDP add
#endif
/* Define a macro for the bit number of the n-th byte: */
#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
# define HOST_IS_BIG_ENDIAN
# define BYTE_POS(n) (56 - (8 * (n)))
#else
# define BYTE_POS(n) (8 * (n))
#endif
/*
We must perform the first phase of the pipeline explicitly since
we will always load from the state table the first time. The br.cexit
will never be taken, regardless of the number of bytes, because
the epilogue count is 4.
*/
/* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
assembler failed on original macro with syntax error. <appro> */
#define MODSCHED_RC4_PROLOGUE \\
{ \\
ld1 Data[0] = [InPtr], 1; \\
add IFinal = 1, I[1]; \\
KEYADDR(IPr[0], I[1]); \\
} ;; \\
{ \\
LKEY SI[0] = [IPr[0]]; \\
mov pr.rot = 0x10000; \\
mov ar.ec = 4; \\
} ;; \\
{ \\
add J = J, SI[0]; \\
zxt1 I[0] = IFinal; \\
br.cexit.spnt.few .+16; /* never taken */ \\
} ;;
#define MODSCHED_RC4_LOOP(label) \\
label: \\
{ .mmi; \\
(pComputeI) ld1 Data[0] = [InPtr], 1; \\
(pComputeI) add IFinal = 1, I[1]; \\
(pComputeJ) zxt1 J = J; \\
}{ .mmi; \\
(pOutput) LKEY T[1] = [T[1]]; \\
(pComputeT) add T[0] = SI[2], SJ[1]; \\
(pComputeI) KEYADDR(IPr[0], I[1]); \\
} ;; \\
{ .mmi; \\
(pComputeT) SKEY [IPr[2]] = SJ[1]; \\
(pComputeT) SKEY [JP[1]] = SI[2]; \\
(pComputeT) zxt1 T[0] = T[0]; \\
}{ .mmi; \\
(pComputeI) LKEY SI[0] = [IPr[0]]; \\
(pComputeJ) KEYADDR(JP[0], J); \\
(pComputeI) cmp.eq.unc pBypass, p0 = I[1], J; \\
} ;; \\
{ .mmi; \\
(pComputeJ) LKEY SJ[0] = [JP[0]]; \\
(pOutput) xor Data[3] = Data[3], T[1]; \\
nop 0x0; \\
}{ .mmi; \\
(pComputeT) KEYADDR(T[0], T[0]); \\
(pBypass) mov SI[0] = SI[1]; \\
(pComputeI) zxt1 I[0] = IFinal; \\
} ;; \\
{ .mmb; \\
(pOutput) st1 [OutPtr] = Data[3], 1; \\
(pComputeI) add J = J, SI[0]; \\
br.ctop.sptk.few label; \\
} ;;
.text
.align 32
.type RC4, \@function
.global RC4
.proc RC4
.prologue
RC4:
{
.mmi
alloc r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
.rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
OutWord[2]
.rotp pPhase[4]
ADDP InPrefetch = 0, InputBuffer
ADDP KTable = 0, StateTable
}
{
.mmi
ADDP InPtr = 0, InputBuffer
ADDP OutPtr = 0, OutputBuffer
mov RetVal = r0
}
;;
{
.mmi
lfetch.nt1 [InPrefetch], 0x80
ADDP OutPrefetch = 0, OutputBuffer
}
{ // Return 0 if the input length is nonsensical
.mib
ADDP StateTable = 0, StateTable
cmp.ge.unc L_NOK, L_OK = r0, DataLen
(L_NOK) br.ret.sptk.few rp
}
;;
{
.mib
cmp.eq.or L_NOK, L_OK = r0, InPtr
cmp.eq.or L_NOK, L_OK = r0, OutPtr
nop 0x0
}
{
.mib
cmp.eq.or L_NOK, L_OK = r0, StateTable
nop 0x0
(L_NOK) br.ret.sptk.few rp
}
;;
LKEY I[1] = [KTable], SZ
/* Prefetch the state-table. It contains 256 elements of size SZ */
#if SZ == 1
ADDP tmp0 = 1*128, StateTable
#elif SZ == 2
ADDP tmp0 = 3*128, StateTable
ADDP tmp1 = 2*128, StateTable
#elif SZ == 4
ADDP tmp0 = 7*128, StateTable
ADDP tmp1 = 6*128, StateTable
#elif SZ == 8
ADDP tmp0 = 15*128, StateTable
ADDP tmp1 = 14*128, StateTable
#endif
;;
#if SZ >= 8
lfetch.fault.nt1 [tmp0], -256 // 15
lfetch.fault.nt1 [tmp1], -256;;
lfetch.fault.nt1 [tmp0], -256 // 13
lfetch.fault.nt1 [tmp1], -256;;
lfetch.fault.nt1 [tmp0], -256 // 11
lfetch.fault.nt1 [tmp1], -256;;
lfetch.fault.nt1 [tmp0], -256 // 9
lfetch.fault.nt1 [tmp1], -256;;
#endif
#if SZ >= 4
lfetch.fault.nt1 [tmp0], -256 // 7
lfetch.fault.nt1 [tmp1], -256;;
lfetch.fault.nt1 [tmp0], -256 // 5
lfetch.fault.nt1 [tmp1], -256;;
#endif
#if SZ >= 2
lfetch.fault.nt1 [tmp0], -256 // 3
lfetch.fault.nt1 [tmp1], -256;;
#endif
{
.mii
lfetch.fault.nt1 [tmp0] // 1
add I[1]=1,I[1];;
zxt1 I[1]=I[1]
}
{
.mmi
lfetch.nt1 [InPrefetch], 0x80
lfetch.excl.nt1 [OutPrefetch], 0x80
.save pr, PRSave
mov PRSave = pr
} ;;
{
.mmi
lfetch.excl.nt1 [OutPrefetch], 0x80
LKEY J = [KTable], SZ
ADDP EndPtr = DataLen, InPtr
} ;;
{
.mmi
ADDP EndPtr = -1, EndPtr // Make it point to
// last data byte.
mov One = 1
.save ar.lc, LCSave
mov LCSave = ar.lc
.body
} ;;
{
.mmb
sub Remainder = 0, OutPtr
cmp.gtu pSmall, p0 = $threshold, DataLen
(pSmall) br.cond.dpnt .rc4Remainder // Data too small for
// big loop.
} ;;
{
.mmi
and Remainder = 0x7, Remainder
;;
cmp.eq pAligned, pUnaligned = Remainder, r0
nop 0x0
} ;;
{
.mmb
.pred.rel "mutex",pUnaligned,pAligned
(pUnaligned) add Remainder = -1, Remainder
(pAligned) sub Remainder = EndPtr, InPtr
(pAligned) br.cond.dptk.many .rc4Aligned
} ;;
{
.mmi
nop 0x0
nop 0x0
mov.i ar.lc = Remainder
}
/* Do the initial few bytes via the compact, modulo-scheduled loop
until the output pointer is 8-byte-aligned. */
MODSCHED_RC4_PROLOGUE
MODSCHED_RC4_LOOP(.RC4AlignLoop)
{
.mib
sub Remainder = EndPtr, InPtr
zxt1 IFinal = IFinal
clrrrb // Clear CFM.rrb.pr so
;; // next "mov pr.rot = N"
// does the right thing.
}
{
.mmi
mov I[1] = IFinal
nop 0x0
nop 0x0
} ;;
.rc4Aligned:
/*
Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
*/
{
.mlx
add LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
movl Remainder = 0xaaaaaaaaaaaaaaab
} ;;
{
.mmi
setf.sig f6 = LoopCount // M2, M3 6 cyc
setf.sig f7 = Remainder // M2, M3 6 cyc
nop 0x0
} ;;
{
.mfb
nop 0x0
xmpy.hu f6 = f6, f7
nop 0x0
} ;;
{
.mmi
getf.sig LoopCount = f6;; // M2 5 cyc
nop 0x0
shr.u LoopCount = LoopCount, 4
} ;;
{
.mmi
nop 0x0
nop 0x0
mov.i ar.lc = LoopCount
} ;;
/* Now comes the unrolled loop: */
.rc4Prologue:
___
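# The movl Remainder = 0xaaaaaaaaaaaaaaab / xmpy.hu / shr.u ...,4 sequence
# emitted just above divides the adjusted byte count by
# $unroll_count*$phases = 24 without an integer divide. A small C sketch
# of the same reciprocal-multiplication trick (an illustration assuming a
# compiler with __uint128_t, not part of the generated code):
#
#     #include <stdint.h>
#     #include <assert.h>
#
#     static uint64_t div24(uint64_t n)
#     {
#         /* 0xAAAAAAAAAAAAAAAB == (2^65 + 1)/3, so the high 64 bits of
#          * n * magic equal floor(2n/3); shifting right by 4 then gives
#          * floor(n/24), matching the xmpy.hu + shr.u pair above. */
#         const uint64_t magic = 0xAAAAAAAAAAAAAAABull;
#         uint64_t hi = (uint64_t)(((__uint128_t)n * magic) >> 64);
#         return hi >> 4;
#     }
#
#     int main(void)
#     {
#         for (uint64_t n = 0; n < 1000000; n++)
#             assert(div24(n) == n / 24);
#         return 0;
#     }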
$iteration = 0;
# Generate the prologue:
$predicates = 1;
for ($i = 0; $i < $phases; ++$i) {
&emit_body (\$code, \$bypass, $iteration++, $predicates);
$predicates = ($predicates << 1) | 1;
}
$code.=<<___;
.rc4Loop:
___
# Generate the body:
for ($i = 0; $i < $unroll_count*$phases; ++$i) {
&emit_body (\$code, \$bypass, $iteration++, $predicates);
}
$code.=<<___;
.rc4Epilogue:
___
# Generate the epilogue:
for ($i = 0; $i < $phases; ++$i) {
$predicates <<= 1;
&emit_body (\$code, \$bypass, $iteration++, $predicates);
}
$code.=<<___;
{
.mmi
lfetch.nt1 [EndPtr] // fetch line with last byte
mov IFinal = I[1]
nop 0x0
}
.rc4Remainder:
{
.mmi
sub Remainder = EndPtr, InPtr // Calculate
// # of bytes
// left - 1
nop 0x0
nop 0x0
} ;;
{
.mib
cmp.eq pDone, p0 = -1, Remainder // done already?
mov.i ar.lc = Remainder
(pDone) br.cond.dptk.few .rc4Complete
}
/* Do the remaining bytes via the compact, modulo-scheduled loop */
MODSCHED_RC4_PROLOGUE
MODSCHED_RC4_LOOP(.RC4RestLoop)
.rc4Complete:
{
.mmi
add KTable = -SZ, KTable
add IFinal = -1, IFinal
mov ar.lc = LCSave
} ;;
{
.mii
SKEY [KTable] = J,-SZ
zxt1 IFinal = IFinal
mov pr = PRSave, 0x1FFFF
} ;;
{
.mib
SKEY [KTable] = IFinal
add RetVal = 1, r0
br.ret.sptk.few rp
} ;;
___
# Last but not least, emit the code for the bypass-code of the unrolled loop:
$code.=$bypass;
$code.=<<___;
.endp RC4
___
print $code;

File diff suppressed because it is too large


@@ -0,0 +1,632 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# June 2011
#
# This is an RC4+MD5 "stitch" implementation. The idea, as spelled out
# in http://download.intel.com/design/intarch/papers/323686.pdf, is that
# since both algorithms exhibit instruction-level parallelism, ILP,
# below the theoretical maximum, interleaving them allows processor
# resources to be utilized better, achieving better overall performance.
# The RC4 instruction sequence is virtually identical to rc4-x86_64.pl,
# which is heavily based on submission by Maxim Perminov, Maxim
# Locktyukhin and Jim Guilford of Intel. MD5 is a fresh implementation
# aiming to minimize register usage, which was used as the "main thread"
# with RC4 woven into it, one RC4 round per MD5 round. In addition to
# the stitched subroutine the script can generate standalone replacement
# md5_block_asm_data_order and RC4. Below are performance numbers in
# cycles per processed byte (less is better) for the standalone
# subroutines, their sum, and the stitched one:
#
#                RC4     MD5     RC4+MD5  stitch  gain
# Opteron        6.5(*)  5.4     11.9     7.0     +70%(*)
# Core2          6.5     5.8     12.3     7.7     +60%
# Westmere       4.3     5.2      9.5     7.0     +36%
# Sandy Bridge   4.2     5.5      9.7     6.8     +43%
# Atom           9.3     6.5     15.8    11.1     +42%
#
# (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
# is +53%...
my ($rc4,$md5)=(1,1); # what to generate?
my $D="#" if (!$md5); # if set to "#", MD5 is stitched into RC4(),
# but its result is discarded. Idea here is
# to be able to use 'openssl speed rc4' for
# benchmarking the stitched subroutine...
my $flavour = shift;
my $output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs);
if ($rc4 && !$md5) {
($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx");
$func="RC4"; $nargs=4;
} elsif ($md5 && !$rc4) {
($ctx,$inp,$len) = ("%rdi","%rsi","%rdx");
$func="md5_block_asm_data_order"; $nargs=3;
} else {
($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
$func="rc4_md5_enc"; $nargs=6;
# void rc4_md5_enc(
# RC4_KEY *key, #
# const void *in0, # RC4 input
# void *out, # RC4 output
# MD5_CTX *ctx, #
# const void *inp, # MD5 input
# size_t len); # number of 64-byte blocks
}
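# As a hedged usage sketch based only on the prototype above (OpenSSL
# headers, key and context setup are assumed; this caller is not part of
# the module), the stitched routine processes len 64-byte blocks of two
# independent inputs in one pass:
#
#     #include <stddef.h>
#     #include <openssl/rc4.h>
#     #include <openssl/md5.h>
#
#     /* provided by the generated module, per the comment above */
#     void rc4_md5_enc(RC4_KEY *key, const void *in0, void *out,
#                      MD5_CTX *ctx, const void *inp, size_t len);
#
#     /* encrypt blocks*64 bytes with RC4 while MD5-hashing another
#      * blocks*64 bytes in the same stitched pass */
#     static void encrypt_and_hash(RC4_KEY *key, MD5_CTX *ctx,
#                                  const unsigned char *rc4_in,
#                                  unsigned char *rc4_out,
#                                  const unsigned char *md5_in,
#                                  size_t blocks)
#     {
#         rc4_md5_enc(key, rc4_in, rc4_out, ctx, md5_in, blocks);
#     }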
my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
0x6b901122,0xfd987193,0xa679438e,0x49b40821,
0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391 );
my @V=("%r8d","%r9d","%r10d","%r11d"); # MD5 registers
my $tmp="%r12d";
my @XX=("%rbp","%rsi"); # RC4 registers
my @TX=("%rax","%rbx");
my $YY="%rcx";
my $TY="%rdx";
my $MOD=32; # 16, 32 or 64
$code.=<<___;
.text
.align 16
.globl $func
.type $func,\@function,$nargs
$func:
cmp \$0,$len
je .Labort
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
sub \$40,%rsp
.Lbody:
___
if ($rc4) {
$code.=<<___;
$D#md5# mov $ctx,%r11 # reassign arguments
mov $len,%r12
mov $in0,%r13
mov $out,%r14
$D#md5# mov $inp,%r15
___
$ctx="%r11" if ($md5); # reassign arguments
$len="%r12";
$in0="%r13";
$out="%r14";
$inp="%r15" if ($md5);
$inp=$in0 if (!$md5);
$code.=<<___;
xor $XX[0],$XX[0]
xor $YY,$YY
lea 8($dat),$dat
mov -8($dat),$XX[0]#b
mov -4($dat),$YY#b
inc $XX[0]#b
sub $in0,$out
movl ($dat,$XX[0],4),$TX[0]#d
___
$code.=<<___ if (!$md5);
xor $TX[1],$TX[1]
test \$-128,$len
jz .Loop1
sub $XX[0],$TX[1]
and \$`$MOD-1`,$TX[1]
jz .Loop${MOD}_is_hot
sub $TX[1],$len
.Loop${MOD}_warmup:
add $TX[0]#b,$YY#b
movl ($dat,$YY,4),$TY#d
movl $TX[0]#d,($dat,$YY,4)
movl $TY#d,($dat,$XX[0],4)
add $TY#b,$TX[0]#b
inc $XX[0]#b
movl ($dat,$TX[0],4),$TY#d
movl ($dat,$XX[0],4),$TX[0]#d
xorb ($in0),$TY#b
movb $TY#b,($out,$in0)
lea 1($in0),$in0
dec $TX[1]
jnz .Loop${MOD}_warmup
mov $YY,$TX[1]
xor $YY,$YY
mov $TX[1]#b,$YY#b
.Loop${MOD}_is_hot:
mov $len,32(%rsp) # save original $len
shr \$6,$len # number of 64-byte blocks
___
if ($D && !$md5) { # stitch in dummy MD5
$md5=1;
$ctx="%r11";
$inp="%r15";
$code.=<<___;
mov %rsp,$ctx
mov $in0,$inp
___
}
}
$code.=<<___;
#rc4# add $TX[0]#b,$YY#b
#rc4# lea ($dat,$XX[0],4),$XX[1]
shl \$6,$len
add $inp,$len # pointer to the end of input
mov $len,16(%rsp)
#md5# mov $ctx,24(%rsp) # save pointer to MD5_CTX
#md5# mov 0*4($ctx),$V[0] # load current hash value from MD5_CTX
#md5# mov 1*4($ctx),$V[1]
#md5# mov 2*4($ctx),$V[2]
#md5# mov 3*4($ctx),$V[3]
jmp .Loop
.align 16
.Loop:
#md5# mov $V[0],0*4(%rsp) # put aside current hash value
#md5# mov $V[1],1*4(%rsp)
#md5# mov $V[2],2*4(%rsp)
#md5# mov $V[3],$tmp # forward reference
#md5# mov $V[3],3*4(%rsp)
___
sub R0 {
my ($i,$a,$b,$c,$d)=@_;
my @rot0=(7,12,17,22);
my $j=$i%16;
my $k=$i%$MOD;
my $xmm="%xmm".($j&1);
$code.=" movdqu ($in0),%xmm2\n" if ($rc4 && $j==15);
$code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
$code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
$code.=<<___;
#rc4# movl ($dat,$YY,4),$TY#d
#md5# xor $c,$tmp
#rc4# movl $TX[0]#d,($dat,$YY,4)
#md5# and $b,$tmp
#md5# add 4*`$j`($inp),$a
#rc4# add $TY#b,$TX[0]#b
#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5# add \$$K[$i],$a
#md5# xor $d,$tmp
#rc4# movz $TX[0]#b,$TX[0]#d
#rc4# movl $TY#d,4*$k($XX[1])
#md5# add $tmp,$a
#rc4# add $TX[1]#b,$YY#b
#md5# rol \$$rot0[$j%4],$a
#md5# mov `$j==15?"$b":"$c"`,$tmp # forward reference
#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5# add $b,$a
___
$code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
mov $YY,$XX[1]
xor $YY,$YY # keyword to partial register
mov $XX[1]#b,$YY#b
lea ($dat,$XX[0],4),$XX[1]
___
$code.=<<___ if ($rc4 && $j==15);
psllq \$8,%xmm1
pxor %xmm0,%xmm2
pxor %xmm1,%xmm2
___
}
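# For orientation, the MD5 half of one R0 step above computes the
# standard round-1 operation a = rotl(a + F(b,c,d) + X[j] + K[i], s) + b
# with F(b,c,d) = (b & (c ^ d)) ^ d. A minimal C rendering (an
# illustration, not emitted by this script):
#
#     #include <stdint.h>
#
#     static uint32_t rotl32(uint32_t x, int s)
#     {
#         return (x << s) | (x >> (32 - s));
#     }
#
#     static void md5_r0_step(uint32_t *a, uint32_t b, uint32_t c,
#                             uint32_t d, uint32_t x, uint32_t k, int s)
#     {
#         *a += ((b & (c ^ d)) ^ d) + x + k;   /* F, message word, constant */
#         *a  = rotl32(*a, s) + b;             /* rotate and add b */
#     }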
sub R1 {
my ($i,$a,$b,$c,$d)=@_;
my @rot1=(5,9,14,20);
my $j=$i%16;
my $k=$i%$MOD;
my $xmm="%xmm".($j&1);
$code.=" movdqu 16($in0),%xmm3\n" if ($rc4 && $j==15);
$code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
$code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
$code.=<<___;
#rc4# movl ($dat,$YY,4),$TY#d
#md5# xor $b,$tmp
#rc4# movl $TX[0]#d,($dat,$YY,4)
#md5# and $d,$tmp
#md5# add 4*`((1+5*$j)%16)`($inp),$a
#rc4# add $TY#b,$TX[0]#b
#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5# add \$$K[$i],$a
#md5# xor $c,$tmp
#rc4# movz $TX[0]#b,$TX[0]#d
#rc4# movl $TY#d,4*$k($XX[1])
#md5# add $tmp,$a
#rc4# add $TX[1]#b,$YY#b
#md5# rol \$$rot1[$j%4],$a
#md5# mov `$j==15?"$c":"$b"`,$tmp # forward reference
#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5# add $b,$a
___
$code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
mov $YY,$XX[1]
xor $YY,$YY # keyword to partial register
mov $XX[1]#b,$YY#b
lea ($dat,$XX[0],4),$XX[1]
___
$code.=<<___ if ($rc4 && $j==15);
psllq \$8,%xmm1
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
___
}
sub R2 {
my ($i,$a,$b,$c,$d)=@_;
my @rot2=(4,11,16,23);
my $j=$i%16;
my $k=$i%$MOD;
my $xmm="%xmm".($j&1);
$code.=" movdqu 32($in0),%xmm4\n" if ($rc4 && $j==15);
$code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
$code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
$code.=<<___;
#rc4# movl ($dat,$YY,4),$TY#d
#md5# xor $c,$tmp
#rc4# movl $TX[0]#d,($dat,$YY,4)
#md5# xor $b,$tmp
#md5# add 4*`((5+3*$j)%16)`($inp),$a
#rc4# add $TY#b,$TX[0]#b
#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5# add \$$K[$i],$a
#rc4# movz $TX[0]#b,$TX[0]#d
#md5# add $tmp,$a
#rc4# movl $TY#d,4*$k($XX[1])
#rc4# add $TX[1]#b,$YY#b
#md5# rol \$$rot2[$j%4],$a
#md5# mov `$j==15?"\\\$-1":"$c"`,$tmp # forward reference
#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5# add $b,$a
___
$code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
mov $YY,$XX[1]
xor $YY,$YY # keyword to partial register
mov $XX[1]#b,$YY#b
lea ($dat,$XX[0],4),$XX[1]
___
$code.=<<___ if ($rc4 && $j==15);
psllq \$8,%xmm1
pxor %xmm0,%xmm4
pxor %xmm1,%xmm4
___
}
sub R3 {
my ($i,$a,$b,$c,$d)=@_;
my @rot3=(6,10,15,21);
my $j=$i%16;
my $k=$i%$MOD;
my $xmm="%xmm".($j&1);
$code.=" movdqu 48($in0),%xmm5\n" if ($rc4 && $j==15);
$code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
$code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
$code.=<<___;
#rc4# movl ($dat,$YY,4),$TY#d
#md5# xor $d,$tmp
#rc4# movl $TX[0]#d,($dat,$YY,4)
#md5# or $b,$tmp
#md5# add 4*`((7*$j)%16)`($inp),$a
#rc4# add $TY#b,$TX[0]#b
#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5# add \$$K[$i],$a
#rc4# movz $TX[0]#b,$TX[0]#d
#md5# xor $c,$tmp
#rc4# movl $TY#d,4*$k($XX[1])
#md5# add $tmp,$a
#rc4# add $TX[1]#b,$YY#b
#md5# rol \$$rot3[$j%4],$a
#md5# mov \$-1,$tmp # forward reference
#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5# add $b,$a
___
$code.=<<___ if ($rc4 && $j==15);
mov $XX[0],$XX[1]
xor $XX[0],$XX[0] # keyword to partial register
mov $XX[1]#b,$XX[0]#b
mov $YY,$XX[1]
xor $YY,$YY # keyword to partial register
mov $XX[1]#b,$YY#b
lea ($dat,$XX[0],4),$XX[1]
psllq \$8,%xmm1
pxor %xmm0,%xmm5
pxor %xmm1,%xmm5
___
}
my $i=0;
for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
$code.=<<___;
#md5# add 0*4(%rsp),$V[0] # accumulate hash value
#md5# add 1*4(%rsp),$V[1]
#md5# add 2*4(%rsp),$V[2]
#md5# add 3*4(%rsp),$V[3]
#rc4# movdqu %xmm2,($out,$in0) # write RC4 output
#rc4# movdqu %xmm3,16($out,$in0)
#rc4# movdqu %xmm4,32($out,$in0)
#rc4# movdqu %xmm5,48($out,$in0)
#md5# lea 64($inp),$inp
#rc4# lea 64($in0),$in0
cmp 16(%rsp),$inp # are we done?
jb .Loop
#md5# mov 24(%rsp),$len # restore pointer to MD5_CTX
#rc4# sub $TX[0]#b,$YY#b # correct $YY
#md5# mov $V[0],0*4($len) # write MD5_CTX
#md5# mov $V[1],1*4($len)
#md5# mov $V[2],2*4($len)
#md5# mov $V[3],3*4($len)
___
$code.=<<___ if ($rc4 && (!$md5 || $D));
mov 32(%rsp),$len # restore original $len
and \$63,$len # remaining bytes
jnz .Loop1
jmp .Ldone
.align 16
.Loop1:
add $TX[0]#b,$YY#b
movl ($dat,$YY,4),$TY#d
movl $TX[0]#d,($dat,$YY,4)
movl $TY#d,($dat,$XX[0],4)
add $TY#b,$TX[0]#b
inc $XX[0]#b
movl ($dat,$TX[0],4),$TY#d
movl ($dat,$XX[0],4),$TX[0]#d
xorb ($in0),$TY#b
movb $TY#b,($out,$in0)
lea 1($in0),$in0
dec $len
jnz .Loop1
.Ldone:
___
$code.=<<___;
#rc4# sub \$1,$XX[0]#b
#rc4# movl $XX[0]#d,-8($dat)
#rc4# movl $YY#d,-4($dat)
mov 40(%rsp),%r15
mov 48(%rsp),%r14
mov 56(%rsp),%r13
mov 64(%rsp),%r12
mov 72(%rsp),%rbp
mov 80(%rsp),%rbx
lea 88(%rsp),%rsp
.Lepilogue:
.Labort:
ret
.size $func,.-$func
___
if ($rc4 && $D) { # sole purpose of this section is to provide
# option to use the generated module as drop-in
# replacement for rc4-x86_64.pl for debugging
# and testing purposes...
my ($idx,$ido)=("%r8","%r9");
my ($dat,$len,$inp)=("%rdi","%rsi","%rdx");
$code.=<<___;
.globl RC4_set_key
.type RC4_set_key,\@function,3
.align 16
RC4_set_key:
lea 8($dat),$dat
lea ($inp,$len),$inp
neg $len
mov $len,%rcx
xor %eax,%eax
xor $ido,$ido
xor %r10,%r10
xor %r11,%r11
jmp .Lw1stloop
.align 16
.Lw1stloop:
mov %eax,($dat,%rax,4)
add \$1,%al
jnc .Lw1stloop
xor $ido,$ido
xor $idx,$idx
.align 16
.Lw2ndloop:
mov ($dat,$ido,4),%r10d
add ($inp,$len,1),$idx#b
add %r10b,$idx#b
add \$1,$len
mov ($dat,$idx,4),%r11d
cmovz %rcx,$len
mov %r10d,($dat,$idx,4)
mov %r11d,($dat,$ido,4)
add \$1,$ido#b
jnc .Lw2ndloop
xor %eax,%eax
mov %eax,-8($dat)
mov %eax,-4($dat)
ret
.size RC4_set_key,.-RC4_set_key
.globl RC4_options
.type RC4_options,\@abi-omnipotent
.align 16
RC4_options:
lea .Lopts(%rip),%rax
ret
.align 64
.Lopts:
.asciz "rc4(64x,int)"
.align 64
.size RC4_options,.-RC4_options
___
}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
my $rec="%rcx";
my $frame="%rdx";
my $context="%r8";
my $disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
lea .Lbody(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lbody
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
lea .Lepilogue(%rip),%r10
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lin_prologue
mov 40(%rax),%r15
mov 48(%rax),%r14
mov 56(%rax),%r13
mov 64(%rax),%r12
mov 72(%rax),%rbp
mov 80(%rax),%rbx
lea 88(%rax),%rax
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
.section .pdata
.align 4
.rva .LSEH_begin_$func
.rva .LSEH_end_$func
.rva .LSEH_info_$func
.section .xdata
.align 8
.LSEH_info_$func:
.byte 9,0,0,0
.rva se_handler
___
}
sub reg_part {
my ($reg,$conv)=@_;
if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
return $reg;
}
$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/pinsrw\s+\$0,/movd /gm;
$code =~ s/#md5#//gm if ($md5);
$code =~ s/#rc4#//gm if ($rc4);
print $code;
close STDOUT;


@@ -0,0 +1,314 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# RC4 for PA-RISC.
# June 2009.
#
# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
# For reference, the [4x] unrolled loop is >40% faster than the folded
# one. It's possible to unroll the loop 8 times on PA-RISC 2.0, but the
# improvement is believed to be insufficient to justify the effort...
#
# Special thanks to polarhome.com for providing HP-UX account.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
$flavour = shift;
$output = shift;
open STDOUT,">$output";
if ($flavour =~ /64/) {
$LEVEL ="2.0W";
$SIZE_T =8;
$FRAME_MARKER =80;
$SAVED_RP =16;
$PUSH ="std";
$PUSHMA ="std,ma";
$POP ="ldd";
$POPMB ="ldd,mb";
} else {
$LEVEL ="1.0";
$SIZE_T =4;
$FRAME_MARKER =48;
$SAVED_RP =20;
$PUSH ="stw";
$PUSHMA ="stwm";
$POP ="ldw";
$POPMB ="ldwm";
}
$FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker
# [+ argument transfer]
$SZ=1; # defaults to RC4_CHAR
if (open CONF,"<${dir}../../opensslconf.h") {
while(<CONF>) {
if (m/#\s*define\s+RC4_INT\s+(.*)/) {
$SZ = ($1=~/char$/) ? 1 : 4;
last;
}
}
close CONF;
}
if ($SZ==1) { # RC4_CHAR
$LD="ldb";
$LDX="ldbx";
$MKX="addl";
$ST="stb";
} else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
$LD="ldw";
$LDX="ldwx,s";
$MKX="sh2addl";
$ST="stw";
}
$key="%r26";
$len="%r25";
$inp="%r24";
$out="%r23";
@XX=("%r19","%r20");
@TX=("%r21","%r22");
$YY="%r28";
$TY="%r29";
$acc="%r1";
$ix="%r2";
$iy="%r3";
$dat0="%r4";
$dat1="%r5";
$rem="%r6";
$mask="%r31";
sub unrolledloopbody {
for ($i=0;$i<4;$i++) {
$code.=<<___;
ldo 1($XX[0]),$XX[1]
`sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
and $mask,$XX[1],$XX[1]
$LDX $YY($key),$TY
$MKX $YY,$key,$ix
$LDX $XX[1]($key),$TX[1]
$MKX $XX[0],$key,$iy
$ST $TX[0],0($ix)
comclr,<> $XX[1],$YY,%r0 ; conditional
copy $TX[0],$TX[1] ; move
`sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
$ST $TY,0($iy)
addl $TX[0],$TY,$TY
addl $TX[1],$YY,$YY
and $mask,$TY,$TY
and $mask,$YY,$YY
___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
} }
sub foldedloop {
my ($label,$count)=@_;
$code.=<<___;
$label
$MKX $YY,$key,$iy
$LDX $YY($key),$TY
$MKX $XX[0],$key,$ix
$ST $TX[0],0($iy)
ldo 1($XX[0]),$XX[0]
$ST $TY,0($ix)
addl $TX[0],$TY,$TY
ldbx $inp($out),$dat1
and $mask,$TY,$TY
and $mask,$XX[0],$XX[0]
$LDX $TY($key),$acc
$LDX $XX[0]($key),$TX[0]
ldo 1($out),$out
xor $dat1,$acc,$acc
addl $TX[0],$YY,$YY
stb $acc,-1($out)
addib,<> -1,$count,$label ; $count is always small
and $mask,$YY,$YY
___
}
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
RC4
.PROC
.CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
cmpib,*= 0,$len,L\$abort
sub $inp,$out,$inp ; distance between $inp and $out
$LD `0*$SZ`($key),$XX[0]
$LD `1*$SZ`($key),$YY
ldo `2*$SZ`($key),$key
ldi 0xff,$mask
ldi 3,$dat0
ldo 1($XX[0]),$XX[0] ; warm up loop
and $mask,$XX[0],$XX[0]
$LDX $XX[0]($key),$TX[0]
addl $TX[0],$YY,$YY
cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother?
and $mask,$YY,$YY
and,<> $out,$dat0,$rem ; is $out aligned?
b L\$alignedout
subi 4,$rem,$rem
sub $len,$rem,$len
___
&foldedloop("L\$alignout",$rem); # process till $out is aligned
$code.=<<___;
L\$alignedout ; $len is at least 4 here
and,<> $inp,$dat0,$acc ; is $inp aligned?
b L\$oop4
sub $inp,$acc,$rem ; align $inp
sh3addl $acc,%r0,$acc
subi 32,$acc,$acc
mtctl $acc,%cr11 ; load %sar with vshd align factor
ldwx $rem($out),$dat0
ldo 4($rem),$rem
L\$oop4misalignedinp
___
&unrolledloopbody();
$code.=<<___;
$LDX $TY($key),$ix
ldwx $rem($out),$dat1
ldo -4($len),$len
or $ix,$acc,$acc ; last piece, no need to dep
vshd $dat0,$dat1,$iy ; align data
copy $dat1,$dat0
xor $iy,$acc,$acc
stw $acc,0($out)
cmpib,*<< 3,$len,L\$oop4misalignedinp
ldo 4($out),$out
cmpib,*= 0,$len,L\$done
nop
b L\$oop1
nop
.ALIGN 8
L\$oop4
___
&unrolledloopbody();
$code.=<<___;
$LDX $TY($key),$ix
ldwx $inp($out),$dat0
ldo -4($len),$len
or $ix,$acc,$acc ; last piece, no need to dep
xor $dat0,$acc,$acc
stw $acc,0($out)
cmpib,*<< 3,$len,L\$oop4
ldo 4($out),$out
cmpib,*= 0,$len,L\$done
nop
___
&foldedloop("L\$oop1",$len);
$code.=<<___;
L\$done
$POP `-$FRAME-$SAVED_RP`(%sp),%r2
ldo -1($XX[0]),$XX[0] ; chill out loop
sub $YY,$TX[0],$YY
and $mask,$XX[0],$XX[0]
and $mask,$YY,$YY
$ST $XX[0],`-2*$SZ`($key)
$ST $YY,`-1*$SZ`($key)
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
L\$abort
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
___
$code.=<<___;
.EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
.ALIGN 8
private_RC4_set_key
.PROC
.CALLINFO NO_CALLS
.ENTRY
$ST %r0,`0*$SZ`($key)
$ST %r0,`1*$SZ`($key)
ldo `2*$SZ`($key),$key
copy %r0,@XX[0]
L\$1st
$ST @XX[0],0($key)
ldo 1(@XX[0]),@XX[0]
bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256
ldo $SZ($key),$key
ldo `-256*$SZ`($key),$key ; rewind $key
addl $len,$inp,$inp ; $inp to point at the end
sub %r0,$len,%r23 ; inverse index
copy %r0,@XX[0]
copy %r0,@XX[1]
ldi 0xff,$mask
L\$2nd
$LDX @XX[0]($key),@TX[0]
ldbx %r23($inp),@TX[1]
addi,nuv 1,%r23,%r23 ; increment and conditional
sub %r0,$len,%r23 ; inverse index
addl @TX[0],@XX[1],@XX[1]
addl @TX[1],@XX[1],@XX[1]
and $mask,@XX[1],@XX[1]
$MKX @XX[0],$key,$TY
$LDX @XX[1]($key),@TX[1]
$MKX @XX[1],$key,$YY
ldo 1(@XX[0]),@XX[0]
$ST @TX[0],0($YY)
bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256
$ST @TX[1],0($TY)
bv,n (%r2)
.EXIT
nop
.PROCEND
.EXPORT RC4_options,ENTRY
.ALIGN 8
RC4_options
.PROC
.CALLINFO NO_CALLS
.ENTRY
blr %r0,%r28
ldi 3,%r1
L\$pic
andcm %r28,%r1,%r28
bv (%r2)
.EXIT
ldo L\$opts-L\$pic(%r28),%r28
.PROCEND
.ALIGN 8
L\$opts
.STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
.STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
print $code;
close STDOUT;


@@ -0,0 +1,234 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2009
#
# Performance is 2x that of gcc 3.4.6 generated code on z10. The coding
# "secret" is to "cluster" Address Generation Interlocks, so that one
# pipeline stall resolves several dependencies.
#
# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's a "z-CPU". The latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 50% better than code generated by gcc 4.3.
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$rp="%r14";
$sp="%r15";
$code=<<___;
.text
___
# void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
{
$acc="%r0";
$cnt="%r1";
$key="%r2";
$len="%r3";
$inp="%r4";
$out="%r5";
@XX=("%r6","%r7");
@TX=("%r8","%r9");
$YY="%r10";
$TY="%r11";
$code.=<<___;
.globl RC4
.type RC4,\@function
.align 64
RC4:
stm${g} %r6,%r11,6*$SIZE_T($sp)
___
$code.=<<___ if ($flavour =~ /3[12]/);
llgfr $len,$len
___
$code.=<<___;
llgc $XX[0],0($key)
llgc $YY,1($key)
la $XX[0],1($XX[0])
nill $XX[0],0xff
srlg $cnt,$len,3
ltgr $cnt,$cnt
llgc $TX[0],2($XX[0],$key)
jz .Lshort
j .Loop8
.align 64
.Loop8:
___
for ($i=0;$i<8;$i++) {
$code.=<<___;
la $YY,0($YY,$TX[0]) # $i
nill $YY,255
la $XX[1],1($XX[0])
nill $XX[1],255
___
$code.=<<___ if ($i==1);
llgc $acc,2($TY,$key)
___
$code.=<<___ if ($i>1);
sllg $acc,$acc,8
ic $acc,2($TY,$key)
___
$code.=<<___;
llgc $TY,2($YY,$key)
stc $TX[0],2($YY,$key)
llgc $TX[1],2($XX[1],$key)
stc $TY,2($XX[0],$key)
cr $XX[1],$YY
jne .Lcmov$i
la $TX[1],0($TX[0])
.Lcmov$i:
la $TY,0($TY,$TX[0])
nill $TY,255
___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
}
$code.=<<___;
lg $TX[1],0($inp)
sllg $acc,$acc,8
la $inp,8($inp)
ic $acc,2($TY,$key)
xgr $acc,$TX[1]
stg $acc,0($out)
la $out,8($out)
brctg $cnt,.Loop8
.Lshort:
lghi $acc,7
ngr $len,$acc
jz .Lexit
j .Loop1
.align 16
.Loop1:
la $YY,0($YY,$TX[0])
nill $YY,255
llgc $TY,2($YY,$key)
stc $TX[0],2($YY,$key)
stc $TY,2($XX[0],$key)
ar $TY,$TX[0]
ahi $XX[0],1
nill $TY,255
nill $XX[0],255
llgc $acc,0($inp)
la $inp,1($inp)
llgc $TY,2($TY,$key)
llgc $TX[0],2($XX[0],$key)
xr $acc,$TY
stc $acc,0($out)
la $out,1($out)
brct $len,.Loop1
.Lexit:
ahi $XX[0],-1
stc $XX[0],0($key)
stc $YY,1($key)
lm${g} %r6,%r11,6*$SIZE_T($sp)
br $rp
.size RC4,.-RC4
.string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
}
# void RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
{
$cnt="%r0";
$idx="%r1";
$key="%r2";
$len="%r3";
$inp="%r4";
$acc="%r5";
$dat="%r6";
$ikey="%r7";
$iinp="%r8";
$code.=<<___;
.globl private_RC4_set_key
.type private_RC4_set_key,\@function
.align 64
private_RC4_set_key:
stm${g} %r6,%r8,6*$SIZE_T($sp)
lhi $cnt,256
la $idx,0(%r0)
sth $idx,0($key)
.align 4
.L1stloop:
stc $idx,2($idx,$key)
la $idx,1($idx)
brct $cnt,.L1stloop
lghi $ikey,-256
lr $cnt,$len
la $iinp,0(%r0)
la $idx,0(%r0)
.align 16
.L2ndloop:
llgc $acc,2+256($ikey,$key)
llgc $dat,0($iinp,$inp)
la $idx,0($idx,$acc)
la $ikey,1($ikey)
la $idx,0($idx,$dat)
nill $idx,255
la $iinp,1($iinp)
tml $ikey,255
llgc $dat,2($idx,$key)
stc $dat,2+256-1($ikey,$key)
stc $acc,2($idx,$key)
jz .Ldone
brct $cnt,.L2ndloop
lr $cnt,$len
la $iinp,0(%r0)
j .L2ndloop
.Ldone:
lm${g} %r6,%r8,6*$SIZE_T($sp)
br $rp
.size private_RC4_set_key,.-private_RC4_set_key
___
}
# const char *RC4_options()
$code.=<<___;
.globl RC4_options
.type RC4_options,\@function
.align 16
RC4_options:
larl %r2,.Loptions
br %r14
.size RC4_options,.-RC4_options
.section .rodata
.Loptions:
.align 8
.string "rc4(8x,char)"
___
print $code;
close STDOUT; # force flush


@@ -0,0 +1,615 @@
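# RC4 for x86_64 (apparently the pre-generated assembler output of the
# CRYPTOGAMS rc4-x86_64.pl module that follows later in this commit).
# Three code paths are selected at run time: an 8x-unrolled integer
# path, a 16x SSE path taken on Intel CPUs (OPENSSL_ia32cap_P bit 30),
# and an 8x "char" path for key schedules laid out as a 256-byte array
# (flagged by the -1 marker at offset 256); see RC4_options at the end
# of the file.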
.text
.globl RC4
.type RC4,@function
.align 16
RC4: orq %rsi,%rsi
jne .Lentry
.byte 0xf3,0xc3
.Lentry:
pushq %rbx
pushq %r12
pushq %r13
.Lprologue:
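# Incoming SysV AMD64 arguments: %rdi = RC4_KEY, %rsi = len,
# %rdx = inp, %rcx = out.  len/inp/out are parked in %r11/%r12/%r13,
# x and y are loaded from key offsets 0 and 4 into %r10b and %cl,
# %rdi is advanced by 8 to point at the state array, and out is
# re-biased by -inp so stores can use the (%r13,%r12,1) form.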
movq %rsi,%r11
movq %rdx,%r12
movq %rcx,%r13
xorq %r10,%r10
xorq %rcx,%rcx
leaq 8(%rdi),%rdi
movb -8(%rdi),%r10b
movb -4(%rdi),%cl
cmpl $-1,256(%rdi)
je .LRC4_CHAR
movl OPENSSL_ia32cap_P(%rip),%r8d
xorq %rbx,%rbx
incb %r10b
subq %r10,%rbx
subq %r12,%r13
movl (%rdi,%r10,4),%eax
testq $-16,%r11
jz .Lloop1
btl $30,%r8d
jc .Lintel
andq $7,%rbx
leaq 1(%r10),%rsi
jz .Loop8
subq %rbx,%r11
.Loop8_warmup:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl %edx,(%rdi,%r10,4)
addb %dl,%al
incb %r10b
movl (%rdi,%rax,4),%edx
movl (%rdi,%r10,4),%eax
xorb (%r12),%dl
movb %dl,(%r13,%r12,1)
leaq 1(%r12),%r12
decq %rbx
jnz .Loop8_warmup
leaq 1(%r10),%rsi
jmp .Loop8
.align 16
.Loop8:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 0(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,0(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl 4(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,4(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 8(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,8(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl 12(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,12(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 16(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,16(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl 20(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,20(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl 24(%rdi,%rsi,4),%ebx
rorq $8,%r8
movl %edx,24(%rdi,%r10,4)
addb %al,%dl
movb (%rdi,%rdx,4),%r8b
addb $8,%sil
addb %bl,%cl
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
movl -4(%rdi,%rsi,4),%eax
rorq $8,%r8
movl %edx,28(%rdi,%r10,4)
addb %bl,%dl
movb (%rdi,%rdx,4),%r8b
addb $8,%r10b
rorq $8,%r8
subq $8,%r11
xorq (%r12),%r8
movq %r8,(%r13,%r12,1)
leaq 8(%r12),%r12
testq $-8,%r11
jnz .Loop8
cmpq $0,%r11
jne .Lloop1
jmp .Lexit
.align 16
.Lintel:
testq $-32,%r11
jz .Lloop1
andq $15,%rbx
jz .Loop16_is_hot
subq %rbx,%r11
.Loop16_warmup:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl %edx,(%rdi,%r10,4)
addb %dl,%al
incb %r10b
movl (%rdi,%rax,4),%edx
movl (%rdi,%r10,4),%eax
xorb (%r12),%dl
movb %dl,(%r13,%r12,1)
leaq 1(%r12),%r12
decq %rbx
jnz .Loop16_warmup
movq %rcx,%rbx
xorq %rcx,%rcx
movb %bl,%cl
.Loop16_is_hot:
leaq (%rdi,%r10,4),%rsi
addb %al,%cl
movl (%rdi,%rcx,4),%edx
pxor %xmm0,%xmm0
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 4(%rsi),%ebx
movzbl %al,%eax
movl %edx,0(%rsi)
addb %bl,%cl
pinsrw $0,(%rdi,%rax,4),%xmm0
jmp .Loop16_enter
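# .Loop16 processes 16 bytes per iteration: keystream bytes are
# gathered two per xmm word with pinsrw, even positions into %xmm0 and
# odd positions into %xmm1; %xmm1 is shifted up a byte (psllq $8) and
# both are XORed into the 16 input bytes fetched with movdqu before a
# single movdqu store writes the result.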
.align 16
.Loop16:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
pxor %xmm0,%xmm2
psllq $8,%xmm1
pxor %xmm0,%xmm0
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 4(%rsi),%ebx
movzbl %al,%eax
movl %edx,0(%rsi)
pxor %xmm1,%xmm2
addb %bl,%cl
pinsrw $0,(%rdi,%rax,4),%xmm0
movdqu %xmm2,(%r13,%r12,1)
leaq 16(%r12),%r12
.Loop16_enter:
movl (%rdi,%rcx,4),%edx
pxor %xmm1,%xmm1
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 8(%rsi),%eax
movzbl %bl,%ebx
movl %edx,4(%rsi)
addb %al,%cl
pinsrw $0,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 12(%rsi),%ebx
movzbl %al,%eax
movl %edx,8(%rsi)
addb %bl,%cl
pinsrw $1,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 16(%rsi),%eax
movzbl %bl,%ebx
movl %edx,12(%rsi)
addb %al,%cl
pinsrw $1,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 20(%rsi),%ebx
movzbl %al,%eax
movl %edx,16(%rsi)
addb %bl,%cl
pinsrw $2,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 24(%rsi),%eax
movzbl %bl,%ebx
movl %edx,20(%rsi)
addb %al,%cl
pinsrw $2,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 28(%rsi),%ebx
movzbl %al,%eax
movl %edx,24(%rsi)
addb %bl,%cl
pinsrw $3,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 32(%rsi),%eax
movzbl %bl,%ebx
movl %edx,28(%rsi)
addb %al,%cl
pinsrw $3,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 36(%rsi),%ebx
movzbl %al,%eax
movl %edx,32(%rsi)
addb %bl,%cl
pinsrw $4,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 40(%rsi),%eax
movzbl %bl,%ebx
movl %edx,36(%rsi)
addb %al,%cl
pinsrw $4,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 44(%rsi),%ebx
movzbl %al,%eax
movl %edx,40(%rsi)
addb %bl,%cl
pinsrw $5,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 48(%rsi),%eax
movzbl %bl,%ebx
movl %edx,44(%rsi)
addb %al,%cl
pinsrw $5,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 52(%rsi),%ebx
movzbl %al,%eax
movl %edx,48(%rsi)
addb %bl,%cl
pinsrw $6,(%rdi,%rax,4),%xmm0
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movl 56(%rsi),%eax
movzbl %bl,%ebx
movl %edx,52(%rsi)
addb %al,%cl
pinsrw $6,(%rdi,%rbx,4),%xmm1
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
addb %dl,%al
movl 60(%rsi),%ebx
movzbl %al,%eax
movl %edx,56(%rsi)
addb %bl,%cl
pinsrw $7,(%rdi,%rax,4),%xmm0
addb $16,%r10b
movdqu (%r12),%xmm2
movl (%rdi,%rcx,4),%edx
movl %ebx,(%rdi,%rcx,4)
addb %dl,%bl
movzbl %bl,%ebx
movl %edx,60(%rsi)
leaq (%rdi,%r10,4),%rsi
pinsrw $7,(%rdi,%rbx,4),%xmm1
movl (%rsi),%eax
movq %rcx,%rbx
xorq %rcx,%rcx
subq $16,%r11
movb %bl,%cl
testq $-16,%r11
jnz .Loop16
psllq $8,%xmm1
pxor %xmm0,%xmm2
pxor %xmm1,%xmm2
movdqu %xmm2,(%r13,%r12,1)
leaq 16(%r12),%r12
cmpq $0,%r11
jne .Lloop1
jmp .Lexit
.align 16
.Lloop1:
addb %al,%cl
movl (%rdi,%rcx,4),%edx
movl %eax,(%rdi,%rcx,4)
movl %edx,(%rdi,%r10,4)
addb %dl,%al
incb %r10b
movl (%rdi,%rax,4),%edx
movl (%rdi,%r10,4),%eax
xorb (%r12),%dl
movb %dl,(%r13,%r12,1)
leaq 1(%r12),%r12
decq %r11
jnz .Lloop1
jmp .Lexit
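# .LRC4_CHAR: code path for key schedules laid out as a 256-byte char
# array.  .Lcloop8 produces 8 output bytes per iteration, rotating
# them into %r8d and %r9d; the cmp/jne .LcmovN sequences stand in for
# cmov, patching the prefetched S[x] when the x and y indices collide.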
.align 16
.LRC4_CHAR:
addb $1,%r10b
movzbl (%rdi,%r10,1),%eax
testq $-8,%r11
jz .Lcloop1
jmp .Lcloop8
.align 16
.Lcloop8:
movl (%r12),%r8d
movl 4(%r12),%r9d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne .Lcmov0
movq %rax,%rbx
.Lcmov0:
addb %al,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne .Lcmov1
movq %rbx,%rax
.Lcmov1:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne .Lcmov2
movq %rax,%rbx
.Lcmov2:
addb %al,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne .Lcmov3
movq %rbx,%rax
.Lcmov3:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r8b
rorl $8,%r8d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne .Lcmov4
movq %rax,%rbx
.Lcmov4:
addb %al,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne .Lcmov5
movq %rbx,%rax
.Lcmov5:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
addb %al,%cl
leaq 1(%r10),%rsi
movzbl (%rdi,%rcx,1),%edx
movzbl %sil,%esi
movzbl (%rdi,%rsi,1),%ebx
movb %al,(%rdi,%rcx,1)
cmpq %rsi,%rcx
movb %dl,(%rdi,%r10,1)
jne .Lcmov6
movq %rax,%rbx
.Lcmov6:
addb %al,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
addb %bl,%cl
leaq 1(%rsi),%r10
movzbl (%rdi,%rcx,1),%edx
movzbl %r10b,%r10d
movzbl (%rdi,%r10,1),%eax
movb %bl,(%rdi,%rcx,1)
cmpq %r10,%rcx
movb %dl,(%rdi,%rsi,1)
jne .Lcmov7
movq %rbx,%rax
.Lcmov7:
addb %bl,%dl
xorb (%rdi,%rdx,1),%r9b
rorl $8,%r9d
leaq -8(%r11),%r11
movl %r8d,(%r13)
leaq 8(%r12),%r12
movl %r9d,4(%r13)
leaq 8(%r13),%r13
testq $-8,%r11
jnz .Lcloop8
cmpq $0,%r11
jne .Lcloop1
jmp .Lexit
.align 16
.Lcloop1:
addb %al,%cl
movzbl %cl,%ecx
movzbl (%rdi,%rcx,1),%edx
movb %al,(%rdi,%rcx,1)
movb %dl,(%rdi,%r10,1)
addb %al,%dl
addb $1,%r10b
movzbl %dl,%edx
movzbl %r10b,%r10d
movzbl (%rdi,%rdx,1),%edx
movzbl (%rdi,%r10,1),%eax
xorb (%r12),%dl
leaq 1(%r12),%r12
movb %dl,(%r13)
leaq 1(%r13),%r13
subq $1,%r11
jnz .Lcloop1
jmp .Lexit
.align 16
.Lexit:
subb $1,%r10b
movl %r10d,-8(%rdi)
movl %ecx,-4(%rdi)
movq (%rsp),%r13
movq 8(%rsp),%r12
movq 16(%rsp),%rbx
addq $24,%rsp
.Lepilogue:
.byte 0xf3,0xc3
.size RC4,.-RC4
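# private_RC4_set_key: fill the state with the identity permutation
# (.Lw1stloop for the int layout, .Lc1stloop for the byte layout
# chosen when OPENSSL_ia32cap_P bit 20 is set), then run the standard
# 256-iteration key-mixing loop; the byte layout is flagged for RC4
# proper by storing -1 at offset 256.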
.globl private_RC4_set_key
.type private_RC4_set_key,@function
.align 16
private_RC4_set_key:
leaq 8(%rdi),%rdi
leaq (%rdx,%rsi,1),%rdx
negq %rsi
movq %rsi,%rcx
xorl %eax,%eax
xorq %r9,%r9
xorq %r10,%r10
xorq %r11,%r11
movl OPENSSL_ia32cap_P(%rip),%r8d
btl $20,%r8d
jc .Lc1stloop
jmp .Lw1stloop
.align 16
.Lw1stloop:
movl %eax,(%rdi,%rax,4)
addb $1,%al
jnc .Lw1stloop
xorq %r9,%r9
xorq %r8,%r8
.align 16
.Lw2ndloop:
movl (%rdi,%r9,4),%r10d
addb (%rdx,%rsi,1),%r8b
addb %r10b,%r8b
addq $1,%rsi
movl (%rdi,%r8,4),%r11d
cmovzq %rcx,%rsi
movl %r10d,(%rdi,%r8,4)
movl %r11d,(%rdi,%r9,4)
addb $1,%r9b
jnc .Lw2ndloop
jmp .Lexit_key
.align 16
.Lc1stloop:
movb %al,(%rdi,%rax,1)
addb $1,%al
jnc .Lc1stloop
xorq %r9,%r9
xorq %r8,%r8
.align 16
.Lc2ndloop:
movb (%rdi,%r9,1),%r10b
addb (%rdx,%rsi,1),%r8b
addb %r10b,%r8b
addq $1,%rsi
movb (%rdi,%r8,1),%r11b
jnz .Lcnowrap
movq %rcx,%rsi
.Lcnowrap:
movb %r10b,(%rdi,%r8,1)
movb %r11b,(%rdi,%r9,1)
addb $1,%r9b
jnc .Lc2ndloop
movl $-1,256(%rdi)
.align 16
.Lexit_key:
xorl %eax,%eax
movl %eax,-8(%rdi)
movl %eax,-4(%rdi)
.byte 0xf3,0xc3
.size private_RC4_set_key,.-private_RC4_set_key
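# RC4_options returns a pointer into .Lopts below; the .byte runs
# encode the NUL-terminated strings "rc4(8x,int)", "rc4(8x,char)" and
# "rc4(16x,int)", so the +12 and +25 adjustments simply skip past the
# first one or two strings depending on the ia32cap bit-20/bit-30
# checks.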
.globl RC4_options
.type RC4_options,@function
.align 16
RC4_options:
leaq .Lopts(%rip),%rax
movl OPENSSL_ia32cap_P(%rip),%edx
btl $20,%edx
jc .L8xchar
btl $30,%edx
jnc .Ldone
addq $25,%rax
.byte 0xf3,0xc3
.L8xchar:
addq $12,%rax
.Ldone:
.byte 0xf3,0xc3
.align 64
.Lopts:
.byte 114,99,52,40,56,120,44,105,110,116,41,0
.byte 114,99,52,40,56,120,44,99,104,97,114,41,0
.byte 114,99,52,40,49,54,120,44,105,110,116,41,0
.byte 82,67,52,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
.size RC4_options,.-RC4_options


@@ -0,0 +1,677 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# July 2004
#
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
# "hand-coded assembler"] doesn't account for the whole improvement
# coefficient. It turned out that eliminating RC4_CHAR from the config
# line results in ~40% improvement (yes, even for the C implementation).
# Presumably it has everything to do with AMD cache architecture and
# RAW or whatever penalties. Once again: the module *requires* a config
# line *without* RC4_CHAR! As for the coding "secret," I bet on partial
# register arithmetic. For example, instead of 'inc %r8; and $255,%r8'
# I simply use 'inc %r8b'. Even though the optimization manual
# discourages operating on partial registers, it turned out to be the
# best bet. At least for AMD... How IA32E performs remains to be seen...
#
# November 2004
#
# As was shown by Marc Bevand, reordering a couple of load operations
# results in an even higher performance gain of 3.3x:-) At least on
# Opteron... For reference, 1x in this case is RC4_CHAR C code
# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
# The latter means that if you want to *estimate* what to expect from
# *your* Opteron, multiply 54 by 3.3 and by the clock frequency in GHz.
#
# November 2004
#
# The Intel P4 EM64T core was found to run the AMD64 code really
# slowly... The only way to achieve comparable performance on P4 was
# to keep RC4_CHAR. Kind of ironic, huh? As it's apparently impossible
# to compose blended code that would perform within even a 30% margin
# on both AMD and Intel platforms, I implement both cases. See
# rc4_skey.c for further details...
#
# April 2005
#
# The P4 EM64T core appears to be "allergic" to 64-bit inc/dec.
# Replacing those with add/sub results in a 50% performance
# improvement of the folded loop...
#
# May 2005
#
# As was shown by Zou Nanhai, loop unrolling can improve Intel EM64T
# performance by >30% [unlike the P4 32-bit case, that is]. But this
# is provided that loads are reordered even more aggressively! Both
# code paths, AMD64 and EM64T, reorder loads in essentially the same
# manner as my IA-64 implementation. On Opteron this resulted in a
# modest 5% improvement [I had to test it], while the final Intel P4
# performance now achieves a respectable 432MBps on a 2.8GHz
# processor. For reference: if executed on Xeon, the current RC4_CHAR
# code path is 2.7x faster than the RC4_INT code path, while if
# executed on Opteron, it's only 25% slower than the RC4_INT one
# [meaning that if CPU µ-arch detection is not implemented, then this
# final RC4_CHAR code path should be preferred, as it provides better
# *all-round* performance].
#
# March 2007
#
# Intel Core2 was observed to perform poorly on both code paths:-( It
# apparently suffers from some kind of partial register stall, which
# occurs in 64-bit mode only [as a virtually identical 32-bit loop was
# observed to outperform the 64-bit one by almost 50%]. Adding two
# movzb to cloop1 boosts its performance by 80%! This loop appears to
# be an optimal fit for Core2, and therefore the code was modified to
# skip cloop8 on this CPU.
#
# May 2010
#
# Intel Westmere was observed to perform suboptimally. Adding yet
# another movzb to cloop1 improved performance by almost 50%! Core2
# performance improved too, but only nominally...
#
# May 2011
#
# The only code path that was not modified is the P4-specific one. The
# non-P4 Intel code path optimization is heavily based on a submission
# by Maxim Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've
# used some of the ideas even in an attempt to optimize the original
# RC4_INT code path... Current performance in cycles per processed
# byte (less is better) and improvement coefficients relative to the
# previous version of this module are:
#
# Opteron 5.3/+0%(*)
# P4 6.5
# Core2 6.2/+15%(**)
# Westmere 4.2/+60%
# Sandy Bridge 4.2/+120%
# Atom 9.3/+80%
#
# (*)	But the corresponding loop has fewer instructions, which should
#	have a positive effect on the upcoming Bulldozer, which has one
#	less ALU. For reference, the Intel code runs at a 6.8 cpb rate
#	on Opteron.
# (**)	Note that the Core2 result is ~15% lower than the corresponding
#	result for 32-bit code, meaning that it's possible to improve
#	it, but more than likely at the cost of the others (see
#	rc4-586.pl to get the idea)...
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
$dat="%rdi"; # arg1
$len="%rsi"; # arg2
$inp="%rdx"; # arg3
$out="%rcx"; # arg4
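# These are the first four integer argument registers of the SysV
# AMD64 calling convention, i.e. RC4(RC4_KEY *key, size_t len,
# const void *inp, void *out).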
{
$code=<<___;
.text
.extern OPENSSL_ia32cap_P
.globl RC4
.type RC4,\@function,4
.align 16
RC4: or $len,$len
jne .Lentry
ret
.Lentry:
push %rbx
push %r12
push %r13
.Lprologue:
mov $len,%r11
mov $inp,%r12
mov $out,%r13
___
my $len="%r11"; # reassign input arguments
my $inp="%r12";
my $out="%r13";
my @XX=("%r10","%rsi");
my @TX=("%rax","%rbx");
my $YY="%rcx";
my $TY="%rdx";
$code.=<<___;
xor $XX[0],$XX[0]
xor $YY,$YY
lea 8($dat),$dat
mov -8($dat),$XX[0]#b
mov -4($dat),$YY#b
cmpl \$-1,256($dat)
je .LRC4_CHAR
mov OPENSSL_ia32cap_P(%rip),%r8d
xor $TX[1],$TX[1]
inc $XX[0]#b
sub $XX[0],$TX[1]
sub $inp,$out
movl ($dat,$XX[0],4),$TX[0]#d
test \$-16,$len
jz .Lloop1
bt \$30,%r8d # Intel CPU?
jc .Lintel
and \$7,$TX[1]
lea 1($XX[0]),$XX[1]
jz .Loop8
sub $TX[1],$len
.Loop8_warmup:
add $TX[0]#b,$YY#b
movl ($dat,$YY,4),$TY#d
movl $TX[0]#d,($dat,$YY,4)
movl $TY#d,($dat,$XX[0],4)
add $TY#b,$TX[0]#b
inc $XX[0]#b
movl ($dat,$TX[0],4),$TY#d
movl ($dat,$XX[0],4),$TX[0]#d
xorb ($inp),$TY#b
movb $TY#b,($out,$inp)
lea 1($inp),$inp
dec $TX[1]
jnz .Loop8_warmup
lea 1($XX[0]),$XX[1]
jmp .Loop8
.align 16
.Loop8:
___
for ($i=0;$i<8;$i++) {
$code.=<<___ if ($i==7);
add \$8,$XX[1]#b
___
$code.=<<___;
add $TX[0]#b,$YY#b
movl ($dat,$YY,4),$TY#d
movl $TX[0]#d,($dat,$YY,4)
movl `4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d
ror \$8,%r8 # ror is redundant when $i=0
movl $TY#d,4*$i($dat,$XX[0],4)
add $TX[0]#b,$TY#b
movb ($dat,$TY,4),%r8b
___
push(@TX,shift(@TX)); #push(@XX,shift(@XX)); # "rotate" registers
}
$code.=<<___;
add \$8,$XX[0]#b
ror \$8,%r8
sub \$8,$len
xor ($inp),%r8
mov %r8,($out,$inp)
lea 8($inp),$inp
test \$-8,$len
jnz .Loop8
cmp \$0,$len
jne .Lloop1
jmp .Lexit
.align 16
.Lintel:
test \$-32,$len
jz .Lloop1
and \$15,$TX[1]
jz .Loop16_is_hot
sub $TX[1],$len
.Loop16_warmup:
add $TX[0]#b,$YY#b
movl ($dat,$YY,4),$TY#d
movl $TX[0]#d,($dat,$YY,4)
movl $TY#d,($dat,$XX[0],4)
add $TY#b,$TX[0]#b
inc $XX[0]#b
movl ($dat,$TX[0],4),$TY#d
movl ($dat,$XX[0],4),$TX[0]#d
xorb ($inp),$TY#b
movb $TY#b,($out,$inp)
lea 1($inp),$inp
dec $TX[1]
jnz .Loop16_warmup
mov $YY,$TX[1]
xor $YY,$YY
mov $TX[1]#b,$YY#b
.Loop16_is_hot:
lea ($dat,$XX[0],4),$XX[1]
___
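# RC4_loop($i) emits one round of the 16-byte SSE loop: keystream
# bytes are gathered with pinsrw into %xmm0 (even positions) and
# %xmm1 (odd positions).  $i==-1 primes the software pipeline before
# .Loop16 is entered, $i==0 additionally flushes the previous block
# (psllq/pxor into %xmm2 and a movdqu store), and $i==15 fetches the
# next 16 input bytes and re-points $XX[1] at the updated state
# window.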
sub RC4_loop {
my $i=shift;
my $j=$i<0?0:$i;
my $xmm="%xmm".($j&1);
$code.=" add \$16,$XX[0]#b\n" if ($i==15);
$code.=" movdqu ($inp),%xmm2\n" if ($i==15);
$code.=" add $TX[0]#b,$YY#b\n" if ($i<=0);
$code.=" movl ($dat,$YY,4),$TY#d\n";
$code.=" pxor %xmm0,%xmm2\n" if ($i==0);
$code.=" psllq \$8,%xmm1\n" if ($i==0);
$code.=" pxor $xmm,$xmm\n" if ($i<=1);
$code.=" movl $TX[0]#d,($dat,$YY,4)\n";
$code.=" add $TY#b,$TX[0]#b\n";
$code.=" movl `4*($j+1)`($XX[1]),$TX[1]#d\n" if ($i<15);
$code.=" movz $TX[0]#b,$TX[0]#d\n";
$code.=" movl $TY#d,4*$j($XX[1])\n";
$code.=" pxor %xmm1,%xmm2\n" if ($i==0);
$code.=" lea ($dat,$XX[0],4),$XX[1]\n" if ($i==15);
$code.=" add $TX[1]#b,$YY#b\n" if ($i<15);
$code.=" pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n";
$code.=" movdqu %xmm2,($out,$inp)\n" if ($i==0);
$code.=" lea 16($inp),$inp\n" if ($i==0);
$code.=" movl ($XX[1]),$TX[1]#d\n" if ($i==15);
}
RC4_loop(-1);
$code.=<<___;
jmp .Loop16_enter
.align 16
.Loop16:
___
for ($i=0;$i<16;$i++) {
$code.=".Loop16_enter:\n" if ($i==1);
RC4_loop($i);
push(@TX,shift(@TX)); # "rotate" registers
}
$code.=<<___;
mov $YY,$TX[1]
xor $YY,$YY # zero the full register before the byte write below (avoids a partial-register stall)
sub \$16,$len
mov $TX[1]#b,$YY#b
test \$-16,$len
jnz .Loop16
psllq \$8,%xmm1
pxor %xmm0,%xmm2
pxor %xmm1,%xmm2
movdqu %xmm2,($out,$inp)
lea 16($inp),$inp
cmp \$0,$len
jne .Lloop1
jmp .Lexit
.align 16
.Lloop1:
add $TX[0]#b,$YY#b
movl ($dat,$YY,4),$TY#d
movl $TX[0]#d,($dat,$YY,4)
movl $TY#d,($dat,$XX[0],4)
add $TY#b,$TX[0]#b
inc $XX[0]#b
movl ($dat,$TX[0],4),$TY#d
movl ($dat,$XX[0],4),$TX[0]#d
xorb ($inp),$TY#b
movb $TY#b,($out,$inp)
lea 1($inp),$inp
dec $len
jnz .Lloop1
jmp .Lexit
.align 16
.LRC4_CHAR:
add \$1,$XX[0]#b
movzb ($dat,$XX[0]),$TX[0]#d
test \$-8,$len
jz .Lcloop1
jmp .Lcloop8
.align 16
.Lcloop8:
mov ($inp),%r8d
mov 4($inp),%r9d
___
# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
for ($i=0;$i<4;$i++) {
$code.=<<___;
add $TX[0]#b,$YY#b
lea 1($XX[0]),$XX[1]
movzb ($dat,$YY),$TY#d
movzb $XX[1]#b,$XX[1]#d
movzb ($dat,$XX[1]),$TX[1]#d
movb $TX[0]#b,($dat,$YY)
cmp $XX[1],$YY
movb $TY#b,($dat,$XX[0])
jne .Lcmov$i # Intel cmov is sloooow...
mov $TX[0],$TX[1]
.Lcmov$i:
add $TX[0]#b,$TY#b
xor ($dat,$TY),%r8b
ror \$8,%r8d
___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
}
for ($i=4;$i<8;$i++) {
$code.=<<___;
add $TX[0]#b,$YY#b
lea 1($XX[0]),$XX[1]
movzb ($dat,$YY),$TY#d
movzb $XX[1]#b,$XX[1]#d
movzb ($dat,$XX[1]),$TX[1]#d
movb $TX[0]#b,($dat,$YY)
cmp $XX[1],$YY
movb $TY#b,($dat,$XX[0])
jne .Lcmov$i # Intel cmov is sloooow...
mov $TX[0],$TX[1]
.Lcmov$i:
add $TX[0]#b,$TY#b
xor ($dat,$TY),%r9b
ror \$8,%r9d
___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
}
$code.=<<___;
lea -8($len),$len
mov %r8d,($out)
lea 8($inp),$inp
mov %r9d,4($out)
lea 8($out),$out
test \$-8,$len
jnz .Lcloop8
cmp \$0,$len
jne .Lcloop1
jmp .Lexit
___
$code.=<<___;
.align 16
.Lcloop1:
add $TX[0]#b,$YY#b
movzb $YY#b,$YY#d
movzb ($dat,$YY),$TY#d
movb $TX[0]#b,($dat,$YY)
movb $TY#b,($dat,$XX[0])
add $TX[0]#b,$TY#b
add \$1,$XX[0]#b
movzb $TY#b,$TY#d
movzb $XX[0]#b,$XX[0]#d
movzb ($dat,$TY),$TY#d
movzb ($dat,$XX[0]),$TX[0]#d
xorb ($inp),$TY#b
lea 1($inp),$inp
movb $TY#b,($out)
lea 1($out),$out
sub \$1,$len
jnz .Lcloop1
jmp .Lexit
.align 16
.Lexit:
sub \$1,$XX[0]#b
movl $XX[0]#d,-8($dat)
movl $YY#d,-4($dat)
mov (%rsp),%r13
mov 8(%rsp),%r12
mov 16(%rsp),%rbx
add \$24,%rsp
.Lepilogue:
ret
.size RC4,.-RC4
___
}
$idx="%r8";
$ido="%r9";
$code.=<<___;
.globl private_RC4_set_key
.type private_RC4_set_key,\@function,3
.align 16
private_RC4_set_key:
lea 8($dat),$dat
lea ($inp,$len),$inp
neg $len
mov $len,%rcx
xor %eax,%eax
xor $ido,$ido
xor %r10,%r10
xor %r11,%r11
mov OPENSSL_ia32cap_P(%rip),$idx#d
bt \$20,$idx#d # RC4_CHAR?
jc .Lc1stloop
jmp .Lw1stloop
.align 16
.Lw1stloop:
mov %eax,($dat,%rax,4)
add \$1,%al
jnc .Lw1stloop
xor $ido,$ido
xor $idx,$idx
.align 16
.Lw2ndloop:
mov ($dat,$ido,4),%r10d
add ($inp,$len,1),$idx#b
add %r10b,$idx#b
add \$1,$len
mov ($dat,$idx,4),%r11d
cmovz %rcx,$len
mov %r10d,($dat,$idx,4)
mov %r11d,($dat,$ido,4)
add \$1,$ido#b
jnc .Lw2ndloop
jmp .Lexit_key
.align 16
.Lc1stloop:
mov %al,($dat,%rax)
add \$1,%al
jnc .Lc1stloop
xor $ido,$ido
xor $idx,$idx
.align 16
.Lc2ndloop:
mov ($dat,$ido),%r10b
add ($inp,$len),$idx#b
add %r10b,$idx#b
add \$1,$len
mov ($dat,$idx),%r11b
jnz .Lcnowrap
mov %rcx,$len
.Lcnowrap:
mov %r10b,($dat,$idx)
mov %r11b,($dat,$ido)
add \$1,$ido#b
jnc .Lc2ndloop
movl \$-1,256($dat)
.align 16
.Lexit_key:
xor %eax,%eax
mov %eax,-8($dat)
mov %eax,-4($dat)
ret
.size private_RC4_set_key,.-private_RC4_set_key
.globl RC4_options
.type RC4_options,\@abi-omnipotent
.align 16
RC4_options:
lea .Lopts(%rip),%rax
mov OPENSSL_ia32cap_P(%rip),%edx
bt \$20,%edx
jc .L8xchar
bt \$30,%edx
jnc .Ldone
add \$25,%rax
ret
.L8xchar:
add \$12,%rax
.Ldone:
ret
.align 64
.Lopts:
.asciz "rc4(8x,int)"
.asciz "rc4(8x,char)"
.asciz "rc4(16x,int)"
.asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
.size RC4_options,.-RC4_options
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
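# The handlers below implement Win64 stack unwinding: if the faulting
# address lies between .Lprologue and .Lepilogue, the saved
# %rbx/%r12/%r13 are recovered from the stack and written back into
# the CONTEXT record before RtlVirtualUnwind is called to continue
# the unwind.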
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type stream_se_handler,\@abi-omnipotent
.align 16
stream_se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
lea .Lprologue(%rip),%r10
cmp %r10,%rbx # context->Rip<prologue label
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
lea .Lepilogue(%rip),%r10
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
lea 24(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%r12
mov -24(%rax),%r13
mov %rbx,144($context) # restore context->Rbx
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
jmp .Lcommon_seh_exit
.size stream_se_handler,.-stream_se_handler
.type key_se_handler,\@abi-omnipotent
.align 16
key_se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 152($context),%rax # pull context->Rsp
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
.Lcommon_seh_exit:
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT) in quadwords
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size key_se_handler,.-key_se_handler
.section .pdata
.align 4
.rva .LSEH_begin_RC4
.rva .LSEH_end_RC4
.rva .LSEH_info_RC4
.rva .LSEH_begin_private_RC4_set_key
.rva .LSEH_end_private_RC4_set_key
.rva .LSEH_info_private_RC4_set_key
.section .xdata
.align 8
.LSEH_info_RC4:
.byte 9,0,0,0
.rva stream_se_handler
.LSEH_info_private_RC4_set_key:
.byte 9,0,0,0
.rva key_se_handler
___
}
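# reg_part() implements the "#b"/"#w"/"#d" register suffixes used
# throughout $code: e.g. "%rax#b" becomes "%al", "%rax#d" becomes
# "%eax" and "%r10#b" becomes "%r10b".  The substitutions below
# rewrite every such token (and evaluate `...` expressions) before the
# code is piped through x86_64-xlate.pl.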
sub reg_part {
my ($reg,$conv)=@_;
if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
return $reg;
}
$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;