Support for Signal calls.

Merge in RedPhone

// FREEBIE
This commit is contained in:
Moxie Marlinspike
2015-09-09 13:54:29 -07:00
parent 3d4ae60d81
commit d83a3d71bc
2585 changed files with 803492 additions and 45 deletions

View File

@@ -0,0 +1,3 @@
This is an OpenSSL-compatible version of AES (also called Rijndael).
aes_core.c is basically the same as rijndael-alg-fst.c but with an
API that looks like the rest of the OpenSSL symmetric cipher suite.

View File

@@ -0,0 +1,147 @@
/* crypto/aes/aes.h -*- mode:C; c-file-style: "eay" -*- */
/* ====================================================================
* Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
*/
#ifndef HEADER_AES_H
#define HEADER_AES_H
#include <openssl/opensslconf.h>
#ifdef OPENSSL_NO_AES
#error AES is disabled.
#endif
#include <stddef.h>
#define AES_ENCRYPT 1
#define AES_DECRYPT 0
/* Because array size can't be a const in C, the following two are macros.
Both sizes are in bytes. */
#define AES_MAXNR 14
#define AES_BLOCK_SIZE 16
#ifdef __cplusplus
extern "C" {
#endif
/* This should be a hidden type, but EVP requires that the size be known */
struct aes_key_st {
#ifdef AES_LONG
unsigned long rd_key[4 *(AES_MAXNR + 1)];
#else
unsigned int rd_key[4 *(AES_MAXNR + 1)];
#endif
int rounds;
};
typedef struct aes_key_st AES_KEY;
const char *AES_options(void);
int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key);
int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key);
int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key);
int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key);
void AES_encrypt(const unsigned char *in, unsigned char *out,
const AES_KEY *key);
void AES_decrypt(const unsigned char *in, unsigned char *out,
const AES_KEY *key);
void AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
const AES_KEY *key, const int enc);
void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, const int enc);
void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, int *num, const int enc);
void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, int *num, const int enc);
void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, int *num, const int enc);
void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, int *num);
void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char ivec[AES_BLOCK_SIZE],
unsigned char ecount_buf[AES_BLOCK_SIZE],
unsigned int *num);
/* NB: the IV is _two_ blocks long */
void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, const int enc);
/* NB: the IV is _four_ blocks long */
void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
const AES_KEY *key2, const unsigned char *ivec,
const int enc);
int AES_wrap_key(AES_KEY *key, const unsigned char *iv,
unsigned char *out,
const unsigned char *in, unsigned int inlen);
int AES_unwrap_key(AES_KEY *key, const unsigned char *iv,
unsigned char *out,
const unsigned char *in, unsigned int inlen);
#ifdef __cplusplus
}
#endif
#endif /* !HEADER_AES_H */

View File

@@ -0,0 +1,63 @@
/* crypto/aes/aes_cbc.c -*- mode:C; c-file-style: "eay" -*- */
/* ====================================================================
* Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
*/
#include <openssl/aes.h>
#include <openssl/modes.h>
void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
size_t len, const AES_KEY *key,
unsigned char *ivec, const int enc) {
if (enc)
CRYPTO_cbc128_encrypt(in,out,len,key,ivec,(block128_f)AES_encrypt);
else
CRYPTO_cbc128_decrypt(in,out,len,key,ivec,(block128_f)AES_decrypt);
}

View File

@@ -0,0 +1,81 @@
/* crypto/aes/aes_cfb.c -*- mode:C; c-file-style: "eay" -*- */
/* ====================================================================
* Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
*/
#include <openssl/aes.h>
#include <openssl/modes.h>
/* The input and output encrypted as though 128bit cfb mode is being
* used. The extra state information to record how much of the
* 128bit block we have used is contained in *num;
*/
void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, int *num, const int enc) {
CRYPTO_cfb128_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt);
}
/* N.B. This expects the input to be packed, MS bit first */
void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, int *num, const int enc)
{
CRYPTO_cfb128_1_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt);
}
void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, int *num, const int enc)
{
CRYPTO_cfb128_8_encrypt(in,out,length,key,ivec,num,enc,(block128_f)AES_encrypt);
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,61 @@
/* crypto/aes/aes_ctr.c -*- mode:C; c-file-style: "eay" -*- */
/* ====================================================================
* Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
*/
#include <openssl/aes.h>
#include <openssl/modes.h>
void AES_ctr128_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char ivec[AES_BLOCK_SIZE],
unsigned char ecount_buf[AES_BLOCK_SIZE],
unsigned int *num) {
CRYPTO_ctr128_encrypt(in,out,length,key,ivec,ecount_buf,num,(block128_f)AES_encrypt);
}

View File

@@ -0,0 +1,73 @@
/* crypto/aes/aes_ecb.c -*- mode:C; c-file-style: "eay" -*- */
/* ====================================================================
* Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
*/
#ifndef AES_DEBUG
# ifndef NDEBUG
# define NDEBUG
# endif
#endif
#include <assert.h>
#include <openssl/aes.h>
#include "aes_locl.h"
void AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
const AES_KEY *key, const int enc) {
assert(in && out && key);
assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc));
if (AES_ENCRYPT == enc)
AES_encrypt(in, out, key);
else
AES_decrypt(in, out, key);
}

View File

@@ -0,0 +1,323 @@
/* crypto/aes/aes_ige.c -*- mode:C; c-file-style: "eay" -*- */
/* ====================================================================
* Copyright (c) 2006 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
*/
#include "cryptlib.h"
#include <openssl/aes.h>
#include "aes_locl.h"
#define N_WORDS (AES_BLOCK_SIZE / sizeof(unsigned long))
typedef struct {
unsigned long data[N_WORDS];
} aes_block_t;
/* XXX: probably some better way to do this */
#if defined(__i386__) || defined(__x86_64__)
#define UNALIGNED_MEMOPS_ARE_FAST 1
#else
#define UNALIGNED_MEMOPS_ARE_FAST 0
#endif
#if UNALIGNED_MEMOPS_ARE_FAST
#define load_block(d, s) (d) = *(const aes_block_t *)(s)
#define store_block(d, s) *(aes_block_t *)(d) = (s)
#else
#define load_block(d, s) memcpy((d).data, (s), AES_BLOCK_SIZE)
#define store_block(d, s) memcpy((d), (s).data, AES_BLOCK_SIZE)
#endif
/* N.B. The IV for this mode is _twice_ the block size */
void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, const int enc)
{
size_t n;
size_t len = length;
OPENSSL_assert(in && out && key && ivec);
OPENSSL_assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc));
OPENSSL_assert((length%AES_BLOCK_SIZE) == 0);
len = length / AES_BLOCK_SIZE;
if (AES_ENCRYPT == enc)
{
if (in != out &&
(UNALIGNED_MEMOPS_ARE_FAST || ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(long)==0))
{
aes_block_t *ivp = (aes_block_t *)ivec;
aes_block_t *iv2p = (aes_block_t *)(ivec + AES_BLOCK_SIZE);
while (len)
{
aes_block_t *inp = (aes_block_t *)in;
aes_block_t *outp = (aes_block_t *)out;
for(n=0 ; n < N_WORDS; ++n)
outp->data[n] = inp->data[n] ^ ivp->data[n];
AES_encrypt((unsigned char *)outp->data, (unsigned char *)outp->data, key);
for(n=0 ; n < N_WORDS; ++n)
outp->data[n] ^= iv2p->data[n];
ivp = outp;
iv2p = inp;
--len;
in += AES_BLOCK_SIZE;
out += AES_BLOCK_SIZE;
}
memcpy(ivec, ivp->data, AES_BLOCK_SIZE);
memcpy(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
}
else
{
aes_block_t tmp, tmp2;
aes_block_t iv;
aes_block_t iv2;
load_block(iv, ivec);
load_block(iv2, ivec + AES_BLOCK_SIZE);
while (len)
{
load_block(tmp, in);
for(n=0 ; n < N_WORDS; ++n)
tmp2.data[n] = tmp.data[n] ^ iv.data[n];
AES_encrypt((unsigned char *)tmp2.data, (unsigned char *)tmp2.data, key);
for(n=0 ; n < N_WORDS; ++n)
tmp2.data[n] ^= iv2.data[n];
store_block(out, tmp2);
iv = tmp2;
iv2 = tmp;
--len;
in += AES_BLOCK_SIZE;
out += AES_BLOCK_SIZE;
}
memcpy(ivec, iv.data, AES_BLOCK_SIZE);
memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
}
}
else
{
if (in != out &&
(UNALIGNED_MEMOPS_ARE_FAST || ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(long)==0))
{
aes_block_t *ivp = (aes_block_t *)ivec;
aes_block_t *iv2p = (aes_block_t *)(ivec + AES_BLOCK_SIZE);
while (len)
{
aes_block_t tmp;
aes_block_t *inp = (aes_block_t *)in;
aes_block_t *outp = (aes_block_t *)out;
for(n=0 ; n < N_WORDS; ++n)
tmp.data[n] = inp->data[n] ^ iv2p->data[n];
AES_decrypt((unsigned char *)tmp.data, (unsigned char *)outp->data, key);
for(n=0 ; n < N_WORDS; ++n)
outp->data[n] ^= ivp->data[n];
ivp = inp;
iv2p = outp;
--len;
in += AES_BLOCK_SIZE;
out += AES_BLOCK_SIZE;
}
memcpy(ivec, ivp->data, AES_BLOCK_SIZE);
memcpy(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
}
else
{
aes_block_t tmp, tmp2;
aes_block_t iv;
aes_block_t iv2;
load_block(iv, ivec);
load_block(iv2, ivec + AES_BLOCK_SIZE);
while (len)
{
load_block(tmp, in);
tmp2 = tmp;
for(n=0 ; n < N_WORDS; ++n)
tmp.data[n] ^= iv2.data[n];
AES_decrypt((unsigned char *)tmp.data, (unsigned char *)tmp.data, key);
for(n=0 ; n < N_WORDS; ++n)
tmp.data[n] ^= iv.data[n];
store_block(out, tmp);
iv = tmp2;
iv2 = tmp;
--len;
in += AES_BLOCK_SIZE;
out += AES_BLOCK_SIZE;
}
memcpy(ivec, iv.data, AES_BLOCK_SIZE);
memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
}
}
}
/*
* Note that its effectively impossible to do biIGE in anything other
* than a single pass, so no provision is made for chaining.
*/
/* N.B. The IV for this mode is _four times_ the block size */
void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
const AES_KEY *key2, const unsigned char *ivec,
const int enc)
{
size_t n;
size_t len = length;
unsigned char tmp[AES_BLOCK_SIZE];
unsigned char tmp2[AES_BLOCK_SIZE];
unsigned char tmp3[AES_BLOCK_SIZE];
unsigned char prev[AES_BLOCK_SIZE];
const unsigned char *iv;
const unsigned char *iv2;
OPENSSL_assert(in && out && key && ivec);
OPENSSL_assert((AES_ENCRYPT == enc)||(AES_DECRYPT == enc));
OPENSSL_assert((length%AES_BLOCK_SIZE) == 0);
if (AES_ENCRYPT == enc)
{
/* XXX: Do a separate case for when in != out (strictly should
check for overlap, too) */
/* First the forward pass */
iv = ivec;
iv2 = ivec + AES_BLOCK_SIZE;
while (len >= AES_BLOCK_SIZE)
{
for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
out[n] = in[n] ^ iv[n];
AES_encrypt(out, out, key);
for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
out[n] ^= iv2[n];
iv = out;
memcpy(prev, in, AES_BLOCK_SIZE);
iv2 = prev;
len -= AES_BLOCK_SIZE;
in += AES_BLOCK_SIZE;
out += AES_BLOCK_SIZE;
}
/* And now backwards */
iv = ivec + AES_BLOCK_SIZE*2;
iv2 = ivec + AES_BLOCK_SIZE*3;
len = length;
while(len >= AES_BLOCK_SIZE)
{
out -= AES_BLOCK_SIZE;
/* XXX: reduce copies by alternating between buffers */
memcpy(tmp, out, AES_BLOCK_SIZE);
for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
out[n] ^= iv[n];
/* hexdump(stdout, "out ^ iv", out, AES_BLOCK_SIZE); */
AES_encrypt(out, out, key);
/* hexdump(stdout,"enc", out, AES_BLOCK_SIZE); */
/* hexdump(stdout,"iv2", iv2, AES_BLOCK_SIZE); */
for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
out[n] ^= iv2[n];
/* hexdump(stdout,"out", out, AES_BLOCK_SIZE); */
iv = out;
memcpy(prev, tmp, AES_BLOCK_SIZE);
iv2 = prev;
len -= AES_BLOCK_SIZE;
}
}
else
{
/* First backwards */
iv = ivec + AES_BLOCK_SIZE*2;
iv2 = ivec + AES_BLOCK_SIZE*3;
in += length;
out += length;
while (len >= AES_BLOCK_SIZE)
{
in -= AES_BLOCK_SIZE;
out -= AES_BLOCK_SIZE;
memcpy(tmp, in, AES_BLOCK_SIZE);
memcpy(tmp2, in, AES_BLOCK_SIZE);
for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
tmp[n] ^= iv2[n];
AES_decrypt(tmp, out, key);
for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
out[n] ^= iv[n];
memcpy(tmp3, tmp2, AES_BLOCK_SIZE);
iv = tmp3;
iv2 = out;
len -= AES_BLOCK_SIZE;
}
/* And now forwards */
iv = ivec;
iv2 = ivec + AES_BLOCK_SIZE;
len = length;
while (len >= AES_BLOCK_SIZE)
{
memcpy(tmp, out, AES_BLOCK_SIZE);
memcpy(tmp2, out, AES_BLOCK_SIZE);
for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
tmp[n] ^= iv2[n];
AES_decrypt(tmp, out, key);
for(n=0 ; n < AES_BLOCK_SIZE ; ++n)
out[n] ^= iv[n];
memcpy(tmp3, tmp2, AES_BLOCK_SIZE);
iv = tmp3;
iv2 = out;
len -= AES_BLOCK_SIZE;
in += AES_BLOCK_SIZE;
out += AES_BLOCK_SIZE;
}
}
}

View File

@@ -0,0 +1,89 @@
/* crypto/aes/aes.h -*- mode:C; c-file-style: "eay" -*- */
/* ====================================================================
* Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
*/
#ifndef HEADER_AES_LOCL_H
#define HEADER_AES_LOCL_H
#include <openssl/e_os2.h>
#ifdef OPENSSL_NO_AES
#error AES is disabled.
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
# define SWAP(x) (_lrotl(x, 8) & 0x00ff00ff | _lrotr(x, 8) & 0xff00ff00)
# define GETU32(p) SWAP(*((u32 *)(p)))
# define PUTU32(ct, st) { *((u32 *)(ct)) = SWAP((st)); }
#else
# define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] << 8) ^ ((u32)(pt)[3]))
# define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); }
#endif
#ifdef AES_LONG
typedef unsigned long u32;
#else
typedef unsigned int u32;
#endif
typedef unsigned short u16;
typedef unsigned char u8;
#define MAXKC (256/32)
#define MAXKB (256/8)
#define MAXNR 14
/* This controls loop-unrolling in aes_core.c */
#undef FULL_UNROLL
#endif /* !HEADER_AES_LOCL_H */

View File

@@ -0,0 +1,85 @@
/* crypto/aes/aes_misc.c -*- mode:C; c-file-style: "eay" -*- */
/* ====================================================================
* Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
*/
#include <openssl/opensslv.h>
#include <openssl/crypto.h>
#include <openssl/aes.h>
#include "aes_locl.h"
const char AES_version[]="AES" OPENSSL_VERSION_PTEXT;
const char *AES_options(void) {
#ifdef FULL_UNROLL
return "aes(full)";
#else
return "aes(partial)";
#endif
}
/* FIPS wrapper functions to block low level AES calls in FIPS mode */
int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key)
{
#ifdef OPENSSL_FIPS
fips_cipher_abort(AES);
#endif
return private_AES_set_encrypt_key(userKey, bits, key);
}
int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key)
{
#ifdef OPENSSL_FIPS
fips_cipher_abort(AES);
#endif
return private_AES_set_decrypt_key(userKey, bits, key);
}

View File

@@ -0,0 +1,60 @@
/* crypto/aes/aes_ofb.c -*- mode:C; c-file-style: "eay" -*- */
/* ====================================================================
* Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
*/
#include <openssl/aes.h>
#include <openssl/modes.h>
void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, int *num)
{
CRYPTO_ofb128_encrypt(in,out,length,key,ivec,num,(block128_f)AES_encrypt);
}

View File

@@ -0,0 +1,259 @@
/* crypto/aes/aes_wrap.c */
/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
* project.
*/
/* ====================================================================
* Copyright (c) 2008 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* licensing@OpenSSL.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*/
#include "cryptlib.h"
#include <openssl/aes.h>
#include <openssl/bio.h>
static const unsigned char default_iv[] = {
0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
};
int AES_wrap_key(AES_KEY *key, const unsigned char *iv,
unsigned char *out,
const unsigned char *in, unsigned int inlen)
{
unsigned char *A, B[16], *R;
unsigned int i, j, t;
if ((inlen & 0x7) || (inlen < 8))
return -1;
A = B;
t = 1;
memcpy(out + 8, in, inlen);
if (!iv)
iv = default_iv;
memcpy(A, iv, 8);
for (j = 0; j < 6; j++)
{
R = out + 8;
for (i = 0; i < inlen; i += 8, t++, R += 8)
{
memcpy(B + 8, R, 8);
AES_encrypt(B, B, key);
A[7] ^= (unsigned char)(t & 0xff);
if (t > 0xff)
{
A[6] ^= (unsigned char)((t >> 8) & 0xff);
A[5] ^= (unsigned char)((t >> 16) & 0xff);
A[4] ^= (unsigned char)((t >> 24) & 0xff);
}
memcpy(R, B + 8, 8);
}
}
memcpy(out, A, 8);
return inlen + 8;
}
int AES_unwrap_key(AES_KEY *key, const unsigned char *iv,
unsigned char *out,
const unsigned char *in, unsigned int inlen)
{
unsigned char *A, B[16], *R;
unsigned int i, j, t;
inlen -= 8;
if (inlen & 0x7)
return -1;
if (inlen < 8)
return -1;
A = B;
t = 6 * (inlen >> 3);
memcpy(A, in, 8);
memcpy(out, in + 8, inlen);
for (j = 0; j < 6; j++)
{
R = out + inlen - 8;
for (i = 0; i < inlen; i += 8, t--, R -= 8)
{
A[7] ^= (unsigned char)(t & 0xff);
if (t > 0xff)
{
A[6] ^= (unsigned char)((t >> 8) & 0xff);
A[5] ^= (unsigned char)((t >> 16) & 0xff);
A[4] ^= (unsigned char)((t >> 24) & 0xff);
}
memcpy(B + 8, R, 8);
AES_decrypt(B, B, key);
memcpy(R, B + 8, 8);
}
}
if (!iv)
iv = default_iv;
if (memcmp(A, iv, 8))
{
OPENSSL_cleanse(out, inlen);
return 0;
}
return inlen;
}
#ifdef AES_WRAP_TEST
int AES_wrap_unwrap_test(const unsigned char *kek, int keybits,
const unsigned char *iv,
const unsigned char *eout,
const unsigned char *key, int keylen)
{
unsigned char *otmp = NULL, *ptmp = NULL;
int r, ret = 0;
AES_KEY wctx;
otmp = OPENSSL_malloc(keylen + 8);
ptmp = OPENSSL_malloc(keylen);
if (!otmp || !ptmp)
return 0;
if (AES_set_encrypt_key(kek, keybits, &wctx))
goto err;
r = AES_wrap_key(&wctx, iv, otmp, key, keylen);
if (r <= 0)
goto err;
if (eout && memcmp(eout, otmp, keylen))
goto err;
if (AES_set_decrypt_key(kek, keybits, &wctx))
goto err;
r = AES_unwrap_key(&wctx, iv, ptmp, otmp, r);
if (memcmp(key, ptmp, keylen))
goto err;
ret = 1;
err:
if (otmp)
OPENSSL_free(otmp);
if (ptmp)
OPENSSL_free(ptmp);
return ret;
}
int main(int argc, char **argv)
{
static const unsigned char kek[] = {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
};
static const unsigned char key[] = {
0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
};
static const unsigned char e1[] = {
0x1f, 0xa6, 0x8b, 0x0a, 0x81, 0x12, 0xb4, 0x47,
0xae, 0xf3, 0x4b, 0xd8, 0xfb, 0x5a, 0x7b, 0x82,
0x9d, 0x3e, 0x86, 0x23, 0x71, 0xd2, 0xcf, 0xe5
};
static const unsigned char e2[] = {
0x96, 0x77, 0x8b, 0x25, 0xae, 0x6c, 0xa4, 0x35,
0xf9, 0x2b, 0x5b, 0x97, 0xc0, 0x50, 0xae, 0xd2,
0x46, 0x8a, 0xb8, 0xa1, 0x7a, 0xd8, 0x4e, 0x5d
};
static const unsigned char e3[] = {
0x64, 0xe8, 0xc3, 0xf9, 0xce, 0x0f, 0x5b, 0xa2,
0x63, 0xe9, 0x77, 0x79, 0x05, 0x81, 0x8a, 0x2a,
0x93, 0xc8, 0x19, 0x1e, 0x7d, 0x6e, 0x8a, 0xe7
};
static const unsigned char e4[] = {
0x03, 0x1d, 0x33, 0x26, 0x4e, 0x15, 0xd3, 0x32,
0x68, 0xf2, 0x4e, 0xc2, 0x60, 0x74, 0x3e, 0xdc,
0xe1, 0xc6, 0xc7, 0xdd, 0xee, 0x72, 0x5a, 0x93,
0x6b, 0xa8, 0x14, 0x91, 0x5c, 0x67, 0x62, 0xd2
};
static const unsigned char e5[] = {
0xa8, 0xf9, 0xbc, 0x16, 0x12, 0xc6, 0x8b, 0x3f,
0xf6, 0xe6, 0xf4, 0xfb, 0xe3, 0x0e, 0x71, 0xe4,
0x76, 0x9c, 0x8b, 0x80, 0xa3, 0x2c, 0xb8, 0x95,
0x8c, 0xd5, 0xd1, 0x7d, 0x6b, 0x25, 0x4d, 0xa1
};
static const unsigned char e6[] = {
0x28, 0xc9, 0xf4, 0x04, 0xc4, 0xb8, 0x10, 0xf4,
0xcb, 0xcc, 0xb3, 0x5c, 0xfb, 0x87, 0xf8, 0x26,
0x3f, 0x57, 0x86, 0xe2, 0xd8, 0x0e, 0xd3, 0x26,
0xcb, 0xc7, 0xf0, 0xe7, 0x1a, 0x99, 0xf4, 0x3b,
0xfb, 0x98, 0x8b, 0x9b, 0x7a, 0x02, 0xdd, 0x21
};
AES_KEY wctx, xctx;
int ret;
ret = AES_wrap_unwrap_test(kek, 128, NULL, e1, key, 16);
fprintf(stderr, "Key test result %d\n", ret);
ret = AES_wrap_unwrap_test(kek, 192, NULL, e2, key, 16);
fprintf(stderr, "Key test result %d\n", ret);
ret = AES_wrap_unwrap_test(kek, 256, NULL, e3, key, 16);
fprintf(stderr, "Key test result %d\n", ret);
ret = AES_wrap_unwrap_test(kek, 192, NULL, e4, key, 24);
fprintf(stderr, "Key test result %d\n", ret);
ret = AES_wrap_unwrap_test(kek, 256, NULL, e5, key, 24);
fprintf(stderr, "Key test result %d\n", ret);
ret = AES_wrap_unwrap_test(kek, 256, NULL, e6, key, 32);
fprintf(stderr, "Key test result %d\n", ret);
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,761 @@
#include "arm_arch.h"
#if __ARM_ARCH__>=7
.text
.arch armv8-a+crypto
.align 5
rcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.globl aes_v8_set_encrypt_key
.type aes_v8_set_encrypt_key,%function
.align 5
aes_v8_set_encrypt_key:
.Lenc_key:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
adr x3,rcon
cmp w1,#192
eor v0.16b,v0.16b,v0.16b
ld1 {v3.16b},[x0],#16
mov w1,#8 // reuse w1
ld1 {v1.4s,v2.4s},[x3],#32
b.lt .Loop128
b.eq .L192
b .L256
.align 4
.Loop128:
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
b.ne .Loop128
ld1 {v1.4s},[x3]
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2]
add x2,x2,#0x50
mov w12,#10
b .Ldone
.align 4
.L192:
ld1 {v4.8b},[x0],#8
movi v6.16b,#8 // borrow v6.16b
st1 {v3.4s},[x2],#16
sub v2.16b,v2.16b,v6.16b // adjust the mask
.Loop192:
tbl v6.16b,{v4.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v4.8b},[x2],#8
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
dup v5.4s,v3.s[3]
eor v5.16b,v5.16b,v4.16b
eor v6.16b,v6.16b,v1.16b
ext v4.16b,v0.16b,v4.16b,#12
shl v1.16b,v1.16b,#1
eor v4.16b,v4.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
eor v4.16b,v4.16b,v6.16b
st1 {v3.4s},[x2],#16
b.ne .Loop192
mov w12,#12
add x2,x2,#0x20
b .Ldone
.align 4
.L256:
ld1 {v4.16b},[x0]
mov w1,#7
mov w12,#14
st1 {v3.4s},[x2],#16
.Loop256:
tbl v6.16b,{v4.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v4.4s},[x2],#16
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2],#16
b.eq .Ldone
dup v6.4s,v3.s[3] // just splat
ext v5.16b,v0.16b,v4.16b,#12
aese v6.16b,v0.16b
eor v4.16b,v4.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v4.16b,v4.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v4.16b,v4.16b,v5.16b
eor v4.16b,v4.16b,v6.16b
b .Loop256
.Ldone:
str w12,[x2]
eor x0,x0,x0 // return value
ldr x29,[sp],#16
ret
.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
.globl aes_v8_set_decrypt_key
.type aes_v8_set_decrypt_key,%function
.align 5
aes_v8_set_decrypt_key:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
bl .Lenc_key
sub x2,x2,#240 // restore original x2
mov x4,#-16
add x0,x2,x12,lsl#4 // end of key schedule
ld1 {v0.4s},[x2]
ld1 {v1.4s},[x0]
st1 {v0.4s},[x0],x4
st1 {v1.4s},[x2],#16
.Loop_imc:
ld1 {v0.4s},[x2]
ld1 {v1.4s},[x0]
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
st1 {v0.4s},[x0],x4
st1 {v1.4s},[x2],#16
cmp x0,x2
b.hi .Loop_imc
ld1 {v0.4s},[x2]
aesimc v0.16b,v0.16b
st1 {v0.4s},[x0]
eor x0,x0,x0 // return value
ldp x29,x30,[sp],#16
ret
.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
.globl aes_v8_encrypt
.type aes_v8_encrypt,%function
.align 5
aes_v8_encrypt:
ldr w3,[x2,#240]
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
sub w3,w3,#2
ld1 {v1.4s},[x2],#16
.Loop_enc:
aese v2.16b,v0.16b
ld1 {v0.4s},[x2],#16
aesmc v2.16b,v2.16b
subs w3,w3,#2
aese v2.16b,v1.16b
ld1 {v1.4s},[x2],#16
aesmc v2.16b,v2.16b
b.gt .Loop_enc
aese v2.16b,v0.16b
ld1 {v0.4s},[x2]
aesmc v2.16b,v2.16b
aese v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
st1 {v2.16b},[x1]
ret
.size aes_v8_encrypt,.-aes_v8_encrypt
.globl aes_v8_decrypt
.type aes_v8_decrypt,%function
.align 5
aes_v8_decrypt:
ldr w3,[x2,#240]
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
sub w3,w3,#2
ld1 {v1.4s},[x2],#16
.Loop_dec:
aesd v2.16b,v0.16b
ld1 {v0.4s},[x2],#16
aesimc v2.16b,v2.16b
subs w3,w3,#2
aesd v2.16b,v1.16b
ld1 {v1.4s},[x2],#16
aesimc v2.16b,v2.16b
b.gt .Loop_dec
aesd v2.16b,v0.16b
ld1 {v0.4s},[x2]
aesimc v2.16b,v2.16b
aesd v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
st1 {v2.16b},[x1]
ret
.size aes_v8_decrypt,.-aes_v8_decrypt
.globl aes_v8_cbc_encrypt
.type aes_v8_cbc_encrypt,%function
.align 5
aes_v8_cbc_encrypt:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
subs x2,x2,#16
mov x8,#16
b.lo .Lcbc_abort
csel x8,xzr,x8,eq
cmp w5,#0 // en- or decrypting?
ldr w5,[x3,#240]
and x2,x2,#-16
ld1 {v6.16b},[x4]
ld1 {v0.16b},[x0],x8
ld1 {v16.4s-v17.4s},[x3] // load key schedule...
sub w5,w5,#6
add x7,x3,x5,lsl#4 // pointer to last 7 round keys
sub w5,w5,#2
ld1 {v18.4s-v19.4s},[x7],#32
ld1 {v20.4s-v21.4s},[x7],#32
ld1 {v22.4s-v23.4s},[x7],#32
ld1 {v7.4s},[x7]
add x7,x3,#32
mov w6,w5
b.eq .Lcbc_dec
cmp w5,#2
eor v0.16b,v0.16b,v6.16b
eor v5.16b,v16.16b,v7.16b
b.eq .Lcbc_enc128
.Loop_cbc_enc:
aese v0.16b,v16.16b
ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
subs w6,w6,#2
aese v0.16b,v17.16b
ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
b.gt .Loop_cbc_enc
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
add x7,x3,#16
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v0.16b,v23.16b
mov w6,w5
eor v6.16b,v0.16b,v7.16b
st1 {v6.16b},[x1],#16
b.hs .Loop_cbc_enc
b .Lcbc_done
.align 5
.Lcbc_enc128:
ld1 {v2.4s-v3.4s},[x7]
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
b .Lenter_cbc_enc128
.Loop_cbc_enc128:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
st1 {v6.16b},[x1],#16
.Lenter_cbc_enc128:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v23.16b
eor v6.16b,v0.16b,v7.16b
b.hs .Loop_cbc_enc128
st1 {v6.16b},[x1],#16
b .Lcbc_done
.align 5
.Lcbc_dec128:
ld1 {v4.4s-v5.4s},[x7]
eor v6.16b,v6.16b,v7.16b
eor v2.16b,v0.16b,v7.16b
mov x12,x8
.Loop2x_cbc_dec128:
aesd v0.16b,v16.16b
aesd v1.16b,v16.16b
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
subs x2,x2,#32
aesd v0.16b,v17.16b
aesd v1.16b,v17.16b
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
csel x8,xzr,x8,lo
aesd v0.16b,v4.16b
aesd v1.16b,v4.16b
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
csel x12,xzr,x12,ls
aesd v0.16b,v5.16b
aesd v1.16b,v5.16b
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
aesd v0.16b,v18.16b
aesd v1.16b,v18.16b
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
aesd v0.16b,v19.16b
aesd v1.16b,v19.16b
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
aesd v0.16b,v20.16b
aesd v1.16b,v20.16b
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
aesd v0.16b,v21.16b
aesd v1.16b,v21.16b
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
aesd v0.16b,v22.16b
aesd v1.16b,v22.16b
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
aesd v0.16b,v23.16b
aesd v1.16b,v23.16b
eor v6.16b,v6.16b,v0.16b
ld1 {v0.16b},[x0],x8
eor v2.16b,v2.16b,v1.16b
ld1 {v1.16b},[x0],x12
st1 {v6.16b},[x1],#16
eor v6.16b,v3.16b,v7.16b
st1 {v2.16b},[x1],#16
eor v2.16b,v0.16b,v7.16b
orr v3.16b,v1.16b,v1.16b
b.hs .Loop2x_cbc_dec128
adds x2,x2,#32
eor v6.16b,v6.16b,v7.16b
b.eq .Lcbc_done
eor v2.16b,v2.16b,v7.16b
b .Lcbc_dec_tail
.align 5
.Lcbc_dec:
subs x2,x2,#16
orr v2.16b,v0.16b,v0.16b
b.lo .Lcbc_dec_tail
csel x8,xzr,x8,eq
cmp w5,#2
ld1 {v1.16b},[x0],x8
orr v3.16b,v1.16b,v1.16b
b.eq .Lcbc_dec128
.Loop2x_cbc_dec:
aesd v0.16b,v16.16b
aesd v1.16b,v16.16b
ld1 {v16.4s},[x7],#16
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
subs w6,w6,#2
aesd v0.16b,v17.16b
aesd v1.16b,v17.16b
ld1 {v17.4s},[x7],#16
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
b.gt .Loop2x_cbc_dec
aesd v0.16b,v16.16b
aesd v1.16b,v16.16b
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
eor v4.16b,v6.16b,v7.16b
eor v5.16b,v2.16b,v7.16b
aesd v0.16b,v17.16b
aesd v1.16b,v17.16b
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
orr v6.16b,v3.16b,v3.16b
subs x2,x2,#32
aesd v0.16b,v18.16b
aesd v1.16b,v18.16b
aesimc v0.16b,v0.16b
csel x8,xzr,x8,lo
aesimc v1.16b,v1.16b
mov x7,x3
aesd v0.16b,v19.16b
aesd v1.16b,v19.16b
aesimc v0.16b,v0.16b
ld1 {v2.16b},[x0],x8
aesimc v1.16b,v1.16b
csel x8,xzr,x8,ls
aesd v0.16b,v20.16b
aesd v1.16b,v20.16b
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
ld1 {v3.16b},[x0],x8
aesd v0.16b,v21.16b
aesd v1.16b,v21.16b
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
aesd v0.16b,v22.16b
aesd v1.16b,v22.16b
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
aesd v0.16b,v23.16b
aesd v1.16b,v23.16b
mov w6,w5
eor v4.16b,v4.16b,v0.16b
eor v5.16b,v5.16b,v1.16b
orr v0.16b,v2.16b,v2.16b
st1 {v4.16b},[x1],#16
orr v1.16b,v3.16b,v3.16b
st1 {v5.16b},[x1],#16
b.hs .Loop2x_cbc_dec
adds x2,x2,#32
b.eq .Lcbc_done
.Lcbc_dec_tail:
aesd v0.16b,v16.16b
ld1 {v16.4s},[x7],#16
aesimc v0.16b,v0.16b
subs w6,w6,#2
aesd v0.16b,v17.16b
ld1 {v17.4s},[x7],#16
aesimc v0.16b,v0.16b
b.gt .Lcbc_dec_tail
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v17.16b
aesimc v0.16b,v0.16b
eor v4.16b,v6.16b,v7.16b
aesd v0.16b,v18.16b
aesimc v0.16b,v0.16b
orr v6.16b,v2.16b,v2.16b
aesd v0.16b,v19.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v20.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v21.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v22.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v23.16b
eor v4.16b,v4.16b,v0.16b
st1 {v4.16b},[x1],#16
.Lcbc_done:
st1 {v6.16b},[x4]
.Lcbc_abort:
ldr x29,[sp],#16
ret
.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
.globl aes_v8_ctr32_encrypt_blocks
.type aes_v8_ctr32_encrypt_blocks,%function
.align 5
aes_v8_ctr32_encrypt_blocks:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldr w5,[x3,#240]
ldr w8, [x4, #12]
ld1 {v0.4s},[x4]
ld1 {v16.4s-v17.4s},[x3] // load key schedule...
sub w5,w5,#6
add x7,x3,x5,lsl#4 // pointer to last 7 round keys
sub w5,w5,#2
ld1 {v18.4s-v19.4s},[x7],#32
ld1 {v20.4s-v21.4s},[x7],#32
ld1 {v22.4s-v23.4s},[x7],#32
ld1 {v7.4s},[x7]
add x7,x3,#32
mov w6,w5
subs x2,x2,#2
b.lo .Lctr32_tail
#ifndef __ARMEB__
rev w8, w8
#endif
orr v1.16b,v0.16b,v0.16b
add w8, w8, #1
orr v6.16b,v0.16b,v0.16b
rev w10, w8
cmp w5,#2
mov v1.s[3],w10
b.eq .Lctr32_128
.Loop2x_ctr32:
aese v0.16b,v16.16b
aese v1.16b,v16.16b
ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
aesmc v1.16b,v1.16b
subs w6,w6,#2
aese v0.16b,v17.16b
aese v1.16b,v17.16b
ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
aesmc v1.16b,v1.16b
b.gt .Loop2x_ctr32
aese v0.16b,v16.16b
aese v1.16b,v16.16b
aesmc v4.16b,v0.16b
orr v0.16b,v6.16b,v6.16b
aesmc v5.16b,v1.16b
orr v1.16b,v6.16b,v6.16b
aese v4.16b,v17.16b
aese v5.16b,v17.16b
ld1 {v2.16b},[x0],#16
aesmc v4.16b,v4.16b
ld1 {v3.16b},[x0],#16
aesmc v5.16b,v5.16b
add w8,w8,#1
aese v4.16b,v18.16b
aese v5.16b,v18.16b
rev w9,w8
aesmc v4.16b,v4.16b
aesmc v5.16b,v5.16b
add w8,w8,#1
aese v4.16b,v19.16b
aese v5.16b,v19.16b
eor v2.16b,v2.16b,v7.16b
rev w10,w8
aesmc v4.16b,v4.16b
aesmc v5.16b,v5.16b
eor v3.16b,v3.16b,v7.16b
mov x7,x3
aese v4.16b,v20.16b
aese v5.16b,v20.16b
subs x2,x2,#2
aesmc v4.16b,v4.16b
aesmc v5.16b,v5.16b
ld1 {v16.4s-v17.4s},[x7],#32 // re-pre-load rndkey[0-1]
aese v4.16b,v21.16b
aese v5.16b,v21.16b
aesmc v4.16b,v4.16b
aesmc v5.16b,v5.16b
aese v4.16b,v22.16b
aese v5.16b,v22.16b
mov v0.s[3], w9
aesmc v4.16b,v4.16b
mov v1.s[3], w10
aesmc v5.16b,v5.16b
aese v4.16b,v23.16b
aese v5.16b,v23.16b
mov w6,w5
eor v2.16b,v2.16b,v4.16b
eor v3.16b,v3.16b,v5.16b
st1 {v2.16b},[x1],#16
st1 {v3.16b},[x1],#16
b.hs .Loop2x_ctr32
adds x2,x2,#2
b.eq .Lctr32_done
b .Lctr32_tail
.Lctr32_128:
ld1 {v4.4s-v5.4s},[x7]
.Loop2x_ctr32_128:
aese v0.16b,v16.16b
aese v1.16b,v16.16b
aesmc v0.16b,v0.16b
ld1 {v2.16b},[x0],#16
aesmc v1.16b,v1.16b
ld1 {v3.16b},[x0],#16
aese v0.16b,v17.16b
aese v1.16b,v17.16b
add w8,w8,#1
aesmc v0.16b,v0.16b
aesmc v1.16b,v1.16b
rev w9,w8
aese v0.16b,v4.16b
aese v1.16b,v4.16b
add w8,w8,#1
aesmc v0.16b,v0.16b
aesmc v1.16b,v1.16b
rev w10,w8
aese v0.16b,v5.16b
aese v1.16b,v5.16b
subs x2,x2,#2
aesmc v0.16b,v0.16b
aesmc v1.16b,v1.16b
aese v0.16b,v18.16b
aese v1.16b,v18.16b
aesmc v0.16b,v0.16b
aesmc v1.16b,v1.16b
aese v0.16b,v19.16b
aese v1.16b,v19.16b
aesmc v0.16b,v0.16b
aesmc v1.16b,v1.16b
aese v0.16b,v20.16b
aese v1.16b,v20.16b
aesmc v0.16b,v0.16b
aesmc v1.16b,v1.16b
aese v0.16b,v21.16b
aese v1.16b,v21.16b
aesmc v0.16b,v0.16b
aesmc v1.16b,v1.16b
aese v0.16b,v22.16b
aese v1.16b,v22.16b
aesmc v0.16b,v0.16b
aesmc v1.16b,v1.16b
eor v2.16b,v2.16b,v7.16b
aese v0.16b,v23.16b
eor v3.16b,v3.16b,v7.16b
aese v1.16b,v23.16b
eor v2.16b,v2.16b,v0.16b
orr v0.16b,v6.16b,v6.16b
eor v3.16b,v3.16b,v1.16b
orr v1.16b,v6.16b,v6.16b
st1 {v2.16b},[x1],#16
mov v0.s[3], w9
st1 {v3.16b},[x1],#16
mov v1.s[3], w10
b.hs .Loop2x_ctr32_128
adds x2,x2,#2
b.eq .Lctr32_done
.Lctr32_tail:
aese v0.16b,v16.16b
ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
subs w6,w6,#2
aese v0.16b,v17.16b
ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
b.gt .Lctr32_tail
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
ld1 {v2.16b},[x0]
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
eor v2.16b,v2.16b,v7.16b
aese v0.16b,v23.16b
eor v2.16b,v2.16b,v0.16b
st1 {v2.16b},[x1]
.Lctr32_done:
ldr x29,[sp],#16
ret
.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
#endif

View File

@@ -0,0 +1,767 @@
#include "arm_arch.h"
#if __ARM_ARCH__>=7
.text
.fpu neon
.code 32
.align 5
rcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.globl aes_v8_set_encrypt_key
.type aes_v8_set_encrypt_key,%function
.align 5
aes_v8_set_encrypt_key:
.Lenc_key:
adr r3,rcon
cmp r1,#192
veor q0,q0,q0
vld1.8 {q3},[r0]!
mov r1,#8 @ reuse r1
vld1.32 {q1,q2},[r3]!
blt .Loop128
beq .L192
b .L256
.align 4
.Loop128:
vtbl.8 d20,{q3},d4
vtbl.8 d21,{q3},d5
vext.8 q9,q0,q3,#12
vst1.32 {q3},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
subs r1,r1,#1
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q10,q10,q1
veor q3,q3,q9
vshl.u8 q1,q1,#1
veor q3,q3,q10
bne .Loop128
vld1.32 {q1},[r3]
vtbl.8 d20,{q3},d4
vtbl.8 d21,{q3},d5
vext.8 q9,q0,q3,#12
vst1.32 {q3},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q10,q10,q1
veor q3,q3,q9
vshl.u8 q1,q1,#1
veor q3,q3,q10
vtbl.8 d20,{q3},d4
vtbl.8 d21,{q3},d5
vext.8 q9,q0,q3,#12
vst1.32 {q3},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q10,q10,q1
veor q3,q3,q9
veor q3,q3,q10
vst1.32 {q3},[r2]
add r2,r2,#0x50
mov r12,#10
b .Ldone
.align 4
.L192:
vld1.8 {d16},[r0]!
vmov.i8 q10,#8 @ borrow q10
vst1.32 {q3},[r2]!
vsub.i8 q2,q2,q10 @ adjust the mask
.Loop192:
vtbl.8 d20,{q8},d4
vtbl.8 d21,{q8},d5
vext.8 q9,q0,q3,#12
vst1.32 {d16},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
subs r1,r1,#1
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vdup.32 q9,d7[1]
veor q9,q9,q8
veor q10,q10,q1
vext.8 q8,q0,q8,#12
vshl.u8 q1,q1,#1
veor q8,q8,q9
veor q3,q3,q10
veor q8,q8,q10
vst1.32 {q3},[r2]!
bne .Loop192
mov r12,#12
add r2,r2,#0x20
b .Ldone
.align 4
.L256:
vld1.8 {q8},[r0]
mov r1,#7
mov r12,#14
vst1.32 {q3},[r2]!
.Loop256:
vtbl.8 d20,{q8},d4
vtbl.8 d21,{q8},d5
vext.8 q9,q0,q3,#12
vst1.32 {q8},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
subs r1,r1,#1
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q10,q10,q1
veor q3,q3,q9
vshl.u8 q1,q1,#1
veor q3,q3,q10
vst1.32 {q3},[r2]!
beq .Ldone
vdup.32 q10,d7[1]
vext.8 q9,q0,q8,#12
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
veor q8,q8,q9
vext.8 q9,q0,q9,#12
veor q8,q8,q9
vext.8 q9,q0,q9,#12
veor q8,q8,q9
veor q8,q8,q10
b .Loop256
.Ldone:
str r12,[r2]
eor r0,r0,r0 @ return value
bx lr
.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
.globl aes_v8_set_decrypt_key
.type aes_v8_set_decrypt_key,%function
.align 5
aes_v8_set_decrypt_key:
stmdb sp!,{r4,lr}
bl .Lenc_key
sub r2,r2,#240 @ restore original r2
mov r4,#-16
add r0,r2,r12,lsl#4 @ end of key schedule
vld1.32 {q0},[r2]
vld1.32 {q1},[r0]
vst1.32 {q0},[r0],r4
vst1.32 {q1},[r2]!
.Loop_imc:
vld1.32 {q0},[r2]
vld1.32 {q1},[r0]
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
vst1.32 {q0},[r0],r4
vst1.32 {q1},[r2]!
cmp r0,r2
bhi .Loop_imc
vld1.32 {q0},[r2]
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
vst1.32 {q0},[r0]
eor r0,r0,r0 @ return value
ldmia sp!,{r4,pc}
.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
.globl aes_v8_encrypt
.type aes_v8_encrypt,%function
.align 5
aes_v8_encrypt:
ldr r3,[r2,#240]
vld1.32 {q0},[r2]!
vld1.8 {q2},[r0]
sub r3,r3,#2
vld1.32 {q1},[r2]!
.Loop_enc:
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
vld1.32 {q0},[r2]!
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
subs r3,r3,#2
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
vld1.32 {q1},[r2]!
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
bgt .Loop_enc
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
vld1.32 {q0},[r2]
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
veor q2,q2,q0
vst1.8 {q2},[r1]
bx lr
.size aes_v8_encrypt,.-aes_v8_encrypt
.globl aes_v8_decrypt
.type aes_v8_decrypt,%function
.align 5
aes_v8_decrypt:
ldr r3,[r2,#240]
vld1.32 {q0},[r2]!
vld1.8 {q2},[r0]
sub r3,r3,#2
vld1.32 {q1},[r2]!
.Loop_dec:
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
vld1.32 {q0},[r2]!
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
subs r3,r3,#2
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
vld1.32 {q1},[r2]!
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
bgt .Loop_dec
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
vld1.32 {q0},[r2]
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
veor q2,q2,q0
vst1.8 {q2},[r1]
bx lr
.size aes_v8_decrypt,.-aes_v8_decrypt
.globl aes_v8_cbc_encrypt
.type aes_v8_cbc_encrypt,%function
.align 5
aes_v8_cbc_encrypt:
mov ip,sp
stmdb sp!,{r4-r8,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load remaining args
subs r2,r2,#16
mov r8,#16
blo .Lcbc_abort
moveq r8,#0
cmp r5,#0 @ en- or decrypting?
ldr r5,[r3,#240]
and r2,r2,#-16
vld1.8 {q6},[r4]
vld1.8 {q0},[r0],r8
vld1.32 {q8-q9},[r3] @ load key schedule...
sub r5,r5,#6
add r7,r3,r5,lsl#4 @ pointer to last 7 round keys
sub r5,r5,#2
vld1.32 {q10-q11},[r7]!
vld1.32 {q12-q13},[r7]!
vld1.32 {q14-q15},[r7]!
vld1.32 {q7},[r7]
add r7,r3,#32
mov r6,r5
beq .Lcbc_dec
cmp r5,#2
veor q0,q0,q6
veor q5,q8,q7
beq .Lcbc_enc128
.Loop_cbc_enc:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
bgt .Loop_cbc_enc
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r2,r2,#16
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
moveq r8,#0
.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
add r7,r3,#16
.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.8 {q8},[r0],r8
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
veor q8,q8,q5
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
mov r6,r5
veor q6,q0,q7
vst1.8 {q6},[r1]!
bhs .Loop_cbc_enc
b .Lcbc_done
.align 5
.Lcbc_enc128:
vld1.32 {q2-q3},[r7]
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
b .Lenter_cbc_enc128
.Loop_cbc_enc128:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vst1.8 {q6},[r1]!
.Lenter_cbc_enc128:
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r2,r2,#16
.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
moveq r8,#0
.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.8 {q8},[r0],r8
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
veor q8,q8,q5
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
veor q6,q0,q7
bhs .Loop_cbc_enc128
vst1.8 {q6},[r1]!
b .Lcbc_done
.align 5
.Lcbc_dec128:
vld1.32 {q4-q5},[r7]
veor q6,q6,q7
veor q2,q0,q7
mov r12,r8
.Loop2x_cbc_dec128:
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
subs r2,r2,#32
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
movlo r8,#0
.byte 0x48,0x03,0xb0,0xf3 @ aesd q0,q4
.byte 0x48,0x23,0xb0,0xf3 @ aesd q1,q4
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
movls r12,#0
.byte 0x4a,0x03,0xb0,0xf3 @ aesd q0,q5
.byte 0x4a,0x23,0xb0,0xf3 @ aesd q1,q5
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10
.byte 0x64,0x23,0xb0,0xf3 @ aesd q1,q10
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11
.byte 0x66,0x23,0xb0,0xf3 @ aesd q1,q11
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
veor q6,q6,q0
vld1.8 {q0},[r0],r8
veor q2,q2,q1
vld1.8 {q1},[r0],r12
vst1.8 {q6},[r1]!
veor q6,q3,q7
vst1.8 {q2},[r1]!
veor q2,q0,q7
vorr q3,q1,q1
bhs .Loop2x_cbc_dec128
adds r2,r2,#32
veor q6,q6,q7
beq .Lcbc_done
veor q2,q2,q7
b .Lcbc_dec_tail
.align 5
.Lcbc_dec:
subs r2,r2,#16
vorr q2,q0,q0
blo .Lcbc_dec_tail
moveq r8,#0
cmp r5,#2
vld1.8 {q1},[r0],r8
vorr q3,q1,q1
beq .Lcbc_dec128
.Loop2x_cbc_dec:
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
vld1.32 {q8},[r7]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
subs r6,r6,#2
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
vld1.32 {q9},[r7]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
bgt .Loop2x_cbc_dec
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
veor q4,q6,q7
veor q5,q2,q7
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
vorr q6,q3,q3
subs r2,r2,#32
.byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10
.byte 0x64,0x23,0xb0,0xf3 @ aesd q1,q10
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
movlo r8,#0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
mov r7,r3
.byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11
.byte 0x66,0x23,0xb0,0xf3 @ aesd q1,q11
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
vld1.8 {q2},[r0],r8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
movls r8,#0
.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
vld1.8 {q3},[r0],r8
.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
mov r6,r5
veor q4,q4,q0
veor q5,q5,q1
vorr q0,q2,q2
vst1.8 {q4},[r1]!
vorr q1,q3,q3
vst1.8 {q5},[r1]!
bhs .Loop2x_cbc_dec
adds r2,r2,#32
beq .Lcbc_done
.Lcbc_dec_tail:
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
vld1.32 {q8},[r7]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
subs r6,r6,#2
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
vld1.32 {q9},[r7]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
bgt .Lcbc_dec_tail
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
veor q4,q6,q7
.byte 0x64,0x03,0xb0,0xf3 @ aesd q0,q10
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
vorr q6,q2,q2
.byte 0x66,0x03,0xb0,0xf3 @ aesd q0,q11
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
veor q4,q4,q0
vst1.8 {q4},[r1]!
.Lcbc_done:
vst1.8 {q6},[r4]
.Lcbc_abort:
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r8,pc}
.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
.globl aes_v8_ctr32_encrypt_blocks
.type aes_v8_ctr32_encrypt_blocks,%function
.align 5
aes_v8_ctr32_encrypt_blocks:
mov ip,sp
stmdb sp!,{r4-r10,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldr r4, [ip] @ load remaining arg
ldr r5,[r3,#240]
ldr r8, [r4, #12]
vld1.32 {q0},[r4]
vld1.32 {q8-q9},[r3] @ load key schedule...
sub r5,r5,#6
add r7,r3,r5,lsl#4 @ pointer to last 7 round keys
sub r5,r5,#2
vld1.32 {q10-q11},[r7]!
vld1.32 {q12-q13},[r7]!
vld1.32 {q14-q15},[r7]!
vld1.32 {q7},[r7]
add r7,r3,#32
mov r6,r5
subs r2,r2,#2
blo .Lctr32_tail
#ifndef __ARMEB__
rev r8, r8
#endif
vorr q1,q0,q0
add r8, r8, #1
vorr q6,q0,q0
rev r10, r8
cmp r5,#2
vmov.32 d3[1],r10
beq .Lctr32_128
.Loop2x_ctr32:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
bgt .Loop2x_ctr32
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0
vorr q0,q6,q6
.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
vorr q1,q6,q6
.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
vld1.8 {q2},[r0]!
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
vld1.8 {q3},[r0]!
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
add r8,r8,#1
.byte 0x24,0x83,0xb0,0xf3 @ aese q4,q10
.byte 0x24,0xa3,0xb0,0xf3 @ aese q5,q10
rev r9,r8
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
add r8,r8,#1
.byte 0x26,0x83,0xb0,0xf3 @ aese q4,q11
.byte 0x26,0xa3,0xb0,0xf3 @ aese q5,q11
veor q2,q2,q7
rev r10,r8
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
veor q3,q3,q7
mov r7,r3
.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
subs r2,r2,#2
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
vld1.32 {q8-q9},[r7]! @ re-pre-load rndkey[0-1]
.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13
.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
vmov.32 d1[1], r9
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
vmov.32 d3[1], r10
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15
.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
mov r6,r5
veor q2,q2,q4
veor q3,q3,q5
vst1.8 {q2},[r1]!
vst1.8 {q3},[r1]!
bhs .Loop2x_ctr32
adds r2,r2,#2
beq .Lctr32_done
b .Lctr32_tail
.Lctr32_128:
vld1.32 {q4-q5},[r7]
.Loop2x_ctr32_128:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.8 {q2},[r0]!
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.8 {q3},[r0]!
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
add r8,r8,#1
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
rev r9,r8
.byte 0x08,0x03,0xb0,0xf3 @ aese q0,q4
.byte 0x08,0x23,0xb0,0xf3 @ aese q1,q4
add r8,r8,#1
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
rev r10,r8
.byte 0x0a,0x03,0xb0,0xf3 @ aese q0,q5
.byte 0x0a,0x23,0xb0,0xf3 @ aese q1,q5
subs r2,r2,#2
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
.byte 0x24,0x23,0xb0,0xf3 @ aese q1,q10
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
.byte 0x26,0x23,0xb0,0xf3 @ aese q1,q11
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
veor q2,q2,q7
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
veor q3,q3,q7
.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15
veor q2,q2,q0
vorr q0,q6,q6
veor q3,q3,q1
vorr q1,q6,q6
vst1.8 {q2},[r1]!
vmov.32 d1[1], r9
vst1.8 {q3},[r1]!
vmov.32 d3[1], r10
bhs .Loop2x_ctr32_128
adds r2,r2,#2
beq .Lctr32_done
.Lctr32_tail:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
bgt .Lctr32_tail
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.8 {q2},[r0]
.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
veor q2,q2,q7
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
veor q2,q2,q0
vst1.8 {q2},[r1]
.Lctr32_done:
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r10,pc}
.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
#endif

View File

@@ -0,0 +1,980 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in sense that it supports both big- and
# little-endian cases. As does it support both 32- and 64-bit modes
# of operation. Latter is achieved by limiting amount of utilized
# registers to 16, which implies additional instructions. This has
# no effect on mighty Apple A7, as results are literally equal to
# the theoretical estimates based on instruction latencies and issue
# rate. It remains to be seen how does it affect other platforms...
#
# Performance in cycles per byte processed with 128-bit key:
#
# CBC enc CBC dec CTR
# Apple A7 2.39 1.20 1.20
# Cortex-A5x n/a n/a n/a
$flavour = shift;
open STDOUT,">".shift;
$prefix="aes_v8";
$code=<<___;
#include "arm_arch.h"
#if __ARM_ARCH__>=7
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex vodoo.
#
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
$code.=<<___;
.align 5
rcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___;
adr $ptr,rcon
cmp $bits,#192
veor $zero,$zero,$zero
vld1.8 {$in0},[$inp],#16
mov $bits,#8 // reuse $bits
vld1.32 {$rcon,$mask},[$ptr],#32
b.lt .Loop128
b.eq .L192
b .L256
.align 4
.Loop128:
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
subs $bits,$bits,#1
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
b.ne .Loop128
vld1.32 {$rcon},[$ptr]
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
veor $in0,$in0,$key
vst1.32 {$in0},[$out]
add $out,$out,#0x50
mov $rounds,#10
b .Ldone
.align 4
.L192:
vld1.8 {$in1},[$inp],#8
vmov.i8 $key,#8 // borrow $key
vst1.32 {$in0},[$out],#16
vsub.i8 $mask,$mask,$key // adjust the mask
.Loop192:
vtbl.8 $key,{$in1},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in1},[$out],#8
aese $key,$zero
subs $bits,$bits,#1
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vdup.32 $tmp,${in0}[3]
veor $tmp,$tmp,$in1
veor $key,$key,$rcon
vext.8 $in1,$zero,$in1,#12
vshl.u8 $rcon,$rcon,#1
veor $in1,$in1,$tmp
veor $in0,$in0,$key
veor $in1,$in1,$key
vst1.32 {$in0},[$out],#16
b.ne .Loop192
mov $rounds,#12
add $out,$out,#0x20
b .Ldone
.align 4
.L256:
vld1.8 {$in1},[$inp]
mov $bits,#7
mov $rounds,#14
vst1.32 {$in0},[$out],#16
.Loop256:
vtbl.8 $key,{$in1},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in1},[$out],#16
aese $key,$zero
subs $bits,$bits,#1
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
vst1.32 {$in0},[$out],#16
b.eq .Ldone
vdup.32 $key,${in0}[3] // just splat
vext.8 $tmp,$zero,$in1,#12
aese $key,$zero
veor $in1,$in1,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in1,$in1,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in1,$in1,$tmp
veor $in1,$in1,$key
b .Loop256
.Ldone:
str $rounds,[$out]
eor x0,x0,x0 // return value
`"ldr x29,[sp],#16" if ($flavour =~ /64/)`
ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
___
$code.=<<___ if ($flavour =~ /64/);
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
stmdb sp!,{r4,lr}
___
$code.=<<___;
bl .Lenc_key
sub $out,$out,#240 // restore original $out
mov x4,#-16
add $inp,$out,x12,lsl#4 // end of key schedule
vld1.32 {v0.16b},[$out]
vld1.32 {v1.16b},[$inp]
vst1.32 {v0.16b},[$inp],x4
vst1.32 {v1.16b},[$out],#16
.Loop_imc:
vld1.32 {v0.16b},[$out]
vld1.32 {v1.16b},[$inp]
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
vst1.32 {v0.16b},[$inp],x4
vst1.32 {v1.16b},[$out],#16
cmp $inp,$out
b.hi .Loop_imc
vld1.32 {v0.16b},[$out]
aesimc v0.16b,v0.16b
vst1.32 {v0.16b},[$inp]
eor x0,x0,x0 // return value
___
$code.=<<___ if ($flavour !~ /64/);
ldmia sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
ldp x29,x30,[sp],#16
ret
___
$code.=<<___;
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
$code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
ldr $rounds,[$key,#240]
vld1.32 {$rndkey0},[$key],#16
vld1.8 {$inout},[$inp]
sub $rounds,$rounds,#2
vld1.32 {$rndkey1},[$key],#16
.Loop_${dir}c:
aes$e $inout,$rndkey0
vld1.32 {$rndkey0},[$key],#16
aes$mc $inout,$inout
subs $rounds,$rounds,#2
aes$e $inout,$rndkey1
vld1.32 {$rndkey1},[$key],#16
aes$mc $inout,$inout
b.gt .Loop_${dir}c
aes$e $inout,$rndkey0
vld1.32 {$rndkey0},[$key]
aes$mc $inout,$inout
aes$e $inout,$rndkey1
veor $inout,$inout,$rndkey0
vst1.8 {$inout},[$out]
ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
### q8-q15 preloaded key schedule
$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
mov ip,sp
stmdb sp!,{r4-r8,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load remaining args
___
$code.=<<___;
subs $len,$len,#16
mov $step,#16
b.lo .Lcbc_abort
cclr $step,eq
cmp $enc,#0 // en- or decrypting?
ldr $rounds,[$key,#240]
and $len,$len,#-16
vld1.8 {$ivec},[$ivp]
vld1.8 {$dat},[$inp],$step
vld1.32 {q8-q9},[$key] // load key schedule...
sub $rounds,$rounds,#6
add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
sub $rounds,$rounds,#2
vld1.32 {q10-q11},[$key_],#32
vld1.32 {q12-q13},[$key_],#32
vld1.32 {q14-q15},[$key_],#32
vld1.32 {$rndlast},[$key_]
add $key_,$key,#32
mov $cnt,$rounds
b.eq .Lcbc_dec
cmp $rounds,#2
veor $dat,$dat,$ivec
veor $rndzero_n_last,q8,$rndlast
b.eq .Lcbc_enc128
.Loop_cbc_enc:
aese $dat,q8
vld1.32 {q8},[$key_],#16
aesmc $dat,$dat
subs $cnt,$cnt,#2
aese $dat,q9
vld1.32 {q9},[$key_],#16
aesmc $dat,$dat
b.gt .Loop_cbc_enc
aese $dat,q8
aesmc $dat,$dat
subs $len,$len,#16
aese $dat,q9
aesmc $dat,$dat
cclr $step,eq
aese $dat,q10
aesmc $dat,$dat
add $key_,$key,#16
aese $dat,q11
aesmc $dat,$dat
vld1.8 {q8},[$inp],$step
aese $dat,q12
aesmc $dat,$dat
veor q8,q8,$rndzero_n_last
aese $dat,q13
aesmc $dat,$dat
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
aese $dat,q14
aesmc $dat,$dat
aese $dat,q15
mov $cnt,$rounds
veor $ivec,$dat,$rndlast
vst1.8 {$ivec},[$out],#16
b.hs .Loop_cbc_enc
b .Lcbc_done
.align 5
.Lcbc_enc128:
vld1.32 {$in0-$in1},[$key_]
aese $dat,q8
aesmc $dat,$dat
b .Lenter_cbc_enc128
.Loop_cbc_enc128:
aese $dat,q8
aesmc $dat,$dat
vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc128:
aese $dat,q9
aesmc $dat,$dat
subs $len,$len,#16
aese $dat,$in0
aesmc $dat,$dat
cclr $step,eq
aese $dat,$in1
aesmc $dat,$dat
aese $dat,q10
aesmc $dat,$dat
aese $dat,q11
aesmc $dat,$dat
vld1.8 {q8},[$inp],$step
aese $dat,q12
aesmc $dat,$dat
aese $dat,q13
aesmc $dat,$dat
aese $dat,q14
aesmc $dat,$dat
veor q8,q8,$rndzero_n_last
aese $dat,q15
veor $ivec,$dat,$rndlast
b.hs .Loop_cbc_enc128
vst1.8 {$ivec},[$out],#16
b .Lcbc_done
.align 5
.Lcbc_dec128:
vld1.32 {$tmp0-$tmp1},[$key_]
veor $ivec,$ivec,$rndlast
veor $in0,$dat0,$rndlast
mov $step1,$step
.Loop2x_cbc_dec128:
aesd $dat0,q8
aesd $dat1,q8
aesimc $dat0,$dat0
aesimc $dat1,$dat1
subs $len,$len,#32
aesd $dat0,q9
aesd $dat1,q9
aesimc $dat0,$dat0
aesimc $dat1,$dat1
cclr $step,lo
aesd $dat0,$tmp0
aesd $dat1,$tmp0
aesimc $dat0,$dat0
aesimc $dat1,$dat1
cclr $step1,ls
aesd $dat0,$tmp1
aesd $dat1,$tmp1
aesimc $dat0,$dat0
aesimc $dat1,$dat1
aesd $dat0,q10
aesd $dat1,q10
aesimc $dat0,$dat0
aesimc $dat1,$dat1
aesd $dat0,q11
aesd $dat1,q11
aesimc $dat0,$dat0
aesimc $dat1,$dat1
aesd $dat0,q12
aesd $dat1,q12
aesimc $dat0,$dat0
aesimc $dat1,$dat1
aesd $dat0,q13
aesd $dat1,q13
aesimc $dat0,$dat0
aesimc $dat1,$dat1
aesd $dat0,q14
aesd $dat1,q14
aesimc $dat0,$dat0
aesimc $dat1,$dat1
aesd $dat0,q15
aesd $dat1,q15
veor $ivec,$ivec,$dat0
vld1.8 {$dat0},[$inp],$step
veor $in0,$in0,$dat1
vld1.8 {$dat1},[$inp],$step1
vst1.8 {$ivec},[$out],#16
veor $ivec,$in1,$rndlast
vst1.8 {$in0},[$out],#16
veor $in0,$dat0,$rndlast
vorr $in1,$dat1,$dat1
b.hs .Loop2x_cbc_dec128
adds $len,$len,#32
veor $ivec,$ivec,$rndlast
b.eq .Lcbc_done
veor $in0,$in0,$rndlast
b .Lcbc_dec_tail
.align 5
.Lcbc_dec:
subs $len,$len,#16
vorr $in0,$dat,$dat
b.lo .Lcbc_dec_tail
cclr $step,eq
cmp $rounds,#2
vld1.8 {$dat1},[$inp],$step
vorr $in1,$dat1,$dat1
b.eq .Lcbc_dec128
.Loop2x_cbc_dec:
aesd $dat0,q8
aesd $dat1,q8
vld1.32 {q8},[$key_],#16
aesimc $dat0,$dat0
aesimc $dat1,$dat1
subs $cnt,$cnt,#2
aesd $dat0,q9
aesd $dat1,q9
vld1.32 {q9},[$key_],#16
aesimc $dat0,$dat0
aesimc $dat1,$dat1
b.gt .Loop2x_cbc_dec
aesd $dat0,q8
aesd $dat1,q8
aesimc $dat0,$dat0
aesimc $dat1,$dat1
veor $tmp0,$ivec,$rndlast
veor $tmp1,$in0,$rndlast
aesd $dat0,q9
aesd $dat1,q9
aesimc $dat0,$dat0
aesimc $dat1,$dat1
vorr $ivec,$in1,$in1
subs $len,$len,#32
aesd $dat0,q10
aesd $dat1,q10
aesimc $dat0,$dat0
cclr $step,lo
aesimc $dat1,$dat1
mov $key_,$key
aesd $dat0,q11
aesd $dat1,q11
aesimc $dat0,$dat0
vld1.8 {$in0},[$inp],$step
aesimc $dat1,$dat1
cclr $step,ls
aesd $dat0,q12
aesd $dat1,q12
aesimc $dat0,$dat0
aesimc $dat1,$dat1
vld1.8 {$in1},[$inp],$step
aesd $dat0,q13
aesd $dat1,q13
aesimc $dat0,$dat0
aesimc $dat1,$dat1
vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
aesd $dat0,q14
aesd $dat1,q14
aesimc $dat0,$dat0
aesimc $dat1,$dat1
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
aesd $dat0,q15
aesd $dat1,q15
mov $cnt,$rounds
veor $tmp0,$tmp0,$dat0
veor $tmp1,$tmp1,$dat1
vorr $dat0,$in0,$in0
vst1.8 {$tmp0},[$out],#16
vorr $dat1,$in1,$in1
vst1.8 {$tmp1},[$out],#16
b.hs .Loop2x_cbc_dec
adds $len,$len,#32
b.eq .Lcbc_done
.Lcbc_dec_tail:
aesd $dat,q8
vld1.32 {q8},[$key_],#16
aesimc $dat,$dat
subs $cnt,$cnt,#2
aesd $dat,q9
vld1.32 {q9},[$key_],#16
aesimc $dat,$dat
b.gt .Lcbc_dec_tail
aesd $dat,q8
aesimc $dat,$dat
aesd $dat,q9
aesimc $dat,$dat
veor $tmp,$ivec,$rndlast
aesd $dat,q10
aesimc $dat,$dat
vorr $ivec,$in0,$in0
aesd $dat,q11
aesimc $dat,$dat
aesd $dat,q12
aesimc $dat,$dat
aesd $dat,q13
aesimc $dat,$dat
aesd $dat,q14
aesimc $dat,$dat
aesd $dat,q15
veor $tmp,$tmp,$dat
vst1.8 {$tmp},[$out],#16
.Lcbc_done:
vst1.8 {$ivec},[$ivp]
.Lcbc_abort:
___
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
ldr x29,[sp],#16
ret
___
$code.=<<___;
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat,$tmp)=($dat0,$tmp0);
### q8-q15 preloaded key schedule
$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
mov ip,sp
stmdb sp!,{r4-r10,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldr r4, [ip] @ load remaining arg
___
$code.=<<___;
ldr $rounds,[$key,#240]
ldr $ctr, [$ivp, #12]
vld1.32 {$dat0},[$ivp]
vld1.32 {q8-q9},[$key] // load key schedule...
sub $rounds,$rounds,#6
add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
sub $rounds,$rounds,#2
vld1.32 {q10-q11},[$key_],#32
vld1.32 {q12-q13},[$key_],#32
vld1.32 {q14-q15},[$key_],#32
vld1.32 {$rndlast},[$key_]
add $key_,$key,#32
mov $cnt,$rounds
subs $len,$len,#2
b.lo .Lctr32_tail
#ifndef __ARMEB__
rev $ctr, $ctr
#endif
vorr $dat1,$dat0,$dat0
add $ctr, $ctr, #1
vorr $ivec,$dat0,$dat0
rev $tctr1, $ctr
cmp $rounds,#2
vmov.32 ${dat1}[3],$tctr1
b.eq .Lctr32_128
.Loop2x_ctr32:
aese $dat0,q8
aese $dat1,q8
vld1.32 {q8},[$key_],#16
aesmc $dat0,$dat0
aesmc $dat1,$dat1
subs $cnt,$cnt,#2
aese $dat0,q9
aese $dat1,q9
vld1.32 {q9},[$key_],#16
aesmc $dat0,$dat0
aesmc $dat1,$dat1
b.gt .Loop2x_ctr32
aese $dat0,q8
aese $dat1,q8
aesmc $tmp0,$dat0
vorr $dat0,$ivec,$ivec
aesmc $tmp1,$dat1
vorr $dat1,$ivec,$ivec
aese $tmp0,q9
aese $tmp1,q9
vld1.8 {$in0},[$inp],#16
aesmc $tmp0,$tmp0
vld1.8 {$in1},[$inp],#16
aesmc $tmp1,$tmp1
add $ctr,$ctr,#1
aese $tmp0,q10
aese $tmp1,q10
rev $tctr,$ctr
aesmc $tmp0,$tmp0
aesmc $tmp1,$tmp1
add $ctr,$ctr,#1
aese $tmp0,q11
aese $tmp1,q11
veor $in0,$in0,$rndlast
rev $tctr1,$ctr
aesmc $tmp0,$tmp0
aesmc $tmp1,$tmp1
veor $in1,$in1,$rndlast
mov $key_,$key
aese $tmp0,q12
aese $tmp1,q12
subs $len,$len,#2
aesmc $tmp0,$tmp0
aesmc $tmp1,$tmp1
vld1.32 {q8-q9},[$key_],#32 // re-pre-load rndkey[0-1]
aese $tmp0,q13
aese $tmp1,q13
aesmc $tmp0,$tmp0
aesmc $tmp1,$tmp1
aese $tmp0,q14
aese $tmp1,q14
vmov.32 ${dat0}[3], $tctr
aesmc $tmp0,$tmp0
vmov.32 ${dat1}[3], $tctr1
aesmc $tmp1,$tmp1
aese $tmp0,q15
aese $tmp1,q15
mov $cnt,$rounds
veor $in0,$in0,$tmp0
veor $in1,$in1,$tmp1
vst1.8 {$in0},[$out],#16
vst1.8 {$in1},[$out],#16
b.hs .Loop2x_ctr32
adds $len,$len,#2
b.eq .Lctr32_done
b .Lctr32_tail
.Lctr32_128:
vld1.32 {$tmp0-$tmp1},[$key_]
.Loop2x_ctr32_128:
aese $dat0,q8
aese $dat1,q8
aesmc $dat0,$dat0
vld1.8 {$in0},[$inp],#16
aesmc $dat1,$dat1
vld1.8 {$in1},[$inp],#16
aese $dat0,q9
aese $dat1,q9
add $ctr,$ctr,#1
aesmc $dat0,$dat0
aesmc $dat1,$dat1
rev $tctr,$ctr
aese $dat0,$tmp0
aese $dat1,$tmp0
add $ctr,$ctr,#1
aesmc $dat0,$dat0
aesmc $dat1,$dat1
rev $tctr1,$ctr
aese $dat0,$tmp1
aese $dat1,$tmp1
subs $len,$len,#2
aesmc $dat0,$dat0
aesmc $dat1,$dat1
aese $dat0,q10
aese $dat1,q10
aesmc $dat0,$dat0
aesmc $dat1,$dat1
aese $dat0,q11
aese $dat1,q11
aesmc $dat0,$dat0
aesmc $dat1,$dat1
aese $dat0,q12
aese $dat1,q12
aesmc $dat0,$dat0
aesmc $dat1,$dat1
aese $dat0,q13
aese $dat1,q13
aesmc $dat0,$dat0
aesmc $dat1,$dat1
aese $dat0,q14
aese $dat1,q14
aesmc $dat0,$dat0
aesmc $dat1,$dat1
veor $in0,$in0,$rndlast
aese $dat0,q15
veor $in1,$in1,$rndlast
aese $dat1,q15
veor $in0,$in0,$dat0
vorr $dat0,$ivec,$ivec
veor $in1,$in1,$dat1
vorr $dat1,$ivec,$ivec
vst1.8 {$in0},[$out],#16
vmov.32 ${dat0}[3], $tctr
vst1.8 {$in1},[$out],#16
vmov.32 ${dat1}[3], $tctr1
b.hs .Loop2x_ctr32_128
adds $len,$len,#2
b.eq .Lctr32_done
.Lctr32_tail:
aese $dat,q8
vld1.32 {q8},[$key_],#16
aesmc $dat,$dat
subs $cnt,$cnt,#2
aese $dat,q9
vld1.32 {q9},[$key_],#16
aesmc $dat,$dat
b.gt .Lctr32_tail
aese $dat,q8
aesmc $dat,$dat
aese $dat,q9
aesmc $dat,$dat
vld1.8 {$in0},[$inp]
aese $dat,q10
aesmc $dat,$dat
aese $dat,q11
aesmc $dat,$dat
aese $dat,q12
aesmc $dat,$dat
aese $dat,q13
aesmc $dat,$dat
aese $dat,q14
aesmc $dat,$dat
veor $in0,$in0,$rndlast
aese $dat,q15
veor $in0,$in0,$dat
vst1.8 {$in0},[$out]
.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
ldr x29,[sp],#16
ret
___
$code.=<<___;
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) { ######## 64-bit code
my %opcode = (
"aesd" => 0x4e285800, "aese" => 0x4e284800,
"aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
local *unaes = sub {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5),
$mnemonic,$arg;
};
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary
#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
s/vmov\.i8/movi/o or # fix up legacy mnemonics
s/vext\.8/ext/o or
s/vrev32\.8/rev32/o or
s/vtst\.8/cmtst/o or
s/vshr/ushr/o or
s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
# fix up remainig legacy suffixes
s/\.[ui]?8//o;
m/\],#8/o and s/\.16b/\.8b/go;
s/\.[ui]?32//o and s/\.16b/\.4s/go;
s/\.[ui]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
print $_,"\n";
}
} else { ######## 32-bit code
my %opcode = (
"aesd" => 0xf3b00340, "aese" => 0xf3b00300,
"aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
local *unaes = sub {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<1) |(($2&8)<<2);
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
};
sub unvtbl {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
}
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvmov32 {
my $arg=shift;
$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
}
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary
# fix up remainig new-style suffixes
s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
s/\],#[0-9]+/]!/o;
s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/vmov\.32\s+(.*)/unvmov32($1)/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)ret/$1bx\tlr/o;
print $_,"\n";
}
}
close STDOUT;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,661 @@
.file "vpaes-x86.s"
.text
.align 64
.L_vpaes_consts:
.long 218628480,235210255,168496130,67568393
.long 252381056,17041926,33884169,51187212
.long 252645135,252645135,252645135,252645135
.long 1512730624,3266504856,1377990664,3401244816
.long 830229760,1275146365,2969422977,3447763452
.long 3411033600,2979783055,338359620,2782886510
.long 4209124096,907596821,221174255,1006095553
.long 191964160,3799684038,3164090317,1589111125
.long 182528256,1777043520,2877432650,3265356744
.long 1874708224,3503451415,3305285752,363511674
.long 1606117888,3487855781,1093350906,2384367825
.long 197121,67569157,134941193,202313229
.long 67569157,134941193,202313229,197121
.long 134941193,202313229,197121,67569157
.long 202313229,197121,67569157,134941193
.long 33619971,100992007,168364043,235736079
.long 235736079,33619971,100992007,168364043
.long 168364043,235736079,33619971,100992007
.long 100992007,168364043,235736079,33619971
.long 50462976,117835012,185207048,252579084
.long 252314880,51251460,117574920,184942860
.long 184682752,252054788,50987272,118359308
.long 118099200,185467140,251790600,50727180
.long 2946363062,528716217,1300004225,1881839624
.long 1532713819,1532713819,1532713819,1532713819
.long 3602276352,4288629033,3737020424,4153884961
.long 1354558464,32357713,2958822624,3775749553
.long 1201988352,132424512,1572796698,503232858
.long 2213177600,1597421020,4103937655,675398315
.long 2749646592,4273543773,1511898873,121693092
.long 3040248576,1103263732,2871565598,1608280554
.long 2236667136,2588920351,482954393,64377734
.long 3069987328,291237287,2117370568,3650299247
.long 533321216,3573750986,2572112006,1401264716
.long 1339849704,2721158661,548607111,3445553514
.long 2128193280,3054596040,2183486460,1257083700
.long 655635200,1165381986,3923443150,2344132524
.long 190078720,256924420,290342170,357187870
.long 1610966272,2263057382,4103205268,309794674
.long 2592527872,2233205587,1335446729,3402964816
.long 3973531904,3225098121,3002836325,1918774430
.long 3870401024,2102906079,2284471353,4117666579
.long 617007872,1021508343,366931923,691083277
.long 2528395776,3491914898,2968704004,1613121270
.long 3445188352,3247741094,844474987,4093578302
.long 651481088,1190302358,1689581232,574775300
.long 4289380608,206939853,2555985458,2489840491
.long 2130264064,327674451,3566485037,3349835193
.long 2470714624,316102159,3636825756,3393945945
.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
.byte 118,101,114,115,105,116,121,41,0
.align 64
.type _vpaes_preheat,@function
.align 16
_vpaes_preheat:
addl (%esp),%ebp
movdqa -48(%ebp),%xmm7
movdqa -16(%ebp),%xmm6
ret
.size _vpaes_preheat,.-_vpaes_preheat
.type _vpaes_encrypt_core,@function
.align 16
_vpaes_encrypt_core:
movl $16,%ecx
movl 240(%edx),%eax
movdqa %xmm6,%xmm1
movdqa (%ebp),%xmm2
pandn %xmm0,%xmm1
movdqu (%edx),%xmm5
psrld $4,%xmm1
pand %xmm6,%xmm0
.byte 102,15,56,0,208
movdqa 16(%ebp),%xmm0
.byte 102,15,56,0,193
pxor %xmm5,%xmm2
pxor %xmm2,%xmm0
addl $16,%edx
leal 192(%ebp),%ebx
jmp .L000enc_entry
.align 16
.L001enc_loop:
movdqa 32(%ebp),%xmm4
.byte 102,15,56,0,226
pxor %xmm5,%xmm4
movdqa 48(%ebp),%xmm0
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
movdqa 64(%ebp),%xmm5
.byte 102,15,56,0,234
movdqa -64(%ebx,%ecx,1),%xmm1
movdqa 80(%ebp),%xmm2
.byte 102,15,56,0,211
pxor %xmm5,%xmm2
movdqa (%ebx,%ecx,1),%xmm4
movdqa %xmm0,%xmm3
.byte 102,15,56,0,193
addl $16,%edx
pxor %xmm2,%xmm0
.byte 102,15,56,0,220
addl $16,%ecx
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
andl $48,%ecx
pxor %xmm3,%xmm0
subl $1,%eax
.L000enc_entry:
movdqa %xmm6,%xmm1
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm6,%xmm0
movdqa -32(%ebp),%xmm5
.byte 102,15,56,0,232
pxor %xmm1,%xmm0
movdqa %xmm7,%xmm3
.byte 102,15,56,0,217
pxor %xmm5,%xmm3
movdqa %xmm7,%xmm4
.byte 102,15,56,0,224
pxor %xmm5,%xmm4
movdqa %xmm7,%xmm2
.byte 102,15,56,0,211
pxor %xmm0,%xmm2
movdqa %xmm7,%xmm3
movdqu (%edx),%xmm5
.byte 102,15,56,0,220
pxor %xmm1,%xmm3
jnz .L001enc_loop
movdqa 96(%ebp),%xmm4
movdqa 112(%ebp),%xmm0
.byte 102,15,56,0,226
pxor %xmm5,%xmm4
.byte 102,15,56,0,195
movdqa 64(%ebx,%ecx,1),%xmm1
pxor %xmm4,%xmm0
.byte 102,15,56,0,193
ret
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
.type _vpaes_decrypt_core,@function
.align 16
_vpaes_decrypt_core:
movl 240(%edx),%eax
leal 608(%ebp),%ebx
movdqa %xmm6,%xmm1
movdqa -64(%ebx),%xmm2
pandn %xmm0,%xmm1
movl %eax,%ecx
psrld $4,%xmm1
movdqu (%edx),%xmm5
shll $4,%ecx
pand %xmm6,%xmm0
.byte 102,15,56,0,208
movdqa -48(%ebx),%xmm0
xorl $48,%ecx
.byte 102,15,56,0,193
andl $48,%ecx
pxor %xmm5,%xmm2
movdqa 176(%ebp),%xmm5
pxor %xmm2,%xmm0
addl $16,%edx
leal -352(%ebx,%ecx,1),%ecx
jmp .L002dec_entry
.align 16
.L003dec_loop:
movdqa -32(%ebx),%xmm4
.byte 102,15,56,0,226
pxor %xmm0,%xmm4
movdqa -16(%ebx),%xmm0
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
addl $16,%edx
.byte 102,15,56,0,197
movdqa (%ebx),%xmm4
.byte 102,15,56,0,226
pxor %xmm0,%xmm4
movdqa 16(%ebx),%xmm0
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
subl $1,%eax
.byte 102,15,56,0,197
movdqa 32(%ebx),%xmm4
.byte 102,15,56,0,226
pxor %xmm0,%xmm4
movdqa 48(%ebx),%xmm0
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
.byte 102,15,56,0,197
movdqa 64(%ebx),%xmm4
.byte 102,15,56,0,226
pxor %xmm0,%xmm4
movdqa 80(%ebx),%xmm0
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
.byte 102,15,58,15,237,12
.L002dec_entry:
movdqa %xmm6,%xmm1
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm6,%xmm0
movdqa -32(%ebp),%xmm2
.byte 102,15,56,0,208
pxor %xmm1,%xmm0
movdqa %xmm7,%xmm3
.byte 102,15,56,0,217
pxor %xmm2,%xmm3
movdqa %xmm7,%xmm4
.byte 102,15,56,0,224
pxor %xmm2,%xmm4
movdqa %xmm7,%xmm2
.byte 102,15,56,0,211
pxor %xmm0,%xmm2
movdqa %xmm7,%xmm3
.byte 102,15,56,0,220
pxor %xmm1,%xmm3
movdqu (%edx),%xmm0
jnz .L003dec_loop
movdqa 96(%ebx),%xmm4
.byte 102,15,56,0,226
pxor %xmm0,%xmm4
movdqa 112(%ebx),%xmm0
movdqa (%ecx),%xmm2
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
.byte 102,15,56,0,194
ret
.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
.type _vpaes_schedule_core,@function
.align 16
_vpaes_schedule_core:
addl (%esp),%ebp
movdqu (%esi),%xmm0
movdqa 320(%ebp),%xmm2
movdqa %xmm0,%xmm3
leal (%ebp),%ebx
movdqa %xmm2,4(%esp)
call _vpaes_schedule_transform
movdqa %xmm0,%xmm7
testl %edi,%edi
jnz .L004schedule_am_decrypting
movdqu %xmm0,(%edx)
jmp .L005schedule_go
.L004schedule_am_decrypting:
movdqa 256(%ebp,%ecx,1),%xmm1
.byte 102,15,56,0,217
movdqu %xmm3,(%edx)
xorl $48,%ecx
.L005schedule_go:
cmpl $192,%eax
ja .L006schedule_256
je .L007schedule_192
.L008schedule_128:
movl $10,%eax
.L009loop_schedule_128:
call _vpaes_schedule_round
decl %eax
jz .L010schedule_mangle_last
call _vpaes_schedule_mangle
jmp .L009loop_schedule_128
.align 16
.L007schedule_192:
movdqu 8(%esi),%xmm0
call _vpaes_schedule_transform
movdqa %xmm0,%xmm6
pxor %xmm4,%xmm4
movhlps %xmm4,%xmm6
movl $4,%eax
.L011loop_schedule_192:
call _vpaes_schedule_round
.byte 102,15,58,15,198,8
call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
call _vpaes_schedule_mangle
call _vpaes_schedule_round
decl %eax
jz .L010schedule_mangle_last
call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
jmp .L011loop_schedule_192
.align 16
.L006schedule_256:
movdqu 16(%esi),%xmm0
call _vpaes_schedule_transform
movl $7,%eax
.L012loop_schedule_256:
call _vpaes_schedule_mangle
movdqa %xmm0,%xmm6
call _vpaes_schedule_round
decl %eax
jz .L010schedule_mangle_last
call _vpaes_schedule_mangle
pshufd $255,%xmm0,%xmm0
movdqa %xmm7,20(%esp)
movdqa %xmm6,%xmm7
call .L_vpaes_schedule_low_round
movdqa 20(%esp),%xmm7
jmp .L012loop_schedule_256
.align 16
.L010schedule_mangle_last:
leal 384(%ebp),%ebx
testl %edi,%edi
jnz .L013schedule_mangle_last_dec
movdqa 256(%ebp,%ecx,1),%xmm1
.byte 102,15,56,0,193
leal 352(%ebp),%ebx
addl $32,%edx
.L013schedule_mangle_last_dec:
addl $-16,%edx
pxor 336(%ebp),%xmm0
call _vpaes_schedule_transform
movdqu %xmm0,(%edx)
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
ret
.size _vpaes_schedule_core,.-_vpaes_schedule_core
.type _vpaes_schedule_192_smear,@function
.align 16
_vpaes_schedule_192_smear:
pshufd $128,%xmm6,%xmm0
pxor %xmm0,%xmm6
pshufd $254,%xmm7,%xmm0
pxor %xmm0,%xmm6
movdqa %xmm6,%xmm0
pxor %xmm1,%xmm1
movhlps %xmm1,%xmm6
ret
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
.type _vpaes_schedule_round,@function
.align 16
_vpaes_schedule_round:
movdqa 8(%esp),%xmm2
pxor %xmm1,%xmm1
.byte 102,15,58,15,202,15
.byte 102,15,58,15,210,15
pxor %xmm1,%xmm7
pshufd $255,%xmm0,%xmm0
.byte 102,15,58,15,192,1
movdqa %xmm2,8(%esp)
.L_vpaes_schedule_low_round:
movdqa %xmm7,%xmm1
pslldq $4,%xmm7
pxor %xmm1,%xmm7
movdqa %xmm7,%xmm1
pslldq $8,%xmm7
pxor %xmm1,%xmm7
pxor 336(%ebp),%xmm7
movdqa -16(%ebp),%xmm4
movdqa -48(%ebp),%xmm5
movdqa %xmm4,%xmm1
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm4,%xmm0
movdqa -32(%ebp),%xmm2
.byte 102,15,56,0,208
pxor %xmm1,%xmm0
movdqa %xmm5,%xmm3
.byte 102,15,56,0,217
pxor %xmm2,%xmm3
movdqa %xmm5,%xmm4
.byte 102,15,56,0,224
pxor %xmm2,%xmm4
movdqa %xmm5,%xmm2
.byte 102,15,56,0,211
pxor %xmm0,%xmm2
movdqa %xmm5,%xmm3
.byte 102,15,56,0,220
pxor %xmm1,%xmm3
movdqa 32(%ebp),%xmm4
.byte 102,15,56,0,226
movdqa 48(%ebp),%xmm0
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
pxor %xmm7,%xmm0
movdqa %xmm0,%xmm7
ret
.size _vpaes_schedule_round,.-_vpaes_schedule_round
.type _vpaes_schedule_transform,@function
.align 16
_vpaes_schedule_transform:
movdqa -16(%ebp),%xmm2
movdqa %xmm2,%xmm1
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm2,%xmm0
movdqa (%ebx),%xmm2
.byte 102,15,56,0,208
movdqa 16(%ebx),%xmm0
.byte 102,15,56,0,193
pxor %xmm2,%xmm0
ret
.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
.type _vpaes_schedule_mangle,@function
.align 16
_vpaes_schedule_mangle:
movdqa %xmm0,%xmm4
movdqa 128(%ebp),%xmm5
testl %edi,%edi
jnz .L014schedule_mangle_dec
addl $16,%edx
pxor 336(%ebp),%xmm4
.byte 102,15,56,0,229
movdqa %xmm4,%xmm3
.byte 102,15,56,0,229
pxor %xmm4,%xmm3
.byte 102,15,56,0,229
pxor %xmm4,%xmm3
jmp .L015schedule_mangle_both
.align 16
.L014schedule_mangle_dec:
movdqa -16(%ebp),%xmm2
leal 416(%ebp),%esi
movdqa %xmm2,%xmm1
pandn %xmm4,%xmm1
psrld $4,%xmm1
pand %xmm2,%xmm4
movdqa (%esi),%xmm2
.byte 102,15,56,0,212
movdqa 16(%esi),%xmm3
.byte 102,15,56,0,217
pxor %xmm2,%xmm3
.byte 102,15,56,0,221
movdqa 32(%esi),%xmm2
.byte 102,15,56,0,212
pxor %xmm3,%xmm2
movdqa 48(%esi),%xmm3
.byte 102,15,56,0,217
pxor %xmm2,%xmm3
.byte 102,15,56,0,221
movdqa 64(%esi),%xmm2
.byte 102,15,56,0,212
pxor %xmm3,%xmm2
movdqa 80(%esi),%xmm3
.byte 102,15,56,0,217
pxor %xmm2,%xmm3
.byte 102,15,56,0,221
movdqa 96(%esi),%xmm2
.byte 102,15,56,0,212
pxor %xmm3,%xmm2
movdqa 112(%esi),%xmm3
.byte 102,15,56,0,217
pxor %xmm2,%xmm3
addl $-16,%edx
.L015schedule_mangle_both:
movdqa 256(%ebp,%ecx,1),%xmm1
.byte 102,15,56,0,217
addl $-16,%ecx
andl $48,%ecx
movdqu %xmm3,(%edx)
ret
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
.globl vpaes_set_encrypt_key
.type vpaes_set_encrypt_key,@function
.align 16
vpaes_set_encrypt_key:
.L_vpaes_set_encrypt_key_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%esi
leal -56(%esp),%ebx
movl 24(%esp),%eax
andl $-16,%ebx
movl 28(%esp),%edx
xchgl %esp,%ebx
movl %ebx,48(%esp)
movl %eax,%ebx
shrl $5,%ebx
addl $5,%ebx
movl %ebx,240(%edx)
movl $48,%ecx
movl $0,%edi
leal .L_vpaes_consts+0x30-.L016pic_point,%ebp
call _vpaes_schedule_core
.L016pic_point:
movl 48(%esp),%esp
xorl %eax,%eax
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin
.globl vpaes_set_decrypt_key
.type vpaes_set_decrypt_key,@function
.align 16
vpaes_set_decrypt_key:
.L_vpaes_set_decrypt_key_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%esi
leal -56(%esp),%ebx
movl 24(%esp),%eax
andl $-16,%ebx
movl 28(%esp),%edx
xchgl %esp,%ebx
movl %ebx,48(%esp)
movl %eax,%ebx
shrl $5,%ebx
addl $5,%ebx
movl %ebx,240(%edx)
shll $4,%ebx
leal 16(%edx,%ebx,1),%edx
movl $1,%edi
movl %eax,%ecx
shrl $1,%ecx
andl $32,%ecx
xorl $32,%ecx
leal .L_vpaes_consts+0x30-.L017pic_point,%ebp
call _vpaes_schedule_core
.L017pic_point:
movl 48(%esp),%esp
xorl %eax,%eax
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size vpaes_set_decrypt_key,.-.L_vpaes_set_decrypt_key_begin
.globl vpaes_encrypt
.type vpaes_encrypt,@function
.align 16
vpaes_encrypt:
.L_vpaes_encrypt_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
leal .L_vpaes_consts+0x30-.L018pic_point,%ebp
call _vpaes_preheat
.L018pic_point:
movl 20(%esp),%esi
leal -56(%esp),%ebx
movl 24(%esp),%edi
andl $-16,%ebx
movl 28(%esp),%edx
xchgl %esp,%ebx
movl %ebx,48(%esp)
movdqu (%esi),%xmm0
call _vpaes_encrypt_core
movdqu %xmm0,(%edi)
movl 48(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size vpaes_encrypt,.-.L_vpaes_encrypt_begin
.globl vpaes_decrypt
.type vpaes_decrypt,@function
.align 16
vpaes_decrypt:
.L_vpaes_decrypt_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
leal .L_vpaes_consts+0x30-.L019pic_point,%ebp
call _vpaes_preheat
.L019pic_point:
movl 20(%esp),%esi
leal -56(%esp),%ebx
movl 24(%esp),%edi
andl $-16,%ebx
movl 28(%esp),%edx
xchgl %esp,%ebx
movl %ebx,48(%esp)
movdqu (%esi),%xmm0
call _vpaes_decrypt_core
movdqu %xmm0,(%edi)
movl 48(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size vpaes_decrypt,.-.L_vpaes_decrypt_begin
.globl vpaes_cbc_encrypt
.type vpaes_cbc_encrypt,@function
.align 16
vpaes_cbc_encrypt:
.L_vpaes_cbc_encrypt_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%esi
movl 24(%esp),%edi
movl 28(%esp),%eax
movl 32(%esp),%edx
subl $16,%eax
jc .L020cbc_abort
leal -56(%esp),%ebx
movl 36(%esp),%ebp
andl $-16,%ebx
movl 40(%esp),%ecx
xchgl %esp,%ebx
movdqu (%ebp),%xmm1
subl %esi,%edi
movl %ebx,48(%esp)
movl %edi,(%esp)
movl %edx,4(%esp)
movl %ebp,8(%esp)
movl %eax,%edi
leal .L_vpaes_consts+0x30-.L021pic_point,%ebp
call _vpaes_preheat
.L021pic_point:
cmpl $0,%ecx
je .L022cbc_dec_loop
jmp .L023cbc_enc_loop
.align 16
.L023cbc_enc_loop:
movdqu (%esi),%xmm0
pxor %xmm1,%xmm0
call _vpaes_encrypt_core
movl (%esp),%ebx
movl 4(%esp),%edx
movdqa %xmm0,%xmm1
movdqu %xmm0,(%ebx,%esi,1)
leal 16(%esi),%esi
subl $16,%edi
jnc .L023cbc_enc_loop
jmp .L024cbc_done
.align 16
.L022cbc_dec_loop:
movdqu (%esi),%xmm0
movdqa %xmm1,16(%esp)
movdqa %xmm0,32(%esp)
call _vpaes_decrypt_core
movl (%esp),%ebx
movl 4(%esp),%edx
pxor 16(%esp),%xmm0
movdqa 32(%esp),%xmm1
movdqu %xmm0,(%ebx,%esi,1)
leal 16(%esi),%esi
subl $16,%edi
jnc .L022cbc_dec_loop
.L024cbc_done:
movl 8(%esp),%ebx
movl 48(%esp),%esp
movdqu %xmm1,(%ebx)
.L020cbc_abort:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin

View File

@@ -0,0 +1,903 @@
#!/usr/bin/env perl
######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumption
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
# large-block CBC] encrypt/decrypt.
#
# aes-586.pl vpaes-x86.pl
#
# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
# Nehalem 27.9/40.4/18.1 10.3/12.0
# Atom 102./119./60.1 64.5/85.3(***)
#
# (*) "Hyper-threading" in the context refers rather to cache shared
# among multiple cores, than to specifically Intel HTT. As vast
# majority of contemporary cores share cache, slower code path
# is common place. In other words "with-hyper-threading-off"
# results are presented mostly for reference purposes.
#
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
# pshufb, yet it's respectable +32%/65% improvement on Core 2
# and +58%/40% on Atom (as implied, over "hyper-threading-safe"
# code path).
#
# <appro@openssl.org>
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
$PREFIX="vpaes";
my ($round, $base, $magic, $key, $const, $inp, $out)=
("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");
&set_label("_vpaes_consts",64);
$k_inv=-0x30; # inv, inva
&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
$k_s0F=-0x10; # s0F
&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
$k_ipt=0x00; # input transform (lo, hi)
&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
$k_sb1=0x20; # sb1u, sb1t
&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40; # sb2u, sb2t
&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60; # sbou, sbot
&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
$k_mc_forward=0x80; # mc_forward
&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
$k_mc_backward=0xc0; # mc_backward
&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
$k_sr=0x100; # sr
&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
$k_rcon=0x140; # rcon
&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
$k_s63=0x150; # s63: all equal to 0x63 transformed
&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
$k_opt=0x160; # output transform
&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
##
## Decryption stuff
## Key schedule constants
##
$k_dksd=0x1a0; # decryption key schedule: invskew x*D
&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0; # decryption key schedule: invskew x*B
&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200; # decryption key schedule: invskew x*9
&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
##
## Decryption stuff
## Round function constants
##
$k_dipt=0x220; # decryption input transform
&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0; # decryption sbox final output
&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align (64);
&function_begin_B("_vpaes_preheat");
&add ($const,&DWP(0,"esp"));
&movdqa ("xmm7",&QWP($k_inv,$const));
&movdqa ("xmm6",&QWP($k_s0F,$const));
&ret ();
&function_end_B("_vpaes_preheat");
##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
## %xmm0 = input
## %xmm6-%xmm7 as in _vpaes_preheat
## (%edx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##
&function_begin_B("_vpaes_encrypt_core");
&mov ($magic,16);
&mov ($round,&DWP(240,$key));
&movdqa ("xmm1","xmm6")
&movdqa ("xmm2",&QWP($k_ipt,$const));
&pandn ("xmm1","xmm0");
&movdqu ("xmm5",&QWP(0,$key));
&psrld ("xmm1",4);
&pand ("xmm0","xmm6");
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP($k_ipt+16,$const));
&pshufb ("xmm0","xmm1");
&pxor ("xmm2","xmm5");
&pxor ("xmm0","xmm2");
&add ($key,16);
&lea ($base,&DWP($k_mc_backward,$const));
&jmp (&label("enc_entry"));
&set_label("enc_loop",16);
# middle of middle round
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
&pshufb ("xmm4","xmm2"); # 4 = sb1u
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm0","xmm4"); # 0 = A
&movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
&pshufb ("xmm5","xmm2"); # 4 = sb2u
&movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
&movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
&pshufb ("xmm2","xmm3"); # 2 = sb2t
&pxor ("xmm2","xmm5"); # 2 = 2A
&movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
&movdqa ("xmm3","xmm0"); # 3 = A
&pshufb ("xmm0","xmm1"); # 0 = B
&add ($key,16); # next key
&pxor ("xmm0","xmm2"); # 0 = 2A+B
&pshufb ("xmm3","xmm4"); # 3 = D
&add ($magic,16); # next mc
&pxor ("xmm3","xmm0"); # 3 = 2A+B+D
&pshufb ("xmm0","xmm1"); # 0 = 2B+C
&and ($magic,0x30); # ... mod 4
&pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
&sub ($round,1); # nr--
&set_label("enc_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
&pandn ("xmm1","xmm0"); # 1 = i<<4
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm6"); # 0 = k
&movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
&pshufb ("xmm5","xmm0"); # 2 = a/k
&pxor ("xmm0","xmm1"); # 0 = j
&movdqa ("xmm3","xmm7"); # 3 : 1/i
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
&movdqa ("xmm4","xmm7"); # 4 : 1/j
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&pxor ("xmm2","xmm0"); # 2 = io
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
&movdqu ("xmm5",&QWP(0,$key));
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&pxor ("xmm3","xmm1"); # 3 = jo
&jnz (&label("enc_loop"));
# middle of last round
&movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
&movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
&pshufb ("xmm4","xmm2"); # 4 = sbou
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
&pxor ("xmm0","xmm4"); # 0 = A
&pshufb ("xmm0","xmm1");
&ret ();
&function_end_B("_vpaes_encrypt_core");
##
## Decryption core
##
## Same API as encryption core.
##
&function_begin_B("_vpaes_decrypt_core");
&mov ($round,&DWP(240,$key));
&lea ($base,&DWP($k_dsbd,$const));
&movdqa ("xmm1","xmm6");
&movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
&pandn ("xmm1","xmm0");
&mov ($magic,$round);
&psrld ("xmm1",4)
&movdqu ("xmm5",&QWP(0,$key));
&shl ($magic,4);
&pand ("xmm0","xmm6");
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
&xor ($magic,0x30);
&pshufb ("xmm0","xmm1");
&and ($magic,0x30);
&pxor ("xmm2","xmm5");
&movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
&pxor ("xmm0","xmm2");
&add ($key,16);
&lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
&jmp (&label("dec_entry"));
&set_label("dec_loop",16);
##
## Inverse mix columns
##
&movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
&pshufb ("xmm4","xmm2"); # 4 = sb9u
&pxor ("xmm4","xmm0");
&movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t
&pshufb ("xmm0","xmm3"); # 0 = sb9t
&pxor ("xmm0","xmm4"); # 0 = ch
&add ($key,16); # next round key
&pshufb ("xmm0","xmm5"); # MC ch
&movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
&pshufb ("xmm4","xmm2"); # 4 = sbdu
&pxor ("xmm4","xmm0"); # 4 = ch
&movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt
&pshufb ("xmm0","xmm3"); # 0 = sbdt
&pxor ("xmm0","xmm4"); # 0 = ch
&sub ($round,1); # nr--
&pshufb ("xmm0","xmm5"); # MC ch
&movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
&pshufb ("xmm4","xmm2"); # 4 = sbbu
&pxor ("xmm4","xmm0"); # 4 = ch
&movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt
&pshufb ("xmm0","xmm3"); # 0 = sbbt
&pxor ("xmm0","xmm4"); # 0 = ch
&pshufb ("xmm0","xmm5"); # MC ch
&movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
&pshufb ("xmm4","xmm2"); # 4 = sbeu
&pxor ("xmm4","xmm0"); # 4 = ch
&movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
&pshufb ("xmm0","xmm3"); # 0 = sbet
&pxor ("xmm0","xmm4"); # 0 = ch
&palignr("xmm5","xmm5",12);
&set_label("dec_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
&pandn ("xmm1","xmm0"); # 1 = i<<4
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm6"); # 0 = k
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
&pshufb ("xmm2","xmm0"); # 2 = a/k
&pxor ("xmm0","xmm1"); # 0 = j
&movdqa ("xmm3","xmm7"); # 3 : 1/i
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
&movdqa ("xmm4","xmm7"); # 4 : 1/j
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&pxor ("xmm2","xmm0"); # 2 = io
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&pxor ("xmm3","xmm1"); # 3 = jo
&movdqu ("xmm0",&QWP(0,$key));
&jnz (&label("dec_loop"));
# middle of last round
&movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
&pshufb ("xmm4","xmm2"); # 4 = sbou
&pxor ("xmm4","xmm0"); # 4 = sb1u + k
&movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
&movdqa ("xmm2",&QWP(0,$magic));
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm0","xmm4"); # 0 = A
&pshufb ("xmm0","xmm2");
&ret ();
&function_end_B("_vpaes_decrypt_core");
########################################################
## ##
## AES key schedule ##
## ##
########################################################
&function_begin_B("_vpaes_schedule_core");
&add ($const,&DWP(0,"esp"));
&movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
&movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
# input transform
&movdqa ("xmm3","xmm0");
&lea ($base,&DWP($k_ipt,$const));
&movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
&call ("_vpaes_schedule_transform");
&movdqa ("xmm7","xmm0");
&test ($out,$out);
&jnz (&label("schedule_am_decrypting"));
# encrypting, output zeroth round key after transform
&movdqu (&QWP(0,$key),"xmm0");
&jmp (&label("schedule_go"));
&set_label("schedule_am_decrypting");
# decrypting, output zeroth round key after shiftrows
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&movdqu (&QWP(0,$key),"xmm3");
&xor ($magic,0x30);
&set_label("schedule_go");
&cmp ($round,192);
&ja (&label("schedule_256"));
&je (&label("schedule_192"));
# 128: fall though
##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
&set_label("schedule_128");
&mov ($round,10);
&set_label("loop_schedule_128");
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle"); # write output
&jmp (&label("loop_schedule_128"));
##
## .aes_schedule_192
##
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
##
&set_label("schedule_192",16);
&movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&movdqa ("xmm6","xmm0"); # save short part
&pxor ("xmm4","xmm4"); # clear 4
&movhlps("xmm6","xmm4"); # clobber low side with zeros
&mov ($round,4);
&set_label("loop_schedule_192");
&call ("_vpaes_schedule_round");
&palignr("xmm0","xmm6",8);
&call ("_vpaes_schedule_mangle"); # save key n
&call ("_vpaes_schedule_192_smear");
&call ("_vpaes_schedule_mangle"); # save key n+1
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle"); # save key n+2
&call ("_vpaes_schedule_192_smear");
&jmp (&label("loop_schedule_192"));
##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&mov ($round,7);
&set_label("loop_schedule_256");
&call ("_vpaes_schedule_mangle"); # output low result
&movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
# high round
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle");
# low round. swap xmm7 and xmm6
&pshufd ("xmm0","xmm0",0xFF);
&movdqa (&QWP(20,"esp"),"xmm7");
&movdqa ("xmm7","xmm6");
&call ("_vpaes_schedule_low_round");
&movdqa ("xmm7",&QWP(20,"esp"));
&jmp (&label("loop_schedule_256"));
##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
# schedule last round key from xmm0
&lea ($base,&DWP($k_deskew,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_last_dec"));
# encrypting
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm0","xmm1"); # output permute
&lea ($base,&DWP($k_opt,$const)); # prepare to output transform
&add ($key,32);
&set_label("schedule_mangle_last_dec");
&add ($key,-16);
&pxor ("xmm0",&QWP($k_s63,$const));
&call ("_vpaes_schedule_transform"); # output transform
&movdqu (&QWP(0,$key),"xmm0"); # save last key
# cleanup
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&pxor ("xmm6","xmm6");
&pxor ("xmm7","xmm7");
&ret ();
&function_end_B("_vpaes_schedule_core");
##
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
## %xmm7: high side, b a x y
## %xmm6: low side, d c 0 0
## %xmm13: 0
##
## Outputs:
## %xmm6: b+c+d b+c 0 0
## %xmm0: b+c+d b+c b a
##
&function_begin_B("_vpaes_schedule_192_smear");
&pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0
&pxor ("xmm6","xmm0"); # -> c+d c 0 0
&pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
&pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
&movdqa ("xmm0","xmm6");
&pxor ("xmm1","xmm1");
&movhlps("xmm6","xmm1"); # clobber low side with zeros
&ret ();
&function_end_B("_vpaes_schedule_192_smear");
##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm5.
##
&function_begin_B("_vpaes_schedule_round");
# extract rcon from xmm8
&movdqa ("xmm2",&QWP(8,"esp")); # xmm8
&pxor ("xmm1","xmm1");
&palignr("xmm1","xmm2",15);
&palignr("xmm2","xmm2",15);
&pxor ("xmm7","xmm1");
# rotate
&pshufd ("xmm0","xmm0",0xFF);
&palignr("xmm0","xmm0",1);
# fall through...
&movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
# low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
# smear xmm7
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",4);
&pxor ("xmm7","xmm1");
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",8);
&pxor ("xmm7","xmm1");
&pxor ("xmm7",&QWP($k_s63,$const));
# subbyte
&movdqa ("xmm4",&QWP($k_s0F,$const));
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
&movdqa ("xmm1","xmm4");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm4"); # 0 = k
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
&pshufb ("xmm2","xmm0"); # 2 = a/k
&pxor ("xmm0","xmm1"); # 0 = j
&movdqa ("xmm3","xmm5"); # 3 : 1/i
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
&movdqa ("xmm4","xmm5"); # 4 : 1/j
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm5"); # 2 : 1/iak
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&pxor ("xmm2","xmm0"); # 2 = io
&movdqa ("xmm3","xmm5"); # 3 : 1/jak
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&pxor ("xmm3","xmm1"); # 3 = jo
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
&pshufb ("xmm4","xmm2"); # 4 = sbou
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm0","xmm4"); # 0 = sbox output
# add in smeared stuff
&pxor ("xmm0","xmm7");
&movdqa ("xmm7","xmm0");
&ret ();
&function_end_B("_vpaes_schedule_round");
##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%ebx)
##
## Output in %xmm0
## Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
&movdqa ("xmm2",&QWP($k_s0F,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4);
&pand ("xmm0","xmm2");
&movdqa ("xmm2",&QWP(0,$base));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP(16,$base));
&pshufb ("xmm0","xmm1");
&pxor ("xmm0","xmm2");
&ret ();
&function_end_B("_vpaes_schedule_transform");
##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
## xor with 0x63
## multiply by circulant 0,1,1,1
## apply shiftrows transform
##
## On decrypt,
## xor with 0x63
## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew
## apply shiftrows transform
##
##
## Writes out to (%edx), and increments or decrements it
## Keeps track of round number mod 4 in %ecx
## Preserves xmm0
## Clobbers xmm1-xmm5
##
&function_begin_B("_vpaes_schedule_mangle");
&movdqa ("xmm4","xmm0"); # save xmm0 for later
&movdqa ("xmm5",&QWP($k_mc_forward,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_dec"));
# encrypting
&add ($key,16);
&pxor ("xmm4",&QWP($k_s63,$const));
&pshufb ("xmm4","xmm5");
&movdqa ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&jmp (&label("schedule_mangle_both"));
&set_label("schedule_mangle_dec",16);
# inverse mix columns
&movdqa ("xmm2",&QWP($k_s0F,$const));
&lea ($inp,&DWP($k_dksd,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm4");
&psrld ("xmm1",4); # 1 = hi
&pand ("xmm4","xmm2"); # 4 = lo
&movdqa ("xmm2",&QWP(0,$inp));
&pshufb ("xmm2","xmm4");
&movdqa ("xmm3",&QWP(0x10,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x20,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x30,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x40,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x50,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x60,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x70,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&add ($key,-16);
&set_label("schedule_mangle_both");
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&add ($magic,-16);
&and ($magic,0x30);
&movdqu (&QWP(0,$key),"xmm3");
&ret ();
&function_end_B("_vpaes_schedule_mangle");
#
# Interface to OpenSSL
#
&function_begin("${PREFIX}_set_encrypt_key");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($round,&wparam(1)); # bits
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&mov ($base,$round);
&shr ($base,5);
&add ($base,5);
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
&mov ($magic,0x30);
&mov ($out,0);
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_schedule_core");
&set_label("pic_point");
&mov ("esp",&DWP(48,"esp"));
&xor ("eax","eax");
&function_end("${PREFIX}_set_encrypt_key");
&function_begin("${PREFIX}_set_decrypt_key");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($round,&wparam(1)); # bits
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&mov ($base,$round);
&shr ($base,5);
&add ($base,5);
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
&shl ($base,4);
&lea ($key,&DWP(16,$key,$base));
&mov ($out,1);
&mov ($magic,$round);
&shr ($magic,1);
&and ($magic,32);
&xor ($magic,32); # nbist==192?0:32;
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_schedule_core");
&set_label("pic_point");
&mov ("esp",&DWP(48,"esp"));
&xor ("eax","eax");
&function_end("${PREFIX}_set_decrypt_key");
&function_begin("${PREFIX}_encrypt");
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($out,&wparam(1)); # out
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&movdqu ("xmm0",&QWP(0,$inp));
&call ("_vpaes_encrypt_core");
&movdqu (&QWP(0,$out),"xmm0");
&mov ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_encrypt");
&function_begin("${PREFIX}_decrypt");
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($out,&wparam(1)); # out
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&movdqu ("xmm0",&QWP(0,$inp));
&call ("_vpaes_decrypt_core");
&movdqu (&QWP(0,$out),"xmm0");
&mov ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_decrypt");
&function_begin("${PREFIX}_cbc_encrypt");
&mov ($inp,&wparam(0)); # inp
&mov ($out,&wparam(1)); # out
&mov ($round,&wparam(2)); # len
&mov ($key,&wparam(3)); # key
&sub ($round,16);
&jc (&label("cbc_abort"));
&lea ($base,&DWP(-56,"esp"));
&mov ($const,&wparam(4)); # ivp
&and ($base,-16);
&mov ($magic,&wparam(5)); # enc
&xchg ($base,"esp"); # alloca
&movdqu ("xmm1",&QWP(0,$const)); # load IV
&sub ($out,$inp);
&mov (&DWP(48,"esp"),$base);
&mov (&DWP(0,"esp"),$out); # save out
&mov (&DWP(4,"esp"),$key) # save key
&mov (&DWP(8,"esp"),$const); # save ivp
&mov ($out,$round); # $out works as $len
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&cmp ($magic,0);
&je (&label("cbc_dec_loop"));
&jmp (&label("cbc_enc_loop"));
&set_label("cbc_enc_loop",16);
&movdqu ("xmm0",&QWP(0,$inp)); # load input
&pxor ("xmm0","xmm1"); # inp^=iv
&call ("_vpaes_encrypt_core");
&mov ($base,&DWP(0,"esp")); # restore out
&mov ($key,&DWP(4,"esp")); # restore key
&movdqa ("xmm1","xmm0");
&movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
&lea ($inp,&DWP(16,$inp));
&sub ($out,16);
&jnc (&label("cbc_enc_loop"));
&jmp (&label("cbc_done"));
&set_label("cbc_dec_loop",16);
&movdqu ("xmm0",&QWP(0,$inp)); # load input
&movdqa (&QWP(16,"esp"),"xmm1"); # save IV
&movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
&call ("_vpaes_decrypt_core");
&mov ($base,&DWP(0,"esp")); # restore out
&mov ($key,&DWP(4,"esp")); # restore key
&pxor ("xmm0",&QWP(16,"esp")); # out^=iv
&movdqa ("xmm1",&QWP(32,"esp")); # load next IV
&movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
&lea ($inp,&DWP(16,$inp));
&sub ($out,16);
&jnc (&label("cbc_dec_loop"));
&set_label("cbc_done");
&mov ($base,&DWP(8,"esp")); # restore ivp
&mov ("esp",&DWP(48,"esp"));
&movdqu (&QWP(0,$base),"xmm1"); # write IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
&asm_finish();

View File

@@ -0,0 +1,828 @@
.text
.type _vpaes_encrypt_core,@function
.align 16
_vpaes_encrypt_core:
movq %rdx,%r9
movq $16,%r11
movl 240(%rdx),%eax
movdqa %xmm9,%xmm1
movdqa .Lk_ipt(%rip),%xmm2
pandn %xmm0,%xmm1
movdqu (%r9),%xmm5
psrld $4,%xmm1
pand %xmm9,%xmm0
.byte 102,15,56,0,208
movdqa .Lk_ipt+16(%rip),%xmm0
.byte 102,15,56,0,193
pxor %xmm5,%xmm2
pxor %xmm2,%xmm0
addq $16,%r9
leaq .Lk_mc_backward(%rip),%r10
jmp .Lenc_entry
.align 16
.Lenc_loop:
movdqa %xmm13,%xmm4
.byte 102,15,56,0,226
pxor %xmm5,%xmm4
movdqa %xmm12,%xmm0
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
movdqa %xmm15,%xmm5
.byte 102,15,56,0,234
movdqa -64(%r11,%r10,1),%xmm1
movdqa %xmm14,%xmm2
.byte 102,15,56,0,211
pxor %xmm5,%xmm2
movdqa (%r11,%r10,1),%xmm4
movdqa %xmm0,%xmm3
.byte 102,15,56,0,193
addq $16,%r9
pxor %xmm2,%xmm0
.byte 102,15,56,0,220
addq $16,%r11
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
andq $48,%r11
pxor %xmm3,%xmm0
subq $1,%rax
.Lenc_entry:
movdqa %xmm9,%xmm1
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm9,%xmm0
movdqa %xmm11,%xmm5
.byte 102,15,56,0,232
pxor %xmm1,%xmm0
movdqa %xmm10,%xmm3
.byte 102,15,56,0,217
pxor %xmm5,%xmm3
movdqa %xmm10,%xmm4
.byte 102,15,56,0,224
pxor %xmm5,%xmm4
movdqa %xmm10,%xmm2
.byte 102,15,56,0,211
pxor %xmm0,%xmm2
movdqa %xmm10,%xmm3
movdqu (%r9),%xmm5
.byte 102,15,56,0,220
pxor %xmm1,%xmm3
jnz .Lenc_loop
movdqa -96(%r10),%xmm4
movdqa -80(%r10),%xmm0
.byte 102,15,56,0,226
pxor %xmm5,%xmm4
.byte 102,15,56,0,195
movdqa 64(%r11,%r10,1),%xmm1
pxor %xmm4,%xmm0
.byte 102,15,56,0,193
.byte 0xf3,0xc3
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
.type _vpaes_decrypt_core,@function
.align 16
_vpaes_decrypt_core:
movq %rdx,%r9
movl 240(%rdx),%eax
movdqa %xmm9,%xmm1
movdqa .Lk_dipt(%rip),%xmm2
pandn %xmm0,%xmm1
movq %rax,%r11
psrld $4,%xmm1
movdqu (%r9),%xmm5
shlq $4,%r11
pand %xmm9,%xmm0
.byte 102,15,56,0,208
movdqa .Lk_dipt+16(%rip),%xmm0
xorq $48,%r11
leaq .Lk_dsbd(%rip),%r10
.byte 102,15,56,0,193
andq $48,%r11
pxor %xmm5,%xmm2
movdqa .Lk_mc_forward+48(%rip),%xmm5
pxor %xmm2,%xmm0
addq $16,%r9
addq %r10,%r11
jmp .Ldec_entry
.align 16
.Ldec_loop:
movdqa -32(%r10),%xmm4
.byte 102,15,56,0,226
pxor %xmm0,%xmm4
movdqa -16(%r10),%xmm0
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
addq $16,%r9
.byte 102,15,56,0,197
movdqa 0(%r10),%xmm4
.byte 102,15,56,0,226
pxor %xmm0,%xmm4
movdqa 16(%r10),%xmm0
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
subq $1,%rax
.byte 102,15,56,0,197
movdqa 32(%r10),%xmm4
.byte 102,15,56,0,226
pxor %xmm0,%xmm4
movdqa 48(%r10),%xmm0
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
.byte 102,15,56,0,197
movdqa 64(%r10),%xmm4
.byte 102,15,56,0,226
pxor %xmm0,%xmm4
movdqa 80(%r10),%xmm0
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
.byte 102,15,58,15,237,12
.Ldec_entry:
movdqa %xmm9,%xmm1
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm9,%xmm0
movdqa %xmm11,%xmm2
.byte 102,15,56,0,208
pxor %xmm1,%xmm0
movdqa %xmm10,%xmm3
.byte 102,15,56,0,217
pxor %xmm2,%xmm3
movdqa %xmm10,%xmm4
.byte 102,15,56,0,224
pxor %xmm2,%xmm4
movdqa %xmm10,%xmm2
.byte 102,15,56,0,211
pxor %xmm0,%xmm2
movdqa %xmm10,%xmm3
.byte 102,15,56,0,220
pxor %xmm1,%xmm3
movdqu (%r9),%xmm0
jnz .Ldec_loop
movdqa 96(%r10),%xmm4
.byte 102,15,56,0,226
pxor %xmm0,%xmm4
movdqa 112(%r10),%xmm0
movdqa -352(%r11),%xmm2
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
.byte 102,15,56,0,194
.byte 0xf3,0xc3
.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
.type _vpaes_schedule_core,@function
.align 16
_vpaes_schedule_core:
call _vpaes_preheat
movdqa .Lk_rcon(%rip),%xmm8
movdqu (%rdi),%xmm0
movdqa %xmm0,%xmm3
leaq .Lk_ipt(%rip),%r11
call _vpaes_schedule_transform
movdqa %xmm0,%xmm7
leaq .Lk_sr(%rip),%r10
testq %rcx,%rcx
jnz .Lschedule_am_decrypting
movdqu %xmm0,(%rdx)
jmp .Lschedule_go
.Lschedule_am_decrypting:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
movdqu %xmm3,(%rdx)
xorq $48,%r8
.Lschedule_go:
cmpl $192,%esi
ja .Lschedule_256
je .Lschedule_192
.Lschedule_128:
movl $10,%esi
.Loop_schedule_128:
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
call _vpaes_schedule_mangle
jmp .Loop_schedule_128
.align 16
.Lschedule_192:
movdqu 8(%rdi),%xmm0
call _vpaes_schedule_transform
movdqa %xmm0,%xmm6
pxor %xmm4,%xmm4
movhlps %xmm4,%xmm6
movl $4,%esi
.Loop_schedule_192:
call _vpaes_schedule_round
.byte 102,15,58,15,198,8
call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
call _vpaes_schedule_mangle
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
call _vpaes_schedule_mangle
call _vpaes_schedule_192_smear
jmp .Loop_schedule_192
.align 16
.Lschedule_256:
movdqu 16(%rdi),%xmm0
call _vpaes_schedule_transform
movl $7,%esi
.Loop_schedule_256:
call _vpaes_schedule_mangle
movdqa %xmm0,%xmm6
call _vpaes_schedule_round
decq %rsi
jz .Lschedule_mangle_last
call _vpaes_schedule_mangle
pshufd $255,%xmm0,%xmm0
movdqa %xmm7,%xmm5
movdqa %xmm6,%xmm7
call _vpaes_schedule_low_round
movdqa %xmm5,%xmm7
jmp .Loop_schedule_256
.align 16
.Lschedule_mangle_last:
leaq .Lk_deskew(%rip),%r11
testq %rcx,%rcx
jnz .Lschedule_mangle_last_dec
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,193
leaq .Lk_opt(%rip),%r11
addq $32,%rdx
.Lschedule_mangle_last_dec:
addq $-16,%rdx
pxor .Lk_s63(%rip),%xmm0
call _vpaes_schedule_transform
movdqu %xmm0,(%rdx)
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
.byte 0xf3,0xc3
.size _vpaes_schedule_core,.-_vpaes_schedule_core
.type _vpaes_schedule_192_smear,@function
.align 16
_vpaes_schedule_192_smear:
pshufd $128,%xmm6,%xmm0
pxor %xmm0,%xmm6
pshufd $254,%xmm7,%xmm0
pxor %xmm0,%xmm6
movdqa %xmm6,%xmm0
pxor %xmm1,%xmm1
movhlps %xmm1,%xmm6
.byte 0xf3,0xc3
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
.type _vpaes_schedule_round,@function
.align 16
_vpaes_schedule_round:
pxor %xmm1,%xmm1
.byte 102,65,15,58,15,200,15
.byte 102,69,15,58,15,192,15
pxor %xmm1,%xmm7
pshufd $255,%xmm0,%xmm0
.byte 102,15,58,15,192,1
_vpaes_schedule_low_round:
movdqa %xmm7,%xmm1
pslldq $4,%xmm7
pxor %xmm1,%xmm7
movdqa %xmm7,%xmm1
pslldq $8,%xmm7
pxor %xmm1,%xmm7
pxor .Lk_s63(%rip),%xmm7
movdqa %xmm9,%xmm1
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm9,%xmm0
movdqa %xmm11,%xmm2
.byte 102,15,56,0,208
pxor %xmm1,%xmm0
movdqa %xmm10,%xmm3
.byte 102,15,56,0,217
pxor %xmm2,%xmm3
movdqa %xmm10,%xmm4
.byte 102,15,56,0,224
pxor %xmm2,%xmm4
movdqa %xmm10,%xmm2
.byte 102,15,56,0,211
pxor %xmm0,%xmm2
movdqa %xmm10,%xmm3
.byte 102,15,56,0,220
pxor %xmm1,%xmm3
movdqa %xmm13,%xmm4
.byte 102,15,56,0,226
movdqa %xmm12,%xmm0
.byte 102,15,56,0,195
pxor %xmm4,%xmm0
pxor %xmm7,%xmm0
movdqa %xmm0,%xmm7
.byte 0xf3,0xc3
.size _vpaes_schedule_round,.-_vpaes_schedule_round
.type _vpaes_schedule_transform,@function
.align 16
_vpaes_schedule_transform:
movdqa %xmm9,%xmm1
pandn %xmm0,%xmm1
psrld $4,%xmm1
pand %xmm9,%xmm0
movdqa (%r11),%xmm2
.byte 102,15,56,0,208
movdqa 16(%r11),%xmm0
.byte 102,15,56,0,193
pxor %xmm2,%xmm0
.byte 0xf3,0xc3
.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
.type _vpaes_schedule_mangle,@function
.align 16
_vpaes_schedule_mangle:
movdqa %xmm0,%xmm4
movdqa .Lk_mc_forward(%rip),%xmm5
testq %rcx,%rcx
jnz .Lschedule_mangle_dec
addq $16,%rdx
pxor .Lk_s63(%rip),%xmm4
.byte 102,15,56,0,229
movdqa %xmm4,%xmm3
.byte 102,15,56,0,229
pxor %xmm4,%xmm3
.byte 102,15,56,0,229
pxor %xmm4,%xmm3
jmp .Lschedule_mangle_both
.align 16
.Lschedule_mangle_dec:
leaq .Lk_dksd(%rip),%r11
movdqa %xmm9,%xmm1
pandn %xmm4,%xmm1
psrld $4,%xmm1
pand %xmm9,%xmm4
movdqa 0(%r11),%xmm2
.byte 102,15,56,0,212
movdqa 16(%r11),%xmm3
.byte 102,15,56,0,217
pxor %xmm2,%xmm3
.byte 102,15,56,0,221
movdqa 32(%r11),%xmm2
.byte 102,15,56,0,212
pxor %xmm3,%xmm2
movdqa 48(%r11),%xmm3
.byte 102,15,56,0,217
pxor %xmm2,%xmm3
.byte 102,15,56,0,221
movdqa 64(%r11),%xmm2
.byte 102,15,56,0,212
pxor %xmm3,%xmm2
movdqa 80(%r11),%xmm3
.byte 102,15,56,0,217
pxor %xmm2,%xmm3
.byte 102,15,56,0,221
movdqa 96(%r11),%xmm2
.byte 102,15,56,0,212
pxor %xmm3,%xmm2
movdqa 112(%r11),%xmm3
.byte 102,15,56,0,217
pxor %xmm2,%xmm3
addq $-16,%rdx
.Lschedule_mangle_both:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
addq $-16,%r8
andq $48,%r8
movdqu %xmm3,(%rdx)
.byte 0xf3,0xc3
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
.globl vpaes_set_encrypt_key
.type vpaes_set_encrypt_key,@function
.align 16
vpaes_set_encrypt_key:
movl %esi,%eax
shrl $5,%eax
addl $5,%eax
movl %eax,240(%rdx)
movl $0,%ecx
movl $48,%r8d
call _vpaes_schedule_core
xorl %eax,%eax
.byte 0xf3,0xc3
.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
.globl vpaes_set_decrypt_key
.type vpaes_set_decrypt_key,@function
.align 16
vpaes_set_decrypt_key:
movl %esi,%eax
shrl $5,%eax
addl $5,%eax
movl %eax,240(%rdx)
shll $4,%eax
leaq 16(%rdx,%rax,1),%rdx
movl $1,%ecx
movl %esi,%r8d
shrl $1,%r8d
andl $32,%r8d
xorl $32,%r8d
call _vpaes_schedule_core
xorl %eax,%eax
.byte 0xf3,0xc3
.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
.globl vpaes_encrypt
.type vpaes_encrypt,@function
.align 16
vpaes_encrypt:
movdqu (%rdi),%xmm0
call _vpaes_preheat
call _vpaes_encrypt_core
movdqu %xmm0,(%rsi)
.byte 0xf3,0xc3
.size vpaes_encrypt,.-vpaes_encrypt
.globl vpaes_decrypt
.type vpaes_decrypt,@function
.align 16
vpaes_decrypt:
movdqu (%rdi),%xmm0
call _vpaes_preheat
call _vpaes_decrypt_core
movdqu %xmm0,(%rsi)
.byte 0xf3,0xc3
.size vpaes_decrypt,.-vpaes_decrypt
.globl vpaes_cbc_encrypt
.type vpaes_cbc_encrypt,@function
.align 16
vpaes_cbc_encrypt:
xchgq %rcx,%rdx
subq $16,%rcx
jc .Lcbc_abort
movdqu (%r8),%xmm6
subq %rdi,%rsi
call _vpaes_preheat
cmpl $0,%r9d
je .Lcbc_dec_loop
jmp .Lcbc_enc_loop
.align 16
.Lcbc_enc_loop:
movdqu (%rdi),%xmm0
pxor %xmm6,%xmm0
call _vpaes_encrypt_core
movdqa %xmm0,%xmm6
movdqu %xmm0,(%rsi,%rdi,1)
leaq 16(%rdi),%rdi
subq $16,%rcx
jnc .Lcbc_enc_loop
jmp .Lcbc_done
.align 16
.Lcbc_dec_loop:
movdqu (%rdi),%xmm0
movdqa %xmm0,%xmm7
call _vpaes_decrypt_core
pxor %xmm6,%xmm0
movdqa %xmm7,%xmm6
movdqu %xmm0,(%rsi,%rdi,1)
leaq 16(%rdi),%rdi
subq $16,%rcx
jnc .Lcbc_dec_loop
.Lcbc_done:
movdqu %xmm6,(%r8)
.Lcbc_abort:
.byte 0xf3,0xc3
.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
.type _vpaes_preheat,@function
.align 16
_vpaes_preheat:
leaq .Lk_s0F(%rip),%r10
movdqa -32(%r10),%xmm10
movdqa -16(%r10),%xmm11
movdqa 0(%r10),%xmm9
movdqa 48(%r10),%xmm13
movdqa 64(%r10),%xmm12
movdqa 80(%r10),%xmm15
movdqa 96(%r10),%xmm14
.byte 0xf3,0xc3
.size _vpaes_preheat,.-_vpaes_preheat
.type _vpaes_consts,@object
.align 64
_vpaes_consts:
.Lk_inv:
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_s0F:
.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
.Lk_ipt:
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sb1:
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.Lk_sb2:
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.Lk_sbo:
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_mc_forward:
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
.Lk_rcon:
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
.Lk_s63:
.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
.Lk_opt:
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
.Lk_dksd:
.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:
.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:
.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:
.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
.Lk_dipt:
.quad 0x0F505B040B545F00, 0x154A411E114E451A
.quad 0x86E383E660056500, 0x12771772F491F194
.Lk_dsb9:
.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:
.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:
.quad 0xD022649296B44200, 0x602646F6B0F2D404
.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:
.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
.Lk_dsbo:
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align 64
.size _vpaes_consts,.-_vpaes_consts

File diff suppressed because it is too large Load Diff