Attachment 13524 Details for Bug 13008 – Original patch from Justin.

[patch] Original patch from Justin.

0001-lib-crypto-Add-support-for-AES-NI-acceleration.patch (text/plain), 94.74 KB, created by Jeremy Allison on 2017-08-31 18:29:05 UTC

(hide)

Description:

Filename:

MIME Type:

Creator: Jeremy Allison

Created: 2017-08-31 18:29:05 UTC

Size: 94.74 KB

patch

obsolete

>From dff7d340df41ba61cd183e7465cab34449604f74 Mon Sep 17 00:00:00 2001
>From: Justin Maggard <jmaggard@netgear.com>
>Date: Fri, 17 Mar 2017 14:41:40 -0700
>Subject: [PATCH] lib/crypto: Add support for AES-NI acceleration
>
>This commit takes the Linux kernel AES-NI code, and puts it into a
>third_party private library. Then the crypto API is modified slightly to
>allow this all to work together.
>
>This can result in massive speed improvements (up to 200% on some
>platforms), by using Intel AES-NI instructions.
>
>Signed-off-by: Justin Maggard <jmaggard@netgear.com>
>---
> lib/crypto/aes.h                          |   16 +-
> lib/crypto/aes_acc.c                      |  113 ++
> lib/crypto/aes_ccm_128.c                  |    1 +
> lib/crypto/aes_cmac_128.c                 |    1 +
> lib/crypto/aes_gcm_128.c                  |    1 +
> lib/crypto/crypto_aes.h                   |   45 +
> lib/crypto/wscript_build                  |    4 +
> source4/heimdal/lib/hcrypto/aes.h         |    2 +
> third_party/aesni-intel/aesni-intel_asm.c | 2793 +++++++++++++++++++++++++++++
> third_party/aesni-intel/inst-intel.h      |  306 ++++
> third_party/aesni-intel/wscript           |   12 +
> third_party/wscript                       |    2 +
> 12 files changed, 3294 insertions(+), 2 deletions(-)
> create mode 100644 lib/crypto/aes_acc.c
> create mode 100644 lib/crypto/crypto_aes.h
> create mode 100644 third_party/aesni-intel/aesni-intel_asm.c
> create mode 100644 third_party/aesni-intel/inst-intel.h
> create mode 100644 third_party/aesni-intel/wscript
>
>diff --git a/lib/crypto/aes.h b/lib/crypto/aes.h
>index 48ea764d514..07881e2dbc2 100644
>--- a/lib/crypto/aes.h
>+++ b/lib/crypto/aes.h
>@@ -44,8 +44,6 @@
> /* symbol renaming */
> #define AES_set_encrypt_key samba_AES_set_encrypt_key
> #define AES_set_decrypt_key samba_AES_decrypt_key
>-#define AES_encrypt samba_AES_encrypt
>-#define AES_decrypt samba_AES_decrypt
> #define AES_cbc_encrypt samba_AES_cbc_encrypt
> #define AES_cfb8_encrypt samba_AES_cfb8_encrypt
> 
>@@ -59,9 +57,23 @@
> #define AES_ENCRYPT 1
> #define AES_DECRYPT 0
> 
>+#include "config.h"
>+
>+#ifdef HAVE_AES_ACC
>+#include "crypto_aes.h"
>+#endif
>+
> typedef struct aes_key {
>     uint32_t key[(AES_MAXNR+1)*4];
>     int rounds;
>+#ifdef HAVE_AES_ACC /* state used by the dispatching __AES_* functions in aes_acc.c */
>+    char _acc_ctx[sizeof(struct crypto_aes_ctx) + 16]; /* raw storage; the extra 16 bytes leave room to align the context */
>+    struct crypto_aes_ctx	*acc_ctx; /* aligned pointer into _acc_ctx; a by-value copy of the key still points at the source key */
>+    void (*encrypt)(const unsigned char *, /* block encrypt hook, invoked as encrypt(in, out, key) */
>+			unsigned char *, const struct aes_key *);
>+    void (*decrypt)(const unsigned char *, /* block decrypt hook, invoked as decrypt(in, out, key) */
>+			unsigned char *, const struct aes_key *);
>+#endif
> } AES_KEY;
> 
> #ifdef __cplusplus
>diff --git a/lib/crypto/aes_acc.c b/lib/crypto/aes_acc.c
>new file mode 100644
>index 00000000000..187237f178d
>--- /dev/null
>+++ b/lib/crypto/aes_acc.c
>@@ -0,0 +1,162 @@
>+#include "replace.h"
>+#include <stdint.h>
>+#include "crypto.h"
>+#include "aes.h"
>+#include "lib/util/byteorder.h"
>+
>+/*
>+ * Runtime selection between the portable AES block routines and the
>+ * AES-NI assembler in third_party/aesni-intel.
>+ *
>+ * The #undefs make sure AES_encrypt/AES_decrypt below name the plain
>+ * (non-dispatching) functions whatever macro remapping a header applied.
>+ */
>+#undef AES_encrypt
>+#undef AES_decrypt
>+
>+#ifdef HAVE_AES_ACC
>+#ifdef HAVE_AESNI_INTEL
>+/*
>+ * Execute CPUID for the given leaf and return EAX, EBX, ECX and EDX in
>+ * where[0] .. where[3].
>+ *
>+ * Named aes_acc_cpuid rather than __cpuid: double-underscore identifiers
>+ * are reserved for the implementation, and GCC and clang ship a <cpuid.h>
>+ * that defines a __cpuid macro which would mangle this definition if that
>+ * header were ever pulled in.
>+ */
>+static inline void aes_acc_cpuid(unsigned int where[4], unsigned int leaf)
>+{
>+	asm volatile("cpuid" :
>+			"=a" (where[0]),
>+			"=b" (where[1]),
>+			"=c" (where[2]),
>+			"=d" (where[3]): "a" (leaf));
>+}
>+
>+/*
>+ * check_for_aes_instructions()
>+ * return 1 if the CPU is "GenuineIntel" and reports AES-NI
>+ * (CPUID leaf 1, ECX bit 25), 0 if it doesn't
>+ *
>+ * NOTE(review): the vendor test also turns acceleration off on CPUs of
>+ * other vendors that set the AES bit - confirm that this is intended.
>+ */
>+static int check_for_aes_instructions(void)
>+{
>+	unsigned int cpuid_results[4];
>+
>+	aes_acc_cpuid(cpuid_results, 0);
>+	/*
>+ 	 *        MSB         LSB
>+ 	 *  EBX = 'u' 'n' 'e' 'G'
>+ 	 *  EDX = 'I' 'e' 'n' 'i'
>+ 	 *  ECX = 'l' 'e' 't' 'n'
>+ 	 */
>+	if (memcmp((unsigned char *)&cpuid_results[1], "Genu", 4) != 0 ||
>+		memcmp((unsigned char *)&cpuid_results[3], "ineI", 4) != 0 ||
>+		memcmp((unsigned char *)&cpuid_results[2], "ntel", 4) != 0)
>+		return 0;
>+
>+	aes_acc_cpuid(cpuid_results, 1);
>+	return !!(cpuid_results[2] & (1 << 25));
>+}
>+
>+/* AES_KEY.encrypt hook for AES-NI; note aesni_enc takes (ctx, out, in). */
>+static void __AES_encrypt_acc(const unsigned char *in, unsigned char *out,
>+							const AES_KEY *key)
>+{
>+	aesni_enc(key->acc_ctx, out, in);
>+}
>+
>+/* AES_KEY.decrypt hook for AES-NI; note aesni_dec takes (ctx, out, in). */
>+static void __AES_decrypt_acc(const unsigned char *in, unsigned char *out,
>+							const AES_KEY *key)
>+{
>+	aesni_dec(key->acc_ctx, out, in);
>+}
>+#endif
>+#else
>+#error "AES-NI unsupported."
>+#endif
>+
>+/*
>+ * Point (k)->acc_ctx at the first 16-byte aligned address inside
>+ * (k)->_acc_ctx.
>+ *
>+ * The round-up is done in uintptr_t: unsigned long is not guaranteed to
>+ * be as wide as a pointer, and a narrower type would truncate the address.
>+ * Because acc_ctx points into the key itself, an AES_KEY must not be
>+ * copied by value once set up - the copy would still use the original.
>+ */
>+#define SET_ACC_CTX(k)	\
>+	do {	\
>+		(k)->acc_ctx =	\
>+(struct crypto_aes_ctx *)(((uintptr_t)(k)->_acc_ctx + 15) & ~(uintptr_t)0xf); \
>+	} while (0)
>+
>+
>+/*
>+ * Prepare key for encryption with userkey (bits wide) and select the
>+ * block routine __AES_encrypt() will call: AES-NI when available,
>+ * the software AES_encrypt otherwise.
>+ * Returns the status of whichever key setup routine ran.
>+ */
>+int __AES_set_encrypt_key(const unsigned char *userkey,
>+					const int bits, AES_KEY *key)
>+{
>+#ifdef HAVE_AESNI_INTEL
>+	if (check_for_aes_instructions()) {
>+		key->encrypt = __AES_encrypt_acc;
>+		SET_ACC_CTX(key);
>+		return aesni_set_key(key->acc_ctx, userkey, bits/8);
>+	}
>+#endif
>+	/*
>+	 * Software path.  Assigned outside the HAVE_AESNI_INTEL guard so
>+	 * the hook is initialised in every HAVE_AES_ACC configuration.
>+	 */
>+	key->encrypt = AES_encrypt;
>+	return AES_set_encrypt_key(userkey, bits, key);
>+}
>+
>+/*
>+ * Decryption counterpart of __AES_set_encrypt_key(): fills key and
>+ * selects the routine __AES_decrypt() will call.
>+ */
>+int __AES_set_decrypt_key(const unsigned char *userkey,
>+					const int bits, AES_KEY *key)
>+{
>+#ifdef HAVE_AESNI_INTEL
>+	if (check_for_aes_instructions()) {
>+		key->decrypt = __AES_decrypt_acc;
>+		SET_ACC_CTX(key);
>+		return aesni_set_key(key->acc_ctx, userkey, bits/8);
>+	}
>+#endif
>+	/* Software path; kept outside the guard like the encrypt hook. */
>+	key->decrypt = AES_decrypt;
>+	return AES_set_decrypt_key(userkey, bits, key);
>+}
>+
>+/* Encrypt in to out through the hook stored in key->encrypt. */
>+void __AES_encrypt(const unsigned char *in, unsigned char *out,
>+				const AES_KEY *key)
>+{
>+#ifdef HAVE_AES_ACC
>+	key->encrypt(in, out, key);
>+#else
>+	AES_encrypt(in, out, key);
>+#endif
>+}
>+
>+/* Decrypt in to out through the hook stored in key->decrypt. */
>+void __AES_decrypt(const unsigned char *in, unsigned char *out,
>+				const AES_KEY *key)
>+{
>+#ifdef HAVE_AES_ACC
>+	key->decrypt(in, out, key);
>+#else
>+	AES_decrypt(in, out, key);
>+#endif
>+}
>diff --git a/lib/crypto/aes_ccm_128.c b/lib/crypto/aes_ccm_128.c
>index a821e8d48a8..ee3880942de 100644
>--- a/lib/crypto/aes_ccm_128.c
>+++ b/lib/crypto/aes_ccm_128.c
>@@ -18,6 +18,7 @@
> */
> 
> #include "replace.h"
>+#define AES_ACC_CLIENT	1
> #include "../lib/crypto/crypto.h"
> #include "lib/util/byteorder.h"
> 
>diff --git a/lib/crypto/aes_cmac_128.c b/lib/crypto/aes_cmac_128.c
>index 5d71e82de0d..8d9ec182f16 100644
>--- a/lib/crypto/aes_cmac_128.c
>+++ b/lib/crypto/aes_cmac_128.c
>@@ -19,6 +19,7 @@
> */
> 
> #include "replace.h"
>+#define AES_ACC_CLIENT	1
> #include "../lib/crypto/crypto.h"
> 
> static const uint8_t const_Zero[] = {
>diff --git a/lib/crypto/aes_gcm_128.c b/lib/crypto/aes_gcm_128.c
>index bfbf176e53d..6cb49d6f932 100644
>--- a/lib/crypto/aes_gcm_128.c
>+++ b/lib/crypto/aes_gcm_128.c
>@@ -18,6 +18,7 @@
> */
> 
> #include "replace.h"
>+#define AES_ACC_CLIENT	1
> #include "../lib/crypto/crypto.h"
> #include "lib/util/byteorder.h"
> 
>diff --git a/lib/crypto/crypto_aes.h b/lib/crypto/crypto_aes.h
>new file mode 100644
>index 00000000000..b0ccdd8d6a5
>--- /dev/null
>+++ b/lib/crypto/crypto_aes.h
>@@ -0,0 +1,55 @@
>+/*
>+ * Common values for AES algorithms
>+ */
>+
>+#ifdef HAVE_AES_ACC
>+#ifndef _CRYPTO_AES_H
>+#define _CRYPTO_AES_H
>+
>+#include <stdint.h>	/* uint32_t: keep this header usable on its own */
>+
>+#define AES_MAX_KEYLENGTH	(15 * 16)
>+#define AES_MAX_KEYLENGTH_U32	(AES_MAX_KEYLENGTH / sizeof(uint32_t))
>+
>+/*
>+ * Please ensure that the first two fields are 16-byte aligned
>+ * relative to the start of the structure, i.e., don't move them!
>+ * The assembler reads key_length at byte offset 2*15*16 ("keysize"),
>+ * so the layout must stay exactly as declared here.
>+ */
>+struct crypto_aes_ctx {
>+	uint32_t key_enc[AES_MAX_KEYLENGTH_U32];
>+	uint32_t key_dec[AES_MAX_KEYLENGTH_U32];
>+	uint32_t key_length;
>+};
>+
>+struct aes_key;
>+
>+/*
>+ * Dispatching entry points implemented in aes_acc.c.  Declared for every
>+ * includer (not only AES_ACC_CLIENT users) so that aes_acc.c itself is
>+ * compiled against matching prototypes.
>+ */
>+int __AES_set_encrypt_key(const unsigned char *userkey,
>+					const int bits, struct aes_key *key);
>+int __AES_set_decrypt_key(const unsigned char *userkey,
>+					const int bits, struct aes_key *key);
>+void __AES_encrypt(const unsigned char *in, unsigned char *out,
>+				const struct aes_key *key);
>+void __AES_decrypt(const unsigned char *in, unsigned char *out,
>+				const struct aes_key *key);
>+
>+#ifdef AES_ACC_CLIENT
>+/* Clients get the generic AES_* names mapped onto the dispatching versions. */
>+#undef	AES_set_encrypt_key
>+#undef	AES_set_decrypt_key
>+#undef	AES_encrypt
>+#undef	AES_decrypt
>+
>+#define	AES_set_encrypt_key	__AES_set_encrypt_key
>+#define	AES_set_decrypt_key	__AES_set_decrypt_key
>+#define	AES_encrypt		__AES_encrypt
>+#define	AES_decrypt		__AES_decrypt
>+#endif /* AES_ACC_CLIENT */
>+#endif /* _CRYPTO_AES_H */
>+#endif /* HAVE_AES_ACC */
>diff --git a/lib/crypto/wscript_build b/lib/crypto/wscript_build
>index d1f152ebcf1..c56ecf17912 100644
>--- a/lib/crypto/wscript_build
>+++ b/lib/crypto/wscript_build
>@@ -11,6 +11,10 @@ elif bld.CONFIG_SET('HAVE_SYS_MD5_H') and bld.CONFIG_SET('HAVE_LIBMD'):
> elif not bld.CONFIG_SET('HAVE_SYS_MD5_H') and not bld.CONFIG_SET('HAVE_COMMONCRYPTO_COMMONDIGEST_H'):
> 	extra_source += ' md5.c'
> 
>+if bld.CONFIG_SET("HAVE_AES_ACC"):
>+	extra_source += ' aes_acc.c'
>+	extra_deps += ' aesni-intel'
>+
> bld.SAMBA_SUBSYSTEM('LIBCRYPTO',
>         source='''crc32.c hmacmd5.c md4.c arcfour.c sha256.c sha512.c hmacsha256.c
>         aes.c rijndael-alg-fst.c aes_cmac_128.c aes_ccm_128.c aes_gcm_128.c
>diff --git a/source4/heimdal/lib/hcrypto/aes.h b/source4/heimdal/lib/hcrypto/aes.h
>index 4ba4516519f..3093a45304b 100644
>--- a/source4/heimdal/lib/hcrypto/aes.h
>+++ b/source4/heimdal/lib/hcrypto/aes.h
>@@ -36,6 +36,8 @@
> #ifndef HEIM_AES_H
> #define HEIM_AES_H 1
> 
>+#include "../lib/crypto/crypto_aes.h"
>+
> /* symbol renaming */
> #define AES_set_encrypt_key hc_AES_set_encrypt_key
> #define AES_set_decrypt_key hc_AES_decrypt_key
>diff --git a/third_party/aesni-intel/aesni-intel_asm.c b/third_party/aesni-intel/aesni-intel_asm.c
>new file mode 100644
>index 00000000000..eb1f476657c
>--- /dev/null
>+++ b/third_party/aesni-intel/aesni-intel_asm.c
>@@ -0,0 +1,2793 @@
>+/*
>+ * Implement AES algorithm in Intel AES-NI instructions.
>+ *
>+ * The white paper of AES-NI instructions can be downloaded from:
>+ *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
>+ *
>+ * Copyright (C) 2008, Intel Corp.
>+ *    Author: Huang Ying <ying.huang@intel.com>
>+ *            Vinodh Gopal <vinodh.gopal@intel.com>
>+ *            Kahraman Akdemir
>+ *
>+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
>+ * interface for 64-bit kernels.
>+ *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
>+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
>+ *             Adrian Hoban <adrian.hoban@intel.com>
>+ *             James Guilford (james.guilford@intel.com)
>+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
>+ *             Tadeusz Struk (tadeusz.struk@intel.com)
>+ *             Wajdi Feghali (wajdi.k.feghali@intel.com)
>+ *    Copyright (c) 2010, Intel Corporation.
>+ *
>+ * Ported x86_64 version to x86:
>+ *    Author: Mathias Krause <minipli@googlemail.com>
>+ *
>+ * This program is free software; you can redistribute it and/or modify
>+ * it under the terms of the GNU General Public License as published by
>+ * the Free Software Foundation; either version 2 of the License, or
>+ * (at your option) any later version.
>+ */
>+
>+#define ENTRY(name) \
>+	.globl name ; \
>+	.align 4,0x90 ; \
>+	name:
>+#define ENDPROC(name) \
>+	.type name, @function ; \
>+	.size name, .-name
>+#include "inst-intel.h"
>+
>+/*
>+ * The following macros are used to move an (un)aligned 16 byte value to/from
>+ * an XMM register.  This can be done for either FP or integer values, for FP use
>+ * movaps (move aligned packed single) or integer use movdqa (move double quad
>+ * aligned).  It doesn't make a performance difference which instruction is used
>+ * since Nehalem (original Core i7) was released.  However, the movaps is a byte
>+ * shorter, so that is the one we'll use for now. (same for unaligned).
>+ */
>+#define MOVADQ	movaps
>+#define MOVUDQ	movups
>+
>+#ifdef __x86_64__
>+
>+.data
>+.align 16
>+.Lgf128mul_x_ble_mask:
>+	.octa 0x00000000000000010000000000000087
>+POLY:   .octa 0xC2000000000000000000000000000001
>+TWOONE: .octa 0x00000001000000000000000000000001
>+
>+# order of these constants should not change.
>+# more specifically, ALL_F should follow SHIFT_MASK,
>+# and ZERO should follow ALL_F
>+
>+SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
>+MASK1:      .octa 0x0000000000000000ffffffffffffffff
>+MASK2:      .octa 0xffffffffffffffff0000000000000000
>+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
>+ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
>+ZERO:       .octa 0x00000000000000000000000000000000
>+ONE:        .octa 0x00000000000000000000000000000001
>+F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
>+dec:        .octa 0x1
>+enc:        .octa 0x2
>+
>+
>+.text
>+
>+
>+#define	STACK_OFFSET    8*3
>+#define	HashKey		16*0	// store HashKey <<1 mod poly here
>+#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
>+#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
>+#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
>+#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
>+				// bits of  HashKey <<1 mod poly here
>+				//(for Karatsuba purposes)
>+#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
>+				// bits of  HashKey^2 <<1 mod poly here
>+				// (for Karatsuba purposes)
>+#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
>+				// bits of  HashKey^3 <<1 mod poly here
>+				// (for Karatsuba purposes)
>+#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
>+				// bits of  HashKey^4 <<1 mod poly here
>+				// (for Karatsuba purposes)
>+#define	VARIABLE_OFFSET	16*8
>+
>+#define arg1 rdi
>+#define arg2 rsi
>+#define arg3 rdx
>+#define arg4 rcx
>+#define arg5 r8
>+#define arg6 r9
>+#define arg7 STACK_OFFSET+8(%r14)
>+#define arg8 STACK_OFFSET+16(%r14)
>+#define arg9 STACK_OFFSET+24(%r14)
>+#define arg10 STACK_OFFSET+32(%r14)
>+#define keysize 2*15*16(%arg1)
>+#endif
>+
>+
>+#define STATE1	%xmm0
>+#define STATE2	%xmm4
>+#define STATE3	%xmm5
>+#define STATE4	%xmm6
>+#define STATE	STATE1
>+#define IN1	%xmm1
>+#define IN2	%xmm7
>+#define IN3	%xmm8
>+#define IN4	%xmm9
>+#define IN	IN1
>+#define KEY	%xmm2
>+#define IV	%xmm3
>+
>+#define BSWAP_MASK %xmm10
>+#define CTR	%xmm11
>+#define INC	%xmm12
>+
>+#define GF128MUL_MASK %xmm10
>+
>+#ifdef __x86_64__
>+#define AREG	%rax
>+#define KEYP	%rdi
>+#define OUTP	%rsi
>+#define UKEYP	OUTP
>+#define INP	%rdx
>+#define LEN	%rcx
>+#define IVP	%r8
>+#define KLEN	%r9d
>+#define T1	%r10
>+#define TKEYP	T1
>+#define T2	%r11
>+#define TCTR_LOW T2
>+#else
>+#define AREG	%eax
>+#define KEYP	%edi
>+#define OUTP	AREG
>+#define UKEYP	OUTP
>+#define INP	%edx
>+#define LEN	%esi
>+#define IVP	%ebp
>+#define KLEN	%ebx
>+#define T1	%ecx
>+#define TKEYP	T1
>+#endif
>+
>+
>+#ifdef __x86_64__
>+/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
>+*
>+*
>+* Input: A and B (128-bits each, bit-reflected)
>+* Output: C = A*B*x mod poly, (i.e. >>1 )
>+* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
>+* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
>+*
>+*/
>+.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
>+	movdqa	  \GH, \TMP1
>+	pshufd	  $78, \GH, \TMP2
>+	pshufd	  $78, \HK, \TMP3
>+	pxor	  \GH, \TMP2            # TMP2 = a1+a0
>+	pxor	  \HK, \TMP3            # TMP3 = b1+b0
>+	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
>+	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
>+	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
>+	pxor	  \GH, \TMP2
>+	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
>+	movdqa	  \TMP2, \TMP3
>+	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
>+	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
>+	pxor	  \TMP3, \GH
>+	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
>+
>+        # first phase of the reduction
>+
>+	movdqa    \GH, \TMP2
>+	movdqa    \GH, \TMP3
>+	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
>+					# in order to perform
>+					# independent shifts
>+	pslld     $31, \TMP2            # packed left shift <<31
>+	pslld     $30, \TMP3            # packed left shift <<30
>+	pslld     $25, \TMP4            # packed left shift <<25
>+	pxor      \TMP3, \TMP2          # xor the shifted versions
>+	pxor      \TMP4, \TMP2
>+	movdqa    \TMP2, \TMP5
>+	psrldq    $4, \TMP5             # right shift TMP5 1 DW
>+	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
>+	pxor      \TMP2, \GH
>+
>+        # second phase of the reduction
>+
>+	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
>+					# in order to perform
>+					# independent shifts
>+	movdqa    \GH,\TMP3
>+	movdqa    \GH,\TMP4
>+	psrld     $1,\TMP2              # packed right shift >>1
>+	psrld     $2,\TMP3              # packed right shift >>2
>+	psrld     $7,\TMP4              # packed right shift >>7
>+	pxor      \TMP3,\TMP2		# xor the shifted versions
>+	pxor      \TMP4,\TMP2
>+	pxor      \TMP5, \TMP2
>+	pxor      \TMP2, \GH
>+	pxor      \TMP1, \GH            # result is in GH
>+.endm
>+
>+/*
>+* if a = number of total plaintext bytes
>+* b = floor(a/16)
>+* num_initial_blocks = b mod 4
>+* encrypt the initial num_initial_blocks blocks and apply ghash on
>+* the ciphertext
>+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
>+* are clobbered
>+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
>+*/
>+
>+
>+.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
>+XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
>+        MOVADQ     SHUF_MASK(%rip), %xmm14
>+	mov	   arg7, %r10           # %r10 = AAD
>+	mov	   arg8, %r12           # %r12 = aadLen
>+	mov	   %r12, %r11
>+	pxor	   %xmm\i, %xmm\i
>+
>+_get_AAD_loop\num_initial_blocks\operation:
>+	movd	   (%r10), \TMP1
>+	pslldq	   $12, \TMP1
>+	psrldq	   $4, %xmm\i
>+	pxor	   \TMP1, %xmm\i
>+	add	   $4, %r10
>+	sub	   $4, %r12
>+	jne	   _get_AAD_loop\num_initial_blocks\operation
>+
>+	cmp	   $16, %r11
>+	je	   _get_AAD_loop2_done\num_initial_blocks\operation
>+
>+	mov	   $16, %r12
>+_get_AAD_loop2\num_initial_blocks\operation:
>+	psrldq	   $4, %xmm\i
>+	sub	   $4, %r12
>+	cmp	   %r11, %r12
>+	jne	   _get_AAD_loop2\num_initial_blocks\operation
>+
>+_get_AAD_loop2_done\num_initial_blocks\operation:
>+	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
>+
>+	xor	   %r11, %r11 # initialise the data pointer offset as zero
>+
>+        # start AES for num_initial_blocks blocks
>+
>+	mov	   %arg5, %rax                      # %rax = *Y0
>+	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
>+	PSHUFB_XMM   %xmm14, \XMM0
>+
>+.if (\i == 5) || (\i == 6) || (\i == 7)
>+	MOVADQ		ONE(%RIP),\TMP1
>+	MOVADQ		(%arg1),\TMP2
>+.irpc index, \i_seq
>+	paddd	   \TMP1, \XMM0                 # INCR Y0
>+	movdqa	   \XMM0, %xmm\index
>+	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
>+	pxor	   \TMP2, %xmm\index
>+.endr
>+	lea	0x10(%arg1),%r10
>+	mov	keysize,%eax
>+	shr	$2,%eax				# 128->4, 192->6, 256->8
>+	add	$5,%eax			      # 128->9, 192->11, 256->13
>+
>+aes_loop_initial_dec\num_initial_blocks:
>+	MOVADQ	(%r10),\TMP1
>+.irpc	index, \i_seq
>+	AESENC	\TMP1, %xmm\index
>+.endr
>+	add	$16,%r10
>+	sub	$1,%eax
>+	jnz	aes_loop_initial_dec\num_initial_blocks
>+
>+	MOVADQ	(%r10), \TMP1
>+.irpc index, \i_seq
>+	AESENCLAST \TMP1, %xmm\index         # Last Round
>+.endr
>+.irpc index, \i_seq
>+	movdqu	   (%arg3 , %r11, 1), \TMP1
>+	pxor	   \TMP1, %xmm\index
>+	movdqu	   %xmm\index, (%arg2 , %r11, 1)
>+	# write back plaintext/ciphertext for num_initial_blocks
>+	add	   $16, %r11
>+
>+	movdqa     \TMP1, %xmm\index
>+	PSHUFB_XMM	   %xmm14, %xmm\index
>+                # prepare plaintext/ciphertext for GHASH computation
>+.endr
>+.endif
>+	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
>+        # apply GHASH on num_initial_blocks blocks
>+
>+.if \i == 5
>+        pxor       %xmm5, %xmm6
>+	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
>+        pxor       %xmm6, %xmm7
>+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
>+        pxor       %xmm7, %xmm8
>+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
>+.elseif \i == 6
>+        pxor       %xmm6, %xmm7
>+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
>+        pxor       %xmm7, %xmm8
>+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
>+.elseif \i == 7
>+        pxor       %xmm7, %xmm8
>+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
>+.endif
>+	cmp	   $64, %r13
>+	jl	_initial_blocks_done\num_initial_blocks\operation
>+	# no need for precomputed values
>+/*
>+*
>+* Precomputations for HashKey parallel with encryption of first 4 blocks.
>+* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
>+*/
>+	MOVADQ	   ONE(%rip), \TMP1
>+	paddd	   \TMP1, \XMM0              # INCR Y0
>+	MOVADQ	   \XMM0, \XMM1
>+	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
>+
>+	paddd	   \TMP1, \XMM0              # INCR Y0
>+	MOVADQ	   \XMM0, \XMM2
>+	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
>+
>+	paddd	   \TMP1, \XMM0              # INCR Y0
>+	MOVADQ	   \XMM0, \XMM3
>+	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
>+
>+	paddd	   \TMP1, \XMM0              # INCR Y0
>+	MOVADQ	   \XMM0, \XMM4
>+	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
>+
>+	MOVADQ	   0(%arg1),\TMP1
>+	pxor	   \TMP1, \XMM1
>+	pxor	   \TMP1, \XMM2
>+	pxor	   \TMP1, \XMM3
>+	pxor	   \TMP1, \XMM4
>+	movdqa	   \TMP3, \TMP5
>+	pshufd	   $78, \TMP3, \TMP1
>+	pxor	   \TMP3, \TMP1
>+	movdqa	   \TMP1, HashKey_k(%rsp)
>+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
>+# TMP5 = HashKey^2<<1 (mod poly)
>+	movdqa	   \TMP5, HashKey_2(%rsp)
>+# HashKey_2 = HashKey^2<<1 (mod poly)
>+	pshufd	   $78, \TMP5, \TMP1
>+	pxor	   \TMP5, \TMP1
>+	movdqa	   \TMP1, HashKey_2_k(%rsp)
>+.irpc index, 1234 # do 4 rounds
>+	movaps 0x10*\index(%arg1), \TMP1
>+	AESENC	   \TMP1, \XMM1
>+	AESENC	   \TMP1, \XMM2
>+	AESENC	   \TMP1, \XMM3
>+	AESENC	   \TMP1, \XMM4
>+.endr
>+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
>+# TMP5 = HashKey^3<<1 (mod poly)
>+	movdqa	   \TMP5, HashKey_3(%rsp)
>+	pshufd	   $78, \TMP5, \TMP1
>+	pxor	   \TMP5, \TMP1
>+	movdqa	   \TMP1, HashKey_3_k(%rsp)
>+.irpc index, 56789 # do next 5 rounds
>+	movaps 0x10*\index(%arg1), \TMP1
>+	AESENC	   \TMP1, \XMM1
>+	AESENC	   \TMP1, \XMM2
>+	AESENC	   \TMP1, \XMM3
>+	AESENC	   \TMP1, \XMM4
>+.endr
>+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
>+# TMP5 = HashKey^4<<1 (mod poly)
>+	movdqa	   \TMP5, HashKey_4(%rsp)
>+	pshufd	   $78, \TMP5, \TMP1
>+	pxor	   \TMP5, \TMP1
>+	movdqa	   \TMP1, HashKey_4_k(%rsp)
>+	lea	   0xa0(%arg1),%r10
>+	mov	   keysize,%eax
>+	shr	   $2,%eax			# 128->4, 192->6, 256->8
>+	sub	   $4,%eax			# 128->0, 192->2, 256->4
>+	jz	   aes_loop_pre_dec_done\num_initial_blocks
>+
>+aes_loop_pre_dec\num_initial_blocks:
>+	MOVADQ	   (%r10),\TMP2
>+.irpc	index, 1234
>+	AESENC	   \TMP2, %xmm\index
>+.endr
>+	add	   $16,%r10
>+	sub	   $1,%eax
>+	jnz	   aes_loop_pre_dec\num_initial_blocks
>+
>+aes_loop_pre_dec_done\num_initial_blocks:
>+	MOVADQ	   (%r10), \TMP2
>+	AESENCLAST \TMP2, \XMM1
>+	AESENCLAST \TMP2, \XMM2
>+	AESENCLAST \TMP2, \XMM3
>+	AESENCLAST \TMP2, \XMM4
>+	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
>+	pxor	   \TMP1, \XMM1
>+	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
>+	movdqa     \TMP1, \XMM1
>+	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
>+	pxor	   \TMP1, \XMM2
>+	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
>+	movdqa     \TMP1, \XMM2
>+	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
>+	pxor	   \TMP1, \XMM3
>+	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
>+	movdqa     \TMP1, \XMM3
>+	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
>+	pxor	   \TMP1, \XMM4
>+	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
>+	movdqa     \TMP1, \XMM4
>+	add	   $64, %r11
>+	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
>+	pxor	   \XMMDst, \XMM1
>+# combine GHASHed value with the corresponding ciphertext
>+	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
>+	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
>+	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
>+
>+_initial_blocks_done\num_initial_blocks\operation:
>+
>+.endm
>+
>+
>+/*
>+* if a = number of total plaintext bytes
>+* b = floor(a/16)
>+* num_initial_blocks = b mod 4
>+* encrypt the initial num_initial_blocks blocks and apply ghash on
>+* the ciphertext
>+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
>+* are clobbered
>+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
>+*/
>+
>+
>+.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
>+XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
>+        MOVADQ     SHUF_MASK(%rip), %xmm14
>+	mov	   arg7, %r10           # %r10 = AAD
>+	mov	   arg8, %r12           # %r12 = aadLen
>+	mov	   %r12, %r11
>+	pxor	   %xmm\i, %xmm\i
>+_get_AAD_loop\num_initial_blocks\operation:
>+	movd	   (%r10), \TMP1
>+	pslldq	   $12, \TMP1
>+	psrldq	   $4, %xmm\i
>+	pxor	   \TMP1, %xmm\i
>+	add	   $4, %r10
>+	sub	   $4, %r12
>+	jne	   _get_AAD_loop\num_initial_blocks\operation
>+	cmp	   $16, %r11
>+	je	   _get_AAD_loop2_done\num_initial_blocks\operation
>+	mov	   $16, %r12
>+_get_AAD_loop2\num_initial_blocks\operation:
>+	psrldq	   $4, %xmm\i
>+	sub	   $4, %r12
>+	cmp	   %r11, %r12
>+	jne	   _get_AAD_loop2\num_initial_blocks\operation
>+_get_AAD_loop2_done\num_initial_blocks\operation:
>+	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
>+
>+	xor	   %r11, %r11 # initialise the data pointer offset as zero
>+
>+        # start AES for num_initial_blocks blocks
>+
>+	mov	   %arg5, %rax                      # %rax = *Y0
>+	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
>+	PSHUFB_XMM   %xmm14, \XMM0
>+
>+.if (\i == 5) || (\i == 6) || (\i == 7)
>+
>+	MOVADQ		ONE(%RIP),\TMP1
>+	MOVADQ		0(%arg1),\TMP2
>+.irpc index, \i_seq
>+	paddd		\TMP1, \XMM0                 # INCR Y0
>+	MOVADQ		\XMM0, %xmm\index
>+	PSHUFB_XMM	%xmm14, %xmm\index      # perform a 16 byte swap
>+	pxor		\TMP2, %xmm\index
>+.endr
>+	lea	0x10(%arg1),%r10
>+	mov	keysize,%eax
>+	shr	$2,%eax				# 128->4, 192->6, 256->8
>+	add	$5,%eax			      # 128->9, 192->11, 256->13
>+
>+aes_loop_initial_enc\num_initial_blocks:
>+	MOVADQ	(%r10),\TMP1
>+.irpc	index, \i_seq
>+	AESENC	\TMP1, %xmm\index
>+.endr
>+	add	$16,%r10
>+	sub	$1,%eax
>+	jnz	aes_loop_initial_enc\num_initial_blocks
>+
>+	MOVADQ	(%r10), \TMP1
>+.irpc index, \i_seq
>+	AESENCLAST \TMP1, %xmm\index         # Last Round
>+.endr
>+.irpc index, \i_seq
>+	movdqu	   (%arg3 , %r11, 1), \TMP1
>+	pxor	   \TMP1, %xmm\index
>+	movdqu	   %xmm\index, (%arg2 , %r11, 1)
>+	# write back plaintext/ciphertext for num_initial_blocks
>+	add	   $16, %r11
>+	PSHUFB_XMM	   %xmm14, %xmm\index
>+
>+		# prepare plaintext/ciphertext for GHASH computation
>+.endr
>+.endif
>+	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
>+        # apply GHASH on num_initial_blocks blocks
>+
>+.if \i == 5
>+        pxor       %xmm5, %xmm6
>+	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
>+        pxor       %xmm6, %xmm7
>+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
>+        pxor       %xmm7, %xmm8
>+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
>+.elseif \i == 6
>+        pxor       %xmm6, %xmm7
>+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
>+        pxor       %xmm7, %xmm8
>+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
>+.elseif \i == 7
>+        pxor       %xmm7, %xmm8
>+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
>+.endif
>+	cmp	   $64, %r13
>+	jl	_initial_blocks_done\num_initial_blocks\operation
>+	# no need for precomputed values
>+/*
>+*
>+* Precomputations for HashKey parallel with encryption of first 4 blocks.
>+* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
>+*/
>+	MOVADQ	   ONE(%RIP),\TMP1
>+	paddd	   \TMP1, \XMM0              # INCR Y0
>+	MOVADQ	   \XMM0, \XMM1
>+	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
>+
>+	paddd	   \TMP1, \XMM0              # INCR Y0
>+	MOVADQ	   \XMM0, \XMM2
>+	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
>+
>+	paddd	   \TMP1, \XMM0              # INCR Y0
>+	MOVADQ	   \XMM0, \XMM3
>+	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
>+
>+	paddd	   \TMP1, \XMM0              # INCR Y0
>+	MOVADQ	   \XMM0, \XMM4
>+	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
>+
>+	MOVADQ	   0(%arg1),\TMP1
>+	pxor	   \TMP1, \XMM1
>+	pxor	   \TMP1, \XMM2
>+	pxor	   \TMP1, \XMM3
>+	pxor	   \TMP1, \XMM4
>+	movdqa	   \TMP3, \TMP5
>+	pshufd	   $78, \TMP3, \TMP1
>+	pxor	   \TMP3, \TMP1
>+	movdqa	   \TMP1, HashKey_k(%rsp)
>+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
>+# TMP5 = HashKey^2<<1 (mod poly)
>+	movdqa	   \TMP5, HashKey_2(%rsp)
>+# HashKey_2 = HashKey^2<<1 (mod poly)
>+	pshufd	   $78, \TMP5, \TMP1
>+	pxor	   \TMP5, \TMP1
>+	movdqa	   \TMP1, HashKey_2_k(%rsp)
>+.irpc index, 1234 # do 4 rounds
>+	movaps 0x10*\index(%arg1), \TMP1
>+	AESENC	   \TMP1, \XMM1
>+	AESENC	   \TMP1, \XMM2
>+	AESENC	   \TMP1, \XMM3
>+	AESENC	   \TMP1, \XMM4
>+.endr
>+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
>+# TMP5 = HashKey^3<<1 (mod poly)
>+	movdqa	   \TMP5, HashKey_3(%rsp)
>+	pshufd	   $78, \TMP5, \TMP1
>+	pxor	   \TMP5, \TMP1
>+	movdqa	   \TMP1, HashKey_3_k(%rsp)
>+.irpc index, 56789 # do next 5 rounds
>+	movaps 0x10*\index(%arg1), \TMP1
>+	AESENC	   \TMP1, \XMM1
>+	AESENC	   \TMP1, \XMM2
>+	AESENC	   \TMP1, \XMM3
>+	AESENC	   \TMP1, \XMM4
>+.endr
>+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
>+# TMP5 = HashKey^4<<1 (mod poly)
>+	movdqa	   \TMP5, HashKey_4(%rsp)
>+	pshufd	   $78, \TMP5, \TMP1
>+	pxor	   \TMP5, \TMP1
>+	movdqa	   \TMP1, HashKey_4_k(%rsp)
>+	lea	   0xa0(%arg1),%r10
>+	mov	   keysize,%eax
>+	shr	   $2,%eax			# 128->4, 192->6, 256->8
>+	sub	   $4,%eax			# 128->0, 192->2, 256->4
>+	jz	   aes_loop_pre_enc_done\num_initial_blocks
>+
>+aes_loop_pre_enc\num_initial_blocks:
>+	MOVADQ	   (%r10),\TMP2
>+.irpc	index, 1234
>+	AESENC	   \TMP2, %xmm\index
>+.endr
>+	add	   $16,%r10
>+	sub	   $1,%eax
>+	jnz	   aes_loop_pre_enc\num_initial_blocks
>+
>+aes_loop_pre_enc_done\num_initial_blocks:
>+	MOVADQ	   (%r10), \TMP2
>+	AESENCLAST \TMP2, \XMM1
>+	AESENCLAST \TMP2, \XMM2
>+	AESENCLAST \TMP2, \XMM3
>+	AESENCLAST \TMP2, \XMM4
>+	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
>+	pxor	   \TMP1, \XMM1
>+	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
>+	pxor	   \TMP1, \XMM2
>+	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
>+	pxor	   \TMP1, \XMM3
>+	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
>+	pxor	   \TMP1, \XMM4
>+	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
>+	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
>+	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
>+	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
>+
>+	add	   $64, %r11
>+	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
>+	pxor	   \XMMDst, \XMM1
>+# combine GHASHed value with the corresponding ciphertext
>+	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
>+	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
>+	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
>+
>+_initial_blocks_done\num_initial_blocks\operation:
>+
>+.endm
>+
>+/*
>+* encrypt 4 blocks at a time (counter blocks are derived from XMM0)
>+* ghash the 4 previously encrypted ciphertext blocks (passed in XMM1-XMM4)
>+* %arg1, %arg2, %arg3 are used as pointers only, not modified
>+* %r11 is the data offset value; %eax, %r10 and %xmm15 are clobbered
>+*/
>+.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
>+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
>+
>+	movdqa	  \XMM1, \XMM5
>+	movdqa	  \XMM2, \XMM6
>+	movdqa	  \XMM3, \XMM7
>+	movdqa	  \XMM4, \XMM8
>+
>+        movdqa    SHUF_MASK(%rip), %xmm15
>+        # multiply XMM5 * HashKey_4 using karatsuba
>+
>+	movdqa	  \XMM5, \TMP4
>+	pshufd	  $78, \XMM5, \TMP6
>+	pxor	  \XMM5, \TMP6
>+	paddd     ONE(%rip), \XMM0		# INCR CNT
>+	movdqa	  HashKey_4(%rsp), \TMP5
>+	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
>+	movdqa    \XMM0, \XMM1
>+	paddd     ONE(%rip), \XMM0		# INCR CNT
>+	movdqa    \XMM0, \XMM2
>+	paddd     ONE(%rip), \XMM0		# INCR CNT
>+	movdqa    \XMM0, \XMM3
>+	paddd     ONE(%rip), \XMM0		# INCR CNT
>+	movdqa    \XMM0, \XMM4
>+	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
>+	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
>+	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
>+	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
>+	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
>+
>+	pxor	  (%arg1), \XMM1
>+	pxor	  (%arg1), \XMM2
>+	pxor	  (%arg1), \XMM3
>+	pxor	  (%arg1), \XMM4
>+	movdqa	  HashKey_4_k(%rsp), \TMP5
>+	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
>+	movaps 0x10(%arg1), \TMP1
>+	AESENC	  \TMP1, \XMM1              # Round 1
>+	AESENC	  \TMP1, \XMM2
>+	AESENC	  \TMP1, \XMM3
>+	AESENC	  \TMP1, \XMM4
>+	movaps 0x20(%arg1), \TMP1
>+	AESENC	  \TMP1, \XMM1              # Round 2
>+	AESENC	  \TMP1, \XMM2
>+	AESENC	  \TMP1, \XMM3
>+	AESENC	  \TMP1, \XMM4
>+	movdqa	  \XMM6, \TMP1
>+	pshufd	  $78, \XMM6, \TMP2
>+	pxor	  \XMM6, \TMP2
>+	movdqa	  HashKey_3(%rsp), \TMP5
>+	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
>+	movaps 0x30(%arg1), \TMP3
>+	AESENC    \TMP3, \XMM1              # Round 3
>+	AESENC    \TMP3, \XMM2
>+	AESENC    \TMP3, \XMM3
>+	AESENC    \TMP3, \XMM4
>+	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
>+	movaps 0x40(%arg1), \TMP3
>+	AESENC	  \TMP3, \XMM1              # Round 4
>+	AESENC	  \TMP3, \XMM2
>+	AESENC	  \TMP3, \XMM3
>+	AESENC	  \TMP3, \XMM4
>+	movdqa	  HashKey_3_k(%rsp), \TMP5
>+	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
>+	movaps 0x50(%arg1), \TMP3
>+	AESENC	  \TMP3, \XMM1              # Round 5
>+	AESENC	  \TMP3, \XMM2
>+	AESENC	  \TMP3, \XMM3
>+	AESENC	  \TMP3, \XMM4
>+	pxor	  \TMP1, \TMP4
>+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
>+	pxor	  \XMM6, \XMM5
>+	pxor	  \TMP2, \TMP6
>+	movdqa	  \XMM7, \TMP1
>+	pshufd	  $78, \XMM7, \TMP2
>+	pxor	  \XMM7, \TMP2
>+	movdqa	  HashKey_2(%rsp ), \TMP5
>+
>+        # Multiply XMM7 * HashKey_2 using karatsuba
>+
>+	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
>+	movaps 0x60(%arg1), \TMP3
>+	AESENC	  \TMP3, \XMM1              # Round 6
>+	AESENC	  \TMP3, \XMM2
>+	AESENC	  \TMP3, \XMM3
>+	AESENC	  \TMP3, \XMM4
>+	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
>+	movaps 0x70(%arg1), \TMP3
>+	AESENC	  \TMP3, \XMM1             # Round 7
>+	AESENC	  \TMP3, \XMM2
>+	AESENC	  \TMP3, \XMM3
>+	AESENC	  \TMP3, \XMM4
>+	movdqa	  HashKey_2_k(%rsp), \TMP5
>+	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
>+	movaps 0x80(%arg1), \TMP3
>+	AESENC	  \TMP3, \XMM1             # Round 8
>+	AESENC	  \TMP3, \XMM2
>+	AESENC	  \TMP3, \XMM3
>+	AESENC	  \TMP3, \XMM4
>+	pxor	  \TMP1, \TMP4
>+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
>+	pxor	  \XMM7, \XMM5
>+	pxor	  \TMP2, \TMP6
>+
>+        # Multiply XMM8 * HashKey
>+        # XMM8 and TMP5 hold the values for the two operands
>+
>+	movdqa	  \XMM8, \TMP1
>+	pshufd	  $78, \XMM8, \TMP2
>+	pxor	  \XMM8, \TMP2
>+	movdqa	  HashKey(%rsp), \TMP5
>+	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
>+	movaps 0x90(%arg1), \TMP3
>+	AESENC	  \TMP3, \XMM1            # Round 9
>+	AESENC	  \TMP3, \XMM2
>+	AESENC	  \TMP3, \XMM3
>+	AESENC	  \TMP3, \XMM4
>+	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
>+	lea	  0xa0(%arg1),%r10
>+	mov	  keysize,%eax
>+	shr	  $2,%eax			# 128->4, 192->6, 256->8
>+	sub	  $4,%eax			# 128->0, 192->2, 256->4
>+	jz	  aes_loop_par_enc_done
>+
>+aes_loop_par_enc:
>+	MOVADQ	  (%r10),\TMP3
>+.irpc	index, 1234
>+	AESENC	  \TMP3, %xmm\index
>+.endr
>+	add	  $16,%r10
>+	sub	  $1,%eax
>+	jnz	  aes_loop_par_enc
>+
>+aes_loop_par_enc_done:
>+	MOVADQ	  (%r10), \TMP3
>+	AESENCLAST \TMP3, \XMM1           # last round
>+	AESENCLAST \TMP3, \XMM2
>+	AESENCLAST \TMP3, \XMM3
>+	AESENCLAST \TMP3, \XMM4
>+	movdqa    HashKey_k(%rsp), \TMP5
>+	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
>+	movdqu	  (%arg3,%r11,1), \TMP3
>+	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
>+	movdqu	  16(%arg3,%r11,1), \TMP3
>+	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
>+	movdqu	  32(%arg3,%r11,1), \TMP3
>+	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
>+	movdqu	  48(%arg3,%r11,1), \TMP3
>+	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
>+        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
>+        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
>+        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
>+        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
>+	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
>+	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
>+	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
>+	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
>+
>+	pxor	  \TMP4, \TMP1
>+	pxor	  \XMM8, \XMM5
>+	pxor	  \TMP6, \TMP2
>+	pxor	  \TMP1, \TMP2
>+	pxor	  \XMM5, \TMP2
>+	movdqa	  \TMP2, \TMP3
>+	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
>+	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
>+	pxor	  \TMP3, \XMM5
>+	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
>+
>+        # first phase of reduction
>+
>+	movdqa    \XMM5, \TMP2
>+	movdqa    \XMM5, \TMP3
>+	movdqa    \XMM5, \TMP4
>+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
>+	pslld     $31, \TMP2                   # packed left shift << 31
>+	pslld     $30, \TMP3                   # packed left shift << 30
>+	pslld     $25, \TMP4                   # packed left shift << 25
>+	pxor      \TMP3, \TMP2	               # xor the shifted versions
>+	pxor      \TMP4, \TMP2
>+	movdqa    \TMP2, \TMP5
>+	psrldq    $4, \TMP5                    # right shift TMP5 1 DW
>+	pslldq    $12, \TMP2                   # left shift TMP2 3 DWs
>+	pxor      \TMP2, \XMM5
>+
>+        # second phase of reduction
>+
>+	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
>+	movdqa    \XMM5,\TMP3
>+	movdqa    \XMM5,\TMP4
>+	psrld     $1, \TMP2                    # packed right shift >>1
>+	psrld     $2, \TMP3                    # packed right shift >>2
>+	psrld     $7, \TMP4                    # packed right shift >>7
>+	pxor      \TMP3,\TMP2		       # xor the shifted versions
>+	pxor      \TMP4,\TMP2
>+	pxor      \TMP5, \TMP2
>+	pxor      \TMP2, \XMM5
>+	pxor      \TMP1, \XMM5                 # result is in XMM5
>+
>+	pxor	  \XMM5, \XMM1                 # fold GHASH result into first new block
>+.endm
>+
>+/*
>+* decrypt 4 blocks at a time (counter blocks are derived from XMM0)
>+* ghash the 4 ciphertext blocks consumed by the previous step (in XMM1-XMM4)
>+* %arg1, %arg2, %arg3 are used as pointers only, not modified
>+* %r11 is the data offset value; %eax, %r10 and %xmm15 are clobbered
>+*/
>+.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
>+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
>+
>+	movdqa	  \XMM1, \XMM5
>+	movdqa	  \XMM2, \XMM6
>+	movdqa	  \XMM3, \XMM7
>+	movdqa	  \XMM4, \XMM8
>+
>+        movdqa    SHUF_MASK(%rip), %xmm15
>+        # multiply XMM5 * HashKey_4 using karatsuba
>+
>+	movdqa	  \XMM5, \TMP4
>+	pshufd	  $78, \XMM5, \TMP6
>+	pxor	  \XMM5, \TMP6
>+	paddd     ONE(%rip), \XMM0		# INCR CNT
>+	movdqa	  HashKey_4(%rsp), \TMP5
>+	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
>+	movdqa    \XMM0, \XMM1
>+	paddd     ONE(%rip), \XMM0		# INCR CNT
>+	movdqa    \XMM0, \XMM2
>+	paddd     ONE(%rip), \XMM0		# INCR CNT
>+	movdqa    \XMM0, \XMM3
>+	paddd     ONE(%rip), \XMM0		# INCR CNT
>+	movdqa    \XMM0, \XMM4
>+	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
>+	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
>+	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
>+	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
>+	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
>+
>+	pxor	  (%arg1), \XMM1
>+	pxor	  (%arg1), \XMM2
>+	pxor	  (%arg1), \XMM3
>+	pxor	  (%arg1), \XMM4
>+	movdqa	  HashKey_4_k(%rsp), \TMP5
>+	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
>+	movaps 0x10(%arg1), \TMP1
>+	AESENC	  \TMP1, \XMM1              # Round 1
>+	AESENC	  \TMP1, \XMM2
>+	AESENC	  \TMP1, \XMM3
>+	AESENC	  \TMP1, \XMM4
>+	movaps 0x20(%arg1), \TMP1
>+	AESENC	  \TMP1, \XMM1              # Round 2
>+	AESENC	  \TMP1, \XMM2
>+	AESENC	  \TMP1, \XMM3
>+	AESENC	  \TMP1, \XMM4
>+	movdqa	  \XMM6, \TMP1
>+	pshufd	  $78, \XMM6, \TMP2
>+	pxor	  \XMM6, \TMP2
>+	movdqa	  HashKey_3(%rsp), \TMP5
>+	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
>+	movaps 0x30(%arg1), \TMP3
>+	AESENC    \TMP3, \XMM1              # Round 3
>+	AESENC    \TMP3, \XMM2
>+	AESENC    \TMP3, \XMM3
>+	AESENC    \TMP3, \XMM4
>+	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
>+	movaps 0x40(%arg1), \TMP3
>+	AESENC	  \TMP3, \XMM1              # Round 4
>+	AESENC	  \TMP3, \XMM2
>+	AESENC	  \TMP3, \XMM3
>+	AESENC	  \TMP3, \XMM4
>+	movdqa	  HashKey_3_k(%rsp), \TMP5
>+	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
>+	movaps 0x50(%arg1), \TMP3
>+	AESENC	  \TMP3, \XMM1              # Round 5
>+	AESENC	  \TMP3, \XMM2
>+	AESENC	  \TMP3, \XMM3
>+	AESENC	  \TMP3, \XMM4
>+	pxor	  \TMP1, \TMP4
>+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
>+	pxor	  \XMM6, \XMM5
>+	pxor	  \TMP2, \TMP6
>+	movdqa	  \XMM7, \TMP1
>+	pshufd	  $78, \XMM7, \TMP2
>+	pxor	  \XMM7, \TMP2
>+	movdqa	  HashKey_2(%rsp ), \TMP5
>+
>+        # Multiply XMM7 * HashKey_2 using karatsuba
>+
>+	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
>+	movaps 0x60(%arg1), \TMP3
>+	AESENC	  \TMP3, \XMM1              # Round 6
>+	AESENC	  \TMP3, \XMM2
>+	AESENC	  \TMP3, \XMM3
>+	AESENC	  \TMP3, \XMM4
>+	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
>+	movaps 0x70(%arg1), \TMP3
>+	AESENC	  \TMP3, \XMM1             # Round 7
>+	AESENC	  \TMP3, \XMM2
>+	AESENC	  \TMP3, \XMM3
>+	AESENC	  \TMP3, \XMM4
>+	movdqa	  HashKey_2_k(%rsp), \TMP5
>+	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
>+	movaps 0x80(%arg1), \TMP3
>+	AESENC	  \TMP3, \XMM1             # Round 8
>+	AESENC	  \TMP3, \XMM2
>+	AESENC	  \TMP3, \XMM3
>+	AESENC	  \TMP3, \XMM4
>+	pxor	  \TMP1, \TMP4
>+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
>+	pxor	  \XMM7, \XMM5
>+	pxor	  \TMP2, \TMP6
>+
>+        # Multiply XMM8 * HashKey
>+        # XMM8 and TMP5 hold the values for the two operands
>+
>+	movdqa	  \XMM8, \TMP1
>+	pshufd	  $78, \XMM8, \TMP2
>+	pxor	  \XMM8, \TMP2
>+	movdqa	  HashKey(%rsp), \TMP5
>+	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
>+	movaps 0x90(%arg1), \TMP3
>+	AESENC	  \TMP3, \XMM1            # Round 9
>+	AESENC	  \TMP3, \XMM2
>+	AESENC	  \TMP3, \XMM3
>+	AESENC	  \TMP3, \XMM4
>+	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
>+	lea	  0xa0(%arg1),%r10
>+	mov	  keysize,%eax
>+	shr	  $2,%eax		        # 128->4, 192->6, 256->8
>+	sub	  $4,%eax			# 128->0, 192->2, 256->4
>+	jz	  aes_loop_par_dec_done
>+
>+aes_loop_par_dec:
>+	MOVADQ	  (%r10),\TMP3
>+.irpc	index, 1234
>+	AESENC	  \TMP3, %xmm\index
>+.endr
>+	add	  $16,%r10
>+	sub	  $1,%eax
>+	jnz	  aes_loop_par_dec
>+
>+aes_loop_par_dec_done:
>+	MOVADQ	  (%r10), \TMP3
>+	AESENCLAST \TMP3, \XMM1           # last round
>+	AESENCLAST \TMP3, \XMM2
>+	AESENCLAST \TMP3, \XMM3
>+	AESENCLAST \TMP3, \XMM4
>+	movdqa    HashKey_k(%rsp), \TMP5
>+	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
>+	movdqu	  (%arg3,%r11,1), \TMP3
>+	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
>+	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
>+	movdqa    \TMP3, \XMM1                 # keep ciphertext for next GHASH
>+	movdqu	  16(%arg3,%r11,1), \TMP3
>+	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
>+	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
>+	movdqa    \TMP3, \XMM2                 # keep ciphertext for next GHASH
>+	movdqu	  32(%arg3,%r11,1), \TMP3
>+	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
>+	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
>+	movdqa    \TMP3, \XMM3                 # keep ciphertext for next GHASH
>+	movdqu	  48(%arg3,%r11,1), \TMP3
>+	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
>+	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
>+	movdqa    \TMP3, \XMM4                 # keep ciphertext for next GHASH
>+	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
>+	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
>+	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
>+	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
>+
>+	pxor	  \TMP4, \TMP1
>+	pxor	  \XMM8, \XMM5
>+	pxor	  \TMP6, \TMP2
>+	pxor	  \TMP1, \TMP2
>+	pxor	  \XMM5, \TMP2
>+	movdqa	  \TMP2, \TMP3
>+	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
>+	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
>+	pxor	  \TMP3, \XMM5
>+	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
>+
>+        # first phase of reduction
>+
>+	movdqa    \XMM5, \TMP2
>+	movdqa    \XMM5, \TMP3
>+	movdqa    \XMM5, \TMP4
>+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
>+	pslld     $31, \TMP2                   # packed left shift << 31
>+	pslld     $30, \TMP3                   # packed left shift << 30
>+	pslld     $25, \TMP4                   # packed left shift << 25
>+	pxor      \TMP3, \TMP2	               # xor the shifted versions
>+	pxor      \TMP4, \TMP2
>+	movdqa    \TMP2, \TMP5
>+	psrldq    $4, \TMP5                    # right shift TMP5 1 DW
>+	pslldq    $12, \TMP2                   # left shift TMP2 3 DWs
>+	pxor      \TMP2, \XMM5
>+
>+        # second phase of reduction
>+
>+	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
>+	movdqa    \XMM5,\TMP3
>+	movdqa    \XMM5,\TMP4
>+	psrld     $1, \TMP2                    # packed right shift >>1
>+	psrld     $2, \TMP3                    # packed right shift >>2
>+	psrld     $7, \TMP4                    # packed right shift >>7
>+	pxor      \TMP3,\TMP2		       # xor the shifted versions
>+	pxor      \TMP4,\TMP2
>+	pxor      \TMP5, \TMP2
>+	pxor      \TMP2, \XMM5
>+	pxor      \TMP1, \XMM5                 # result is in XMM5
>+
>+	pxor	  \XMM5, \XMM1                 # fold GHASH result into first new block
>+.endm
>+
>+/* GHASH the last 4 ciphertext blocks (XMM1-XMM4); result in XMMDst. */
>+.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
>+TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
>+
>+        # Multiply XMM1 * HashKey_4 (using Karatsuba)
>+
>+	movdqa	  \XMM1, \TMP6
>+	pshufd	  $78, \XMM1, \TMP2
>+	pxor	  \XMM1, \TMP2
>+	movdqa	  HashKey_4(%rsp), \TMP5
>+	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
>+	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
>+	movdqa	  HashKey_4_k(%rsp), \TMP4
>+	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
>+	movdqa	  \XMM1, \XMMDst
>+	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
>+
>+        # Multiply XMM2 * HashKey_3 (using Karatsuba)
>+
>+	movdqa	  \XMM2, \TMP1
>+	pshufd	  $78, \XMM2, \TMP2
>+	pxor	  \XMM2, \TMP2
>+	movdqa	  HashKey_3(%rsp), \TMP5
>+	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
>+	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
>+	movdqa	  HashKey_3_k(%rsp), \TMP4
>+	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
>+	pxor	  \TMP1, \TMP6
>+	pxor	  \XMM2, \XMMDst
>+	pxor	  \TMP2, \XMM1
>+# results accumulated in TMP6, XMMDst, XMM1
>+
>+        # Multiply XMM3 * HashKey_2 (using Karatsuba)
>+
>+	movdqa	  \XMM3, \TMP1
>+	pshufd	  $78, \XMM3, \TMP2
>+	pxor	  \XMM3, \TMP2
>+	movdqa	  HashKey_2(%rsp), \TMP5
>+	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
>+	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
>+	movdqa	  HashKey_2_k(%rsp), \TMP4
>+	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
>+	pxor	  \TMP1, \TMP6
>+	pxor	  \XMM3, \XMMDst
>+	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
>+
>+        # Multiply XMM4 * HashKey (using Karatsuba)
>+	movdqa	  \XMM4, \TMP1
>+	pshufd	  $78, \XMM4, \TMP2
>+	pxor	  \XMM4, \TMP2
>+	movdqa	  HashKey(%rsp), \TMP5
>+	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
>+	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
>+	movdqa	  HashKey_k(%rsp), \TMP4
>+	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
>+	pxor	  \TMP1, \TMP6
>+	pxor	  \XMM4, \XMMDst
>+	pxor	  \XMM1, \TMP2
>+	pxor	  \TMP6, \TMP2
>+	pxor	  \XMMDst, \TMP2
>+	# middle section of the temp results combined as in karatsuba algorithm
>+	movdqa	  \TMP2, \TMP4
>+	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
>+	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
>+	pxor	  \TMP4, \XMMDst
>+	pxor	  \TMP2, \TMP6
>+# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
>+	# first phase of the reduction
>+	movdqa    \XMMDst, \TMP2
>+	movdqa    \XMMDst, \TMP3
>+	movdqa    \XMMDst, \TMP4
>+# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
>+	pslld     $31, \TMP2                # packed left shifting << 31
>+	pslld     $30, \TMP3                # packed left shifting << 30
>+	pslld     $25, \TMP4                # packed left shifting << 25
>+	pxor      \TMP3, \TMP2              # xor the shifted versions
>+	pxor      \TMP4, \TMP2
>+	movdqa    \TMP2, \TMP7
>+	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
>+	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
>+	pxor      \TMP2, \XMMDst
>+
>+        # second phase of the reduction
>+	movdqa    \XMMDst, \TMP2
>+	# make 3 copies of XMMDst for doing 3 shift operations
>+	movdqa    \XMMDst, \TMP3
>+	movdqa    \XMMDst, \TMP4
>+	psrld     $1, \TMP2                 # packed right shift >> 1
>+	psrld     $2, \TMP3                 # packed right shift >> 2
>+	psrld     $7, \TMP4                 # packed right shift >> 7
>+	pxor      \TMP3, \TMP2              # xor the shifted versions
>+	pxor      \TMP4, \TMP2
>+	pxor      \TMP7, \TMP2
>+	pxor      \TMP2, \XMMDst
>+	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
>+.endm
>+
>+
>+/* Encryption of a single block: XMM0 is encrypted in place with the key
>+* schedule at %arg1, TMP1 is scratch; uses eax & r10
>+*/
>+
>+.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
>+
>+	pxor		(%arg1), \XMM0		# initial round key XOR
>+	mov		keysize,%eax
>+	shr		$2,%eax			# 128->4, 192->6, 256->8
>+	add		$5,%eax			# 128->9, 192->11, 256->13
>+	lea		16(%arg1), %r10	  # get first expanded key address
>+
>+_esb_loop_\@:
>+	MOVADQ		(%r10),\TMP1
>+	AESENC		\TMP1,\XMM0
>+	add		$16,%r10
>+	sub		$1,%eax
>+	jnz		_esb_loop_\@
>+
>+	MOVADQ		(%r10),\TMP1		# round key for the final round
>+	AESENCLAST	\TMP1,\XMM0
>+.endm
>+/*****************************************************************************
>+* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
>+*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
>+*                   const u8 *in,      // Ciphertext input
>+*                   u64 plaintext_len, // Length of data in bytes for decryption.
>+*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
>+*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
>+*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
>+*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
>+*                   const u8 *aad,     // Additional Authentication Data (AAD)
>+*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
>+*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
>+*                                      // given authentication tag and only return the plaintext if they match.
>+*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
>+*                                      // (most likely), 12 or 8.
>+*
>+* Assumptions:
>+*
>+* keys:
>+*       keys are pre-expanded and aligned to 16 bytes. we are using the first
>+*       set of round keys (11, 13 or 15 by key size) in void *aes_ctx
>+*
>+* iv:
>+*       0                   1                   2                   3
>+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                             Salt  (From the SA)               |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                     Initialization Vector                     |
>+*       |         (This is the sequence number from IPSec header)       |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                              0x1                              |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*
>+*
>+*
>+* AAD:
>+*       AAD padded to 128 bits with 0
>+*       for example, assume AAD is a u32 vector
>+*
>+*       if AAD is 8 bytes:
>+*       AAD[3] = {A0, A1};
>+*       padded AAD in xmm register = {A1 A0 0 0}
>+*
>+*       0                   1                   2                   3
>+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                               SPI (A1)                        |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                     32-bit Sequence Number (A0)               |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                              0x0                              |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*
>+*                                       AAD Format with 32-bit Sequence Number
>+*
>+*       if AAD is 12 bytes:
>+*       AAD[3] = {A0, A1, A2};
>+*       padded AAD in xmm register = {A2 A1 A0 0}
>+*
>+*       0                   1                   2                   3
>+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                               SPI (A2)                        |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                 64-bit Extended Sequence Number {A1,A0}       |
>+*       |                                                               |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                              0x0                              |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*
>+*                        AAD Format with 64-bit Extended Sequence Number
>+*
>+* aadLen:
>+*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
>+*       The code supports 16 too but for other sizes, the code will fail.
>+*
>+* TLen:
>+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
>+*       For other sizes, the code will fail.
>+*
>+* poly = x^128 + x^127 + x^126 + x^121 + 1
>+*
>+*****************************************************************************/
>+ENTRY(aesni_gcm_dec)
>+	push	%r12
>+	push	%r13
>+	push	%r14
>+	mov	%rsp, %r14
>+/*
>+* states of %xmm registers %xmm6:%xmm15 not saved
>+* all %xmm registers are clobbered
>+*/
>+	sub	$VARIABLE_OFFSET, %rsp
>+	and	$~63, %rsp                        # align rsp to 64 bytes
>+	mov	%arg6, %r12
>+	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
>+        movdqa  SHUF_MASK(%rip), %xmm2
>+	PSHUFB_XMM %xmm2, %xmm13
>+
>+
>+# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
>+
>+	movdqa	%xmm13, %xmm2
>+	psllq	$1, %xmm13
>+	psrlq	$63, %xmm2
>+	movdqa	%xmm2, %xmm1
>+	pslldq	$8, %xmm2
>+	psrldq	$8, %xmm1
>+	por	%xmm2, %xmm13
>+
>+        # Reduction
>+
>+	pshufd	$0x24, %xmm1, %xmm2
>+	pcmpeqd TWOONE(%rip), %xmm2
>+	pand	POLY(%rip), %xmm2
>+	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
>+
>+
>+        # Decrypt first few blocks
>+
>+	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
>+	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
>+	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
>+	mov %r13, %r12
>+	and $(3<<4), %r12                   # %r12 = (full blocks mod 4) * 16
>+	jz _initial_num_blocks_is_0_decrypt
>+	cmp $(2<<4), %r12
>+	jb _initial_num_blocks_is_1_decrypt
>+	je _initial_num_blocks_is_2_decrypt
>+_initial_num_blocks_is_3_decrypt:
>+	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
>+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
>+	sub	$48, %r13
>+	jmp	_initial_blocks_decrypted
>+_initial_num_blocks_is_2_decrypt:
>+	INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
>+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
>+	sub	$32, %r13
>+	jmp	_initial_blocks_decrypted
>+_initial_num_blocks_is_1_decrypt:
>+	INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
>+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
>+	sub	$16, %r13
>+	jmp	_initial_blocks_decrypted
>+_initial_num_blocks_is_0_decrypt:
>+	INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
>+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
>+_initial_blocks_decrypted:
>+	cmp	$0, %r13
>+	je	_zero_cipher_left_decrypt
>+	sub	$64, %r13
>+	je	_four_cipher_left_decrypt
>+_decrypt_by_4:
>+	GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
>+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
>+	add	$64, %r11
>+	sub	$64, %r13
>+	jne	_decrypt_by_4
>+_four_cipher_left_decrypt:
>+	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
>+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
>+_zero_cipher_left_decrypt:
>+	mov	%arg4, %r13
>+	and	$15, %r13				# %r13 = arg4 (mod 16)
>+	je	_multiple_of_16_bytes_decrypt
>+
>+        # Handle the last <16 byte block separately
>+
>+	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
>+        movdqa SHUF_MASK(%rip), %xmm10
>+	PSHUFB_XMM %xmm10, %xmm0
>+
>+	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
>+	sub $16, %r11                  # back up so the 16 byte load below
>+	add %r13, %r11                 # ends on the last input byte
>+	movdqu (%arg3,%r11,1), %xmm1   # receive the last <16 byte block
>+	lea SHIFT_MASK+16(%rip), %r12
>+	sub %r13, %r12
>+# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
>+# (%r13 is the number of bytes in plaintext mod 16)
>+	movdqu (%r12), %xmm2           # get the appropriate shuffle mask
>+	PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 bytes
>+	# NOTE(review): the movdqu from %arg3 above starts 16-%r13 bytes before the tail; confirm callers keep those bytes readable
>+	movdqa  %xmm1, %xmm2
>+	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
>+	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
>+	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
>+	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
>+	pand    %xmm1, %xmm2
>+        movdqa SHUF_MASK(%rip), %xmm10
>+	PSHUFB_XMM %xmm10 ,%xmm2
>+
>+	pxor %xmm2, %xmm8
>+	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
>+	          # GHASH computation for the last <16 byte block
>+	sub %r13, %r11                 # undo the adjustment made before the load
>+	add $16, %r11
>+
>+        # output %r13 bytes
>+	MOVQ_R64_XMM	%xmm0, %rax
>+	cmp	$8, %r13
>+	jle	_less_than_8_bytes_left_decrypt
>+	mov	%rax, (%arg2 , %r11, 1)
>+	add	$8, %r11
>+	psrldq	$8, %xmm0
>+	MOVQ_R64_XMM	%xmm0, %rax
>+	sub	$8, %r13
>+_less_than_8_bytes_left_decrypt:
>+	mov	%al,  (%arg2, %r11, 1)
>+	add	$1, %r11
>+	shr	$8, %rax
>+	sub	$1, %r13
>+	jne	_less_than_8_bytes_left_decrypt
>+_multiple_of_16_bytes_decrypt:
>+	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
>+	shl	$3, %r12		  # convert into number of bits
>+	movd	%r12d, %xmm15		  # len(A) in %xmm15
>+	shl	$3, %arg4		  # len(C) in bits (*8)
>+	MOVQ_R64_XMM	%arg4, %xmm1
>+	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
>+	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
>+	pxor	%xmm15, %xmm8
>+	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
>+	         # final GHASH computation
>+        movdqa SHUF_MASK(%rip), %xmm10
>+	PSHUFB_XMM %xmm10, %xmm8
>+
>+	mov	%arg5, %rax		  # %rax = *Y0
>+	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
>+	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
>+	pxor	%xmm8, %xmm0		  # %xmm0 = E(K, Y0) XOR GHASH
>+_return_T_decrypt:
>+	mov	arg9, %r10                # %r10 = authTag
>+	mov	arg10, %r11               # %r11 = auth_tag_len
>+	cmp	$16, %r11
>+	je	_T_16_decrypt
>+	cmp	$12, %r11
>+	je	_T_12_decrypt
>+_T_8_decrypt:				  # also taken for any length other than 16 or 12
>+	MOVQ_R64_XMM	%xmm0, %rax
>+	mov	%rax, (%r10)
>+	jmp	_return_T_done_decrypt
>+_T_12_decrypt:
>+	MOVQ_R64_XMM	%xmm0, %rax
>+	mov	%rax, (%r10)
>+	psrldq	$8, %xmm0
>+	movd	%xmm0, %eax
>+	mov	%eax, 8(%r10)
>+	jmp	_return_T_done_decrypt
>+_T_16_decrypt:
>+	movdqu	%xmm0, (%r10)
>+_return_T_done_decrypt:
>+	mov	%r14, %rsp
>+	pop	%r14
>+	pop	%r13
>+	pop	%r12
>+	ret
>+ENDPROC(aesni_gcm_dec)
>+
>+
>+/*****************************************************************************
>+* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
>+*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
>+*                    const u8 *in,       // Plaintext input
>+*                    u64 plaintext_len,  // Length of data in bytes for encryption.
>+*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
>+*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
>+*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
>+*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
>+*                    const u8 *aad,      // Additional Authentication Data (AAD)
>+*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
>+*                    u8 *auth_tag,       // Authenticated Tag output.
>+*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
>+*                                        // 12 or 8.
>+*
>+* Assumptions:
>+*
>+* keys:
>+*       keys are pre-expanded and aligned to 16 bytes. we are using the
>+*       first set of round keys (11, 13 or 15 by key size) in void *aes_ctx
>+*
>+*
>+* iv:
>+*       0                   1                   2                   3
>+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                             Salt  (From the SA)               |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                     Initialization Vector                     |
>+*       |         (This is the sequence number from IPSec header)       |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                              0x1                              |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*
>+*
>+*
>+* AAD:
>+*       AAD padded to 128 bits with 0
>+*       for example, assume AAD is a u32 vector
>+*
>+*       if AAD is 8 bytes:
>+*       AAD[3] = {A0, A1};
>+*       padded AAD in xmm register = {A1 A0 0 0}
>+*
>+*       0                   1                   2                   3
>+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                               SPI (A1)                        |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                     32-bit Sequence Number (A0)               |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                              0x0                              |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*
>+*                                 AAD Format with 32-bit Sequence Number
>+*
>+*       if AAD is 12 bytes:
>+*       AAD[3] = {A0, A1, A2};
>+*       padded AAD in xmm register = {A2 A1 A0 0}
>+*
>+*       0                   1                   2                   3
>+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                               SPI (A2)                        |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                 64-bit Extended Sequence Number {A1,A0}       |
>+*       |                                                               |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*       |                              0x0                              |
>+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>+*
>+*                         AAD Format with 64-bit Extended Sequence Number
>+*
>+* aadLen:
>+*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
>+*       The code supports 16 too but for other sizes, the code will fail.
>+*
>+* TLen:
>+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
>+*       For other sizes, the code will fail.
>+*
>+* poly = x^128 + x^127 + x^126 + x^121 + 1
>+***************************************************************************/
>+ENTRY(aesni_gcm_enc)
>+	push	%r12
>+	push	%r13
>+	push	%r14
>+	mov	%rsp, %r14		# %r14 keeps the caller stack pointer, restored before ret
>+# Tail handling below reads input via arg3, writes output via arg2; the tag goes to arg9.
>+# states of %xmm registers %xmm6:%xmm15 not saved
>+# all %xmm registers are clobbered
>+#
>+	sub	$VARIABLE_OFFSET, %rsp	# reserve stack scratch space (holds HashKey below)
>+	and	$~63, %rsp		# align the scratch area to 64 bytes
>+	mov	%arg6, %r12
>+	movdqu	(%r12), %xmm13		# load the 16-byte hash key that arg6 points to
>+        movdqa  SHUF_MASK(%rip), %xmm2
>+	PSHUFB_XMM %xmm2, %xmm13
>+
>+
>+# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
>+
>+	movdqa	%xmm13, %xmm2
>+	psllq	$1, %xmm13
>+	psrlq	$63, %xmm2		# bits shifted out of each 64-bit half
>+	movdqa	%xmm2, %xmm1
>+	pslldq	$8, %xmm2
>+	psrldq	$8, %xmm1
>+	por	%xmm2, %xmm13		# carry the low-half bit into the high half
>+
>+        # reduce HashKey<<1
>+
>+	pshufd	$0x24, %xmm1, %xmm2
>+	pcmpeqd TWOONE(%rip), %xmm2
>+	pand	POLY(%rip), %xmm2
>+	pxor	%xmm2, %xmm13
>+	movdqa	%xmm13, HashKey(%rsp)
>+	mov	%arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
>+	and	$-16, %r13		# %r13 = length rounded down to whole 16-byte blocks
>+	mov	%r13, %r12
>+
>+        # Encrypt first few blocks
>+
>+	and	$(3<<4), %r12		# %r12 = 16 * (number of whole blocks mod 4)
>+	jz	_initial_num_blocks_is_0_encrypt
>+	cmp	$(2<<4), %r12
>+	jb	_initial_num_blocks_is_1_encrypt
>+	je	_initial_num_blocks_is_2_encrypt
>+_initial_num_blocks_is_3_encrypt:
>+	INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
>+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
>+	sub	$48, %r13
>+	jmp	_initial_blocks_encrypted
>+_initial_num_blocks_is_2_encrypt:
>+	INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
>+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
>+	sub	$32, %r13
>+	jmp	_initial_blocks_encrypted
>+_initial_num_blocks_is_1_encrypt:
>+	INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
>+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
>+	sub	$16, %r13
>+	jmp	_initial_blocks_encrypted
>+_initial_num_blocks_is_0_encrypt:
>+	INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
>+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
>+_initial_blocks_encrypted:
>+
>+        # Main loop - Encrypt remaining blocks
>+
>+	cmp	$0, %r13
>+	je	_zero_cipher_left_encrypt
>+	sub	$64, %r13
>+	je	_four_cipher_left_encrypt
>+_encrypt_by_4_encrypt:
>+	GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
>+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
>+	add	$64, %r11
>+	sub	$64, %r13
>+	jne	_encrypt_by_4_encrypt
>+_four_cipher_left_encrypt:
>+	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
>+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
>+_zero_cipher_left_encrypt:
>+	mov	%arg4, %r13
>+	and	$15, %r13			# %r13 = arg4 (mod 16)
>+	je	_multiple_of_16_bytes_encrypt
>+
>+         # Handle the last <16 Byte block separately
>+	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
>+        movdqa SHUF_MASK(%rip), %xmm10
>+	PSHUFB_XMM %xmm10, %xmm0
>+
>+
>+	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
>+	sub $16, %r11
>+	add %r13, %r11
>+	movdqu (%arg3,%r11,1), %xmm1     # 16-byte load ending at the last input byte. NOTE(review): presumably reads before the buffer when the whole input is <16 bytes - confirm
>+	lea SHIFT_MASK+16(%rip), %r12
>+	sub %r13, %r12
>+	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
>+	# (%r13 is the number of bytes in plaintext mod 16)
>+	movdqu	(%r12), %xmm2           # get the appropriate shuffle mask
>+	PSHUFB_XMM	%xmm2, %xmm1            # shift right 16-r13 byte
>+	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
>+	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
>+	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
>+	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
>+        movdqa SHUF_MASK(%rip), %xmm10
>+	PSHUFB_XMM %xmm10,%xmm0
>+
>+	pxor	%xmm0, %xmm8
>+	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
>+	# GHASH computation for the last <16 byte block
>+	sub	%r13, %r11		# undo the offset adjustment made before the load
>+	add	$16, %r11
>+
>+	movdqa SHUF_MASK(%rip), %xmm10
>+	PSHUFB_XMM %xmm10, %xmm0
>+
>+	# shuffle xmm0 back to output as ciphertext
>+
>+        # Output %r13 bytes
>+	MOVQ_R64_XMM %xmm0, %rax
>+	cmp $8, %r13
>+	jle _less_than_8_bytes_left_encrypt	# 8 or fewer bytes: store them one at a time
>+	mov %rax, (%arg2 , %r11, 1)
>+	add $8, %r11
>+	psrldq $8, %xmm0
>+	MOVQ_R64_XMM %xmm0, %rax
>+	sub $8, %r13
>+_less_than_8_bytes_left_encrypt:
>+	mov %al,  (%arg2, %r11, 1)
>+	add $1, %r11
>+	shr $8, %rax
>+	sub $1, %r13
>+	jne _less_than_8_bytes_left_encrypt
>+_multiple_of_16_bytes_encrypt:
>+	mov	arg8, %r12    # %r12 = aadLen (number of bytes)
>+	shl	$3, %r12		# convert to bits (*8)
>+	movd	%r12d, %xmm15       # len(A) in %xmm15
>+	shl	$3, %arg4               # len(C) in bits (*8)
>+	MOVQ_R64_XMM	%arg4, %xmm1
>+	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
>+	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
>+	pxor	%xmm15, %xmm8
>+	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
>+	# final GHASH computation
>+        movdqa SHUF_MASK(%rip), %xmm10
>+	PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap
>+
>+	mov	%arg5, %rax		       # %rax  = *Y0
>+	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
>+	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
>+	pxor	%xmm8, %xmm0		# tag = Encrypt(K, Y0) XOR final GHASH value
>+_return_T_encrypt:
>+	mov	arg9, %r10                     # %r10 = authTag
>+	mov	arg10, %r11                    # %r11 = auth_tag_len
>+	cmp	$16, %r11
>+	je	_T_16_encrypt
>+	cmp	$12, %r11
>+	je	_T_12_encrypt
>+_T_8_encrypt:				# any length other than 16 or 12 stores 8 bytes
>+	MOVQ_R64_XMM	%xmm0, %rax
>+	mov	%rax, (%r10)
>+	jmp	_return_T_done_encrypt
>+_T_12_encrypt:
>+	MOVQ_R64_XMM	%xmm0, %rax
>+	mov	%rax, (%r10)
>+	psrldq	$8, %xmm0
>+	movd	%xmm0, %eax
>+	mov	%eax, 8(%r10)
>+	jmp	_return_T_done_encrypt
>+_T_16_encrypt:
>+	movdqu	%xmm0, (%r10)
>+_return_T_done_encrypt:
>+	mov	%r14, %rsp
>+	pop	%r14
>+	pop	%r13
>+	pop	%r12
>+	ret
>+ENDPROC(aesni_gcm_enc)
>+
>+#endif
>+
>+
>+.align 4
>+_key_expansion_128:
>+_key_expansion_256a:
>+	pshufd $0b11111111, %xmm1, %xmm1	# broadcast dword 3 of xmm1 (keygen-assist result from the caller)
>+	shufps $0b00010000, %xmm0, %xmm4	# relies on dword 0 of xmm4 being zero
>+	pxor %xmm4, %xmm0
>+	shufps $0b10001100, %xmm0, %xmm4
>+	pxor %xmm4, %xmm0		# xmm0 dwords are now running XORs of the previous round key
>+	pxor %xmm1, %xmm0		# fold in the broadcast word: xmm0 = next round key
>+	movaps %xmm0, (TKEYP)		# append it to the key schedule
>+	add $0x10, TKEYP		# advance to the next 16-byte slot
>+	ret
>+ENDPROC(_key_expansion_128)
>+ENDPROC(_key_expansion_256a)
>+
>+.align 4
>+_key_expansion_192a:
>+	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of the keygen-assist result
>+	shufps $0b00010000, %xmm0, %xmm4	# relies on dword 0 of xmm4 being zero
>+	pxor %xmm4, %xmm0
>+	shufps $0b10001100, %xmm0, %xmm4
>+	pxor %xmm4, %xmm0
>+	pxor %xmm1, %xmm0		# xmm0 = next four key words
>+
>+	movaps %xmm2, %xmm5
>+	movaps %xmm2, %xmm6		# keep the previous xmm2 for the first store below
>+	pslldq $4, %xmm5
>+	pshufd $0b11111111, %xmm0, %xmm3	# broadcast the newest word of xmm0
>+	pxor %xmm3, %xmm2
>+	pxor %xmm5, %xmm2		# xmm2 = next two key words (low 8 bytes)
>+
>+	movaps %xmm0, %xmm1
>+	shufps $0b01000100, %xmm0, %xmm6	# low half of old xmm2 followed by low half of xmm0
>+	movaps %xmm6, (TKEYP)
>+	shufps $0b01001110, %xmm2, %xmm1	# high half of xmm0 followed by low half of new xmm2
>+	movaps %xmm1, 0x10(TKEYP)
>+	add $0x20, TKEYP		# two 16-byte slots were written
>+	ret
>+ENDPROC(_key_expansion_192a)
>+
>+.align 4
>+_key_expansion_192b:
>+	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of the keygen-assist result
>+	shufps $0b00010000, %xmm0, %xmm4	# relies on dword 0 of xmm4 being zero
>+	pxor %xmm4, %xmm0
>+	shufps $0b10001100, %xmm0, %xmm4
>+	pxor %xmm4, %xmm0
>+	pxor %xmm1, %xmm0		# xmm0 = next four key words
>+
>+	movaps %xmm2, %xmm5
>+	pslldq $4, %xmm5
>+	pshufd $0b11111111, %xmm0, %xmm3	# broadcast the newest word of xmm0
>+	pxor %xmm3, %xmm2
>+	pxor %xmm5, %xmm2		# xmm2 = next two key words, carried to the next call
>+
>+	movaps %xmm0, (TKEYP)		# only xmm0 is stored here (one 16-byte slot)
>+	add $0x10, TKEYP
>+	ret
>+ENDPROC(_key_expansion_192b)
>+
>+.align 4
>+_key_expansion_256b:
>+	pshufd $0b10101010, %xmm1, %xmm1	# broadcast dword 2 of the keygen-assist result
>+	shufps $0b00010000, %xmm2, %xmm4	# relies on dword 0 of xmm4 being zero
>+	pxor %xmm4, %xmm2
>+	shufps $0b10001100, %xmm2, %xmm4
>+	pxor %xmm4, %xmm2		# xmm2 dwords are now running XORs of its previous value
>+	pxor %xmm1, %xmm2		# xmm2 = next round key (second key half)
>+	movaps %xmm2, (TKEYP)		# append it to the key schedule
>+	add $0x10, TKEYP
>+	ret
>+ENDPROC(_key_expansion_256b)
>+
>+/* Expand in_key into the encryption schedule (ctx+0) and the decryption schedule (ctx+240).
>+ * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
>+ *                   unsigned int key_len)
>+ * key_len is stored at ctx+480; this routine always returns 0 in AREG. */
>+ENTRY(aesni_set_key)
>+#ifndef __x86_64__
>+	pushl KEYP
>+	movl 8(%esp), KEYP		# ctx
>+	movl 12(%esp), UKEYP		# in_key
>+	movl 16(%esp), %edx		# key_len
>+#endif
>+	movups (UKEYP), %xmm0		# user key (first 16 bytes)
>+	movaps %xmm0, (KEYP)		# round key 0 is the first 16 key bytes
>+	lea 0x10(KEYP), TKEYP		# key addr
>+	movl %edx, 480(KEYP)		# remember the key length in the context
>+	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
>+	cmp $24, %dl
>+	jb .Lenc_key128			# key_len below 24: 128-bit path
>+	je .Lenc_key192			# key_len equal to 24: 192-bit path
>+	movups 0x10(UKEYP), %xmm2	# other user key
>+	movaps %xmm2, (TKEYP)		# round key 1 is the second 16 key bytes
>+	add $0x10, TKEYP
>+	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
>+	call _key_expansion_256a
>+	AESKEYGENASSIST 0x1 %xmm0 %xmm1
>+	call _key_expansion_256b
>+	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
>+	call _key_expansion_256a
>+	AESKEYGENASSIST 0x2 %xmm0 %xmm1
>+	call _key_expansion_256b
>+	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
>+	call _key_expansion_256a
>+	AESKEYGENASSIST 0x4 %xmm0 %xmm1
>+	call _key_expansion_256b
>+	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
>+	call _key_expansion_256a
>+	AESKEYGENASSIST 0x8 %xmm0 %xmm1
>+	call _key_expansion_256b
>+	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
>+	call _key_expansion_256a
>+	AESKEYGENASSIST 0x10 %xmm0 %xmm1
>+	call _key_expansion_256b
>+	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
>+	call _key_expansion_256a
>+	AESKEYGENASSIST 0x20 %xmm0 %xmm1
>+	call _key_expansion_256b
>+	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
>+	call _key_expansion_256a
>+	jmp .Ldec_key
>+.Lenc_key192:
>+	movq 0x10(UKEYP), %xmm2		# other user key
>+	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
>+	call _key_expansion_192a
>+	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
>+	call _key_expansion_192b
>+	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
>+	call _key_expansion_192a
>+	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
>+	call _key_expansion_192b
>+	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
>+	call _key_expansion_192a
>+	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
>+	call _key_expansion_192b
>+	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
>+	call _key_expansion_192a
>+	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
>+	call _key_expansion_192b
>+	jmp .Ldec_key
>+.Lenc_key128:
>+	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
>+	call _key_expansion_128
>+	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
>+	call _key_expansion_128
>+	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
>+	call _key_expansion_128
>+	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
>+	call _key_expansion_128
>+	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
>+	call _key_expansion_128
>+	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
>+	call _key_expansion_128
>+	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
>+	call _key_expansion_128
>+	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
>+	call _key_expansion_128
>+	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
>+	call _key_expansion_128
>+	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
>+	call _key_expansion_128
>+.Ldec_key:
>+	sub $0x10, TKEYP		# TKEYP = last encryption round key
>+	movaps (KEYP), %xmm0		# first encryption round key
>+	movaps (TKEYP), %xmm1		# last encryption round key
>+	movaps %xmm0, 240(TKEYP)	# first enc key becomes the last dec key
>+	movaps %xmm1, 240(KEYP)		# last enc key becomes the first dec key
>+	add $0x10, KEYP
>+	lea 240-16(TKEYP), UKEYP	# UKEYP walks the dec schedule backwards
>+.align 4
>+.Ldec_key_loop:
>+	movaps (KEYP), %xmm0
>+	AESIMC %xmm0 %xmm1		# inverse-mix-columns of each inner round key
>+	movaps %xmm1, (UKEYP)
>+	add $0x10, KEYP
>+	sub $0x10, UKEYP
>+	cmp TKEYP, KEYP
>+	jb .Ldec_key_loop		# stop once KEYP reaches the last enc round key
>+	xor AREG, AREG			# return value 0
>+#ifndef __x86_64__
>+	popl KEYP
>+#endif
>+	ret
>+ENDPROC(aesni_set_key)
>+
>+/* Encrypt one 16-byte block from src into dst using the encryption schedule in ctx.
>+ * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
>+ */
>+ENTRY(aesni_enc)
>+#ifndef __x86_64__
>+	pushl KEYP
>+	pushl KLEN
>+	movl 12(%esp), KEYP		# stack args sit above 2 saved registers + return address
>+	movl 16(%esp), OUTP
>+	movl 20(%esp), INP
>+#endif
>+	movl 480(KEYP), KLEN		# key length
>+	movups (INP), STATE		# input
>+	call _aesni_enc1
>+	movups STATE, (OUTP)		# output
>+#ifndef __x86_64__
>+	popl KLEN
>+	popl KEYP
>+#endif
>+	ret
>+ENDPROC(aesni_enc)
>+
>+/*
>+ * _aesni_enc1:		internal ABI, encrypts a single block held in STATE
>+ * input:
>+ *	KEYP:		key struct pointer
>+ *	KLEN:		key length in bytes (compared against 24 below)
>+ *	STATE:		initial state (input)
>+ * output:
>+ *	STATE:		final state (output)
>+ * changed:
>+ *	KEY
>+ *	TKEYP (T1)
>+ */
>+.align 4
>+_aesni_enc1:
>+	movaps (KEYP), KEY		# key
>+	mov KEYP, TKEYP
>+	pxor KEY, STATE		# round 0
>+	add $0x30, TKEYP
>+	cmp $24, KLEN
>+	jb .Lenc128			# KLEN below 24: skip the four extra rounds
>+	lea 0x20(TKEYP), TKEYP		# lea leaves the flags from cmp untouched
>+	je .Lenc192			# KLEN equal to 24: skip the two extra rounds
>+	add $0x20, TKEYP
>+	movaps -0x60(TKEYP), KEY
>+	AESENC KEY STATE
>+	movaps -0x50(TKEYP), KEY
>+	AESENC KEY STATE
>+.align 4
>+.Lenc192:
>+	movaps -0x40(TKEYP), KEY
>+	AESENC KEY STATE
>+	movaps -0x30(TKEYP), KEY
>+	AESENC KEY STATE
>+.align 4
>+.Lenc128:				# common tail: nine AESENC rounds plus AESENCLAST
>+	movaps -0x20(TKEYP), KEY
>+	AESENC KEY STATE
>+	movaps -0x10(TKEYP), KEY
>+	AESENC KEY STATE
>+	movaps (TKEYP), KEY
>+	AESENC KEY STATE
>+	movaps 0x10(TKEYP), KEY
>+	AESENC KEY STATE
>+	movaps 0x20(TKEYP), KEY
>+	AESENC KEY STATE
>+	movaps 0x30(TKEYP), KEY
>+	AESENC KEY STATE
>+	movaps 0x40(TKEYP), KEY
>+	AESENC KEY STATE
>+	movaps 0x50(TKEYP), KEY
>+	AESENC KEY STATE
>+	movaps 0x60(TKEYP), KEY
>+	AESENC KEY STATE
>+	movaps 0x70(TKEYP), KEY
>+	AESENCLAST KEY STATE		# last round
>+	ret
>+ENDPROC(_aesni_enc1)
>+
>+/*
>+ * _aesni_enc4:	internal ABI, encrypts four independent blocks with one key schedule
>+ * input:
>+ *	KEYP:		key struct pointer
>+ *	KLEN:		key length in bytes (compared against 24 below)
>+ *	STATE1:		initial state (input)
>+ *	STATE2
>+ *	STATE3
>+ *	STATE4
>+ * output:
>+ *	STATE1:		final state (output)
>+ *	STATE2
>+ *	STATE3
>+ *	STATE4
>+ * changed:
>+ *	KEY
>+ *	TKEYP (T1)
>+ */
>+.align 4
>+_aesni_enc4:
>+	movaps (KEYP), KEY		# key
>+	mov KEYP, TKEYP
>+	pxor KEY, STATE1		# round 0
>+	pxor KEY, STATE2
>+	pxor KEY, STATE3
>+	pxor KEY, STATE4
>+	add $0x30, TKEYP
>+	cmp $24, KLEN
>+	jb .L4enc128			# KLEN below 24: skip the four extra rounds
>+	lea 0x20(TKEYP), TKEYP		# lea leaves the flags from cmp untouched
>+	je .L4enc192			# KLEN equal to 24: skip the two extra rounds
>+	add $0x20, TKEYP
>+	movaps -0x60(TKEYP), KEY
>+	AESENC KEY STATE1
>+	AESENC KEY STATE2
>+	AESENC KEY STATE3
>+	AESENC KEY STATE4
>+	movaps -0x50(TKEYP), KEY
>+	AESENC KEY STATE1
>+	AESENC KEY STATE2
>+	AESENC KEY STATE3
>+	AESENC KEY STATE4
>+#.align 4
>+.L4enc192:
>+	movaps -0x40(TKEYP), KEY
>+	AESENC KEY STATE1
>+	AESENC KEY STATE2
>+	AESENC KEY STATE3
>+	AESENC KEY STATE4
>+	movaps -0x30(TKEYP), KEY
>+	AESENC KEY STATE1
>+	AESENC KEY STATE2
>+	AESENC KEY STATE3
>+	AESENC KEY STATE4
>+#.align 4
>+.L4enc128:				# common tail: nine AESENC rounds plus AESENCLAST
>+	movaps -0x20(TKEYP), KEY
>+	AESENC KEY STATE1
>+	AESENC KEY STATE2
>+	AESENC KEY STATE3
>+	AESENC KEY STATE4
>+	movaps -0x10(TKEYP), KEY
>+	AESENC KEY STATE1
>+	AESENC KEY STATE2
>+	AESENC KEY STATE3
>+	AESENC KEY STATE4
>+	movaps (TKEYP), KEY
>+	AESENC KEY STATE1
>+	AESENC KEY STATE2
>+	AESENC KEY STATE3
>+	AESENC KEY STATE4
>+	movaps 0x10(TKEYP), KEY
>+	AESENC KEY STATE1
>+	AESENC KEY STATE2
>+	AESENC KEY STATE3
>+	AESENC KEY STATE4
>+	movaps 0x20(TKEYP), KEY
>+	AESENC KEY STATE1
>+	AESENC KEY STATE2
>+	AESENC KEY STATE3
>+	AESENC KEY STATE4
>+	movaps 0x30(TKEYP), KEY
>+	AESENC KEY STATE1
>+	AESENC KEY STATE2
>+	AESENC KEY STATE3
>+	AESENC KEY STATE4
>+	movaps 0x40(TKEYP), KEY
>+	AESENC KEY STATE1
>+	AESENC KEY STATE2
>+	AESENC KEY STATE3
>+	AESENC KEY STATE4
>+	movaps 0x50(TKEYP), KEY
>+	AESENC KEY STATE1
>+	AESENC KEY STATE2
>+	AESENC KEY STATE3
>+	AESENC KEY STATE4
>+	movaps 0x60(TKEYP), KEY
>+	AESENC KEY STATE1
>+	AESENC KEY STATE2
>+	AESENC KEY STATE3
>+	AESENC KEY STATE4
>+	movaps 0x70(TKEYP), KEY
>+	AESENCLAST KEY STATE1		# last round
>+	AESENCLAST KEY STATE2
>+	AESENCLAST KEY STATE3
>+	AESENCLAST KEY STATE4
>+	ret
>+ENDPROC(_aesni_enc4)
>+
>+/* Decrypt one 16-byte block from src into dst using the decryption schedule at ctx+240.
>+ * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
>+ */
>+ENTRY(aesni_dec)
>+#ifndef __x86_64__
>+	pushl KEYP
>+	pushl KLEN
>+	movl 12(%esp), KEYP		# stack args sit above 2 saved registers + return address
>+	movl 16(%esp), OUTP
>+	movl 20(%esp), INP
>+#endif
>+	mov 480(KEYP), KLEN		# key length
>+	add $240, KEYP			# switch to the decryption round keys
>+	movups (INP), STATE		# input
>+	call _aesni_dec1
>+	movups STATE, (OUTP)		#output
>+#ifndef __x86_64__
>+	popl KLEN
>+	popl KEYP
>+#endif
>+	ret
>+ENDPROC(aesni_dec)
>+
>+/*
>+ * _aesni_dec1:		internal ABI, decrypts a single block held in STATE
>+ * input:
>+ *	KEYP:		key struct pointer (callers pass the decryption schedule)
>+ *	KLEN:		key length
>+ *	STATE:		initial state (input)
>+ * output:
>+ *	STATE:		final state (output)
>+ * changed:
>+ *	KEY
>+ *	TKEYP (T1)
>+ */
>+.align 4
>+_aesni_dec1:
>+	movaps (KEYP), KEY		# key
>+	mov KEYP, TKEYP
>+	pxor KEY, STATE		# round 0
>+	add $0x30, TKEYP
>+	cmp $24, KLEN
>+	jb .Ldec128			# KLEN below 24: skip the four extra rounds
>+	lea 0x20(TKEYP), TKEYP		# lea leaves the flags from cmp untouched
>+	je .Ldec192			# KLEN equal to 24: skip the two extra rounds
>+	add $0x20, TKEYP
>+	movaps -0x60(TKEYP), KEY
>+	AESDEC KEY STATE
>+	movaps -0x50(TKEYP), KEY
>+	AESDEC KEY STATE
>+.align 4
>+.Ldec192:
>+	movaps -0x40(TKEYP), KEY
>+	AESDEC KEY STATE
>+	movaps -0x30(TKEYP), KEY
>+	AESDEC KEY STATE
>+.align 4
>+.Ldec128:				# common tail: nine AESDEC rounds plus AESDECLAST
>+	movaps -0x20(TKEYP), KEY
>+	AESDEC KEY STATE
>+	movaps -0x10(TKEYP), KEY
>+	AESDEC KEY STATE
>+	movaps (TKEYP), KEY
>+	AESDEC KEY STATE
>+	movaps 0x10(TKEYP), KEY
>+	AESDEC KEY STATE
>+	movaps 0x20(TKEYP), KEY
>+	AESDEC KEY STATE
>+	movaps 0x30(TKEYP), KEY
>+	AESDEC KEY STATE
>+	movaps 0x40(TKEYP), KEY
>+	AESDEC KEY STATE
>+	movaps 0x50(TKEYP), KEY
>+	AESDEC KEY STATE
>+	movaps 0x60(TKEYP), KEY
>+	AESDEC KEY STATE
>+	movaps 0x70(TKEYP), KEY
>+	AESDECLAST KEY STATE		# last round
>+	ret
>+ENDPROC(_aesni_dec1)
>+
>+/*
>+ * _aesni_dec4:	internal ABI, decrypts four independent blocks with one key schedule
>+ * input:
>+ *	KEYP:		key struct pointer (callers pass the decryption schedule)
>+ *	KLEN:		key length
>+ *	STATE1:		initial state (input)
>+ *	STATE2
>+ *	STATE3
>+ *	STATE4
>+ * output:
>+ *	STATE1:		final state (output)
>+ *	STATE2
>+ *	STATE3
>+ *	STATE4
>+ * changed:
>+ *	KEY
>+ *	TKEYP (T1)
>+ */
>+.align 4
>+_aesni_dec4:
>+	movaps (KEYP), KEY		# key
>+	mov KEYP, TKEYP
>+	pxor KEY, STATE1		# round 0
>+	pxor KEY, STATE2
>+	pxor KEY, STATE3
>+	pxor KEY, STATE4
>+	add $0x30, TKEYP
>+	cmp $24, KLEN
>+	jb .L4dec128			# KLEN below 24: skip the four extra rounds
>+	lea 0x20(TKEYP), TKEYP		# lea leaves the flags from cmp untouched
>+	je .L4dec192			# KLEN equal to 24: skip the two extra rounds
>+	add $0x20, TKEYP
>+	movaps -0x60(TKEYP), KEY
>+	AESDEC KEY STATE1
>+	AESDEC KEY STATE2
>+	AESDEC KEY STATE3
>+	AESDEC KEY STATE4
>+	movaps -0x50(TKEYP), KEY
>+	AESDEC KEY STATE1
>+	AESDEC KEY STATE2
>+	AESDEC KEY STATE3
>+	AESDEC KEY STATE4
>+.align 4
>+.L4dec192:
>+	movaps -0x40(TKEYP), KEY
>+	AESDEC KEY STATE1
>+	AESDEC KEY STATE2
>+	AESDEC KEY STATE3
>+	AESDEC KEY STATE4
>+	movaps -0x30(TKEYP), KEY
>+	AESDEC KEY STATE1
>+	AESDEC KEY STATE2
>+	AESDEC KEY STATE3
>+	AESDEC KEY STATE4
>+.align 4
>+.L4dec128:				# common tail: nine AESDEC rounds plus AESDECLAST
>+	movaps -0x20(TKEYP), KEY
>+	AESDEC KEY STATE1
>+	AESDEC KEY STATE2
>+	AESDEC KEY STATE3
>+	AESDEC KEY STATE4
>+	movaps -0x10(TKEYP), KEY
>+	AESDEC KEY STATE1
>+	AESDEC KEY STATE2
>+	AESDEC KEY STATE3
>+	AESDEC KEY STATE4
>+	movaps (TKEYP), KEY
>+	AESDEC KEY STATE1
>+	AESDEC KEY STATE2
>+	AESDEC KEY STATE3
>+	AESDEC KEY STATE4
>+	movaps 0x10(TKEYP), KEY
>+	AESDEC KEY STATE1
>+	AESDEC KEY STATE2
>+	AESDEC KEY STATE3
>+	AESDEC KEY STATE4
>+	movaps 0x20(TKEYP), KEY
>+	AESDEC KEY STATE1
>+	AESDEC KEY STATE2
>+	AESDEC KEY STATE3
>+	AESDEC KEY STATE4
>+	movaps 0x30(TKEYP), KEY
>+	AESDEC KEY STATE1
>+	AESDEC KEY STATE2
>+	AESDEC KEY STATE3
>+	AESDEC KEY STATE4
>+	movaps 0x40(TKEYP), KEY
>+	AESDEC KEY STATE1
>+	AESDEC KEY STATE2
>+	AESDEC KEY STATE3
>+	AESDEC KEY STATE4
>+	movaps 0x50(TKEYP), KEY
>+	AESDEC KEY STATE1
>+	AESDEC KEY STATE2
>+	AESDEC KEY STATE3
>+	AESDEC KEY STATE4
>+	movaps 0x60(TKEYP), KEY
>+	AESDEC KEY STATE1
>+	AESDEC KEY STATE2
>+	AESDEC KEY STATE3
>+	AESDEC KEY STATE4
>+	movaps 0x70(TKEYP), KEY
>+	AESDECLAST KEY STATE1		# last round
>+	AESDECLAST KEY STATE2
>+	AESDECLAST KEY STATE3
>+	AESDECLAST KEY STATE4
>+	ret
>+ENDPROC(_aesni_dec4)
>+
>+/* ECB-encrypt whole 16-byte blocks from src into dst; a trailing partial block is ignored.
>+ * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
>+ *		      size_t len)
>+ */
>+ENTRY(aesni_ecb_enc)
>+#ifndef __x86_64__
>+	pushl LEN
>+	pushl KEYP
>+	pushl KLEN
>+	movl 16(%esp), KEYP		# stack args sit above 3 saved registers + return address
>+	movl 20(%esp), OUTP
>+	movl 24(%esp), INP
>+	movl 28(%esp), LEN
>+#endif
>+	test LEN, LEN		# check length
>+	jz .Lecb_enc_ret
>+	mov 480(KEYP), KLEN		# key length
>+	cmp $16, LEN
>+	jb .Lecb_enc_ret		# less than one block: nothing to do
>+	cmp $64, LEN
>+	jb .Lecb_enc_loop1		# fewer than four blocks: single-block loop only
>+.align 4
>+.Lecb_enc_loop4:
>+	movups (INP), STATE1
>+	movups 0x10(INP), STATE2
>+	movups 0x20(INP), STATE3
>+	movups 0x30(INP), STATE4
>+	call _aesni_enc4
>+	movups STATE1, (OUTP)
>+	movups STATE2, 0x10(OUTP)
>+	movups STATE3, 0x20(OUTP)
>+	movups STATE4, 0x30(OUTP)
>+	sub $64, LEN
>+	add $64, INP
>+	add $64, OUTP
>+	cmp $64, LEN
>+	jae .Lecb_enc_loop4		# len is unsigned (size_t): unsigned branch, matching the jb checks
>+	cmp $16, LEN
>+	jb .Lecb_enc_ret
>+.align 4
>+.Lecb_enc_loop1:
>+	movups (INP), STATE1
>+	call _aesni_enc1
>+	movups STATE1, (OUTP)
>+	sub $16, LEN
>+	add $16, INP
>+	add $16, OUTP
>+	cmp $16, LEN
>+	jae .Lecb_enc_loop1		# unsigned compare, see above
>+.Lecb_enc_ret:
>+#ifndef __x86_64__
>+	popl KLEN
>+	popl KEYP
>+	popl LEN
>+#endif
>+	ret
>+ENDPROC(aesni_ecb_enc)
>+
>+/* ECB-decrypt whole 16-byte blocks from src into dst; a trailing partial block is ignored.
>+ * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
>+ *		      size_t len);
>+ */
>+ENTRY(aesni_ecb_dec)
>+#ifndef __x86_64__
>+	pushl LEN
>+	pushl KEYP
>+	pushl KLEN
>+	movl 16(%esp), KEYP		# stack args sit above 3 saved registers + return address
>+	movl 20(%esp), OUTP
>+	movl 24(%esp), INP
>+	movl 28(%esp), LEN
>+#endif
>+	test LEN, LEN
>+	jz .Lecb_dec_ret
>+	mov 480(KEYP), KLEN		# key length
>+	add $240, KEYP			# switch to the decryption round keys
>+	cmp $16, LEN
>+	jb .Lecb_dec_ret		# less than one block: nothing to do
>+	cmp $64, LEN
>+	jb .Lecb_dec_loop1		# fewer than four blocks: single-block loop only
>+.align 4
>+.Lecb_dec_loop4:
>+	movups (INP), STATE1
>+	movups 0x10(INP), STATE2
>+	movups 0x20(INP), STATE3
>+	movups 0x30(INP), STATE4
>+	call _aesni_dec4
>+	movups STATE1, (OUTP)
>+	movups STATE2, 0x10(OUTP)
>+	movups STATE3, 0x20(OUTP)
>+	movups STATE4, 0x30(OUTP)
>+	sub $64, LEN
>+	add $64, INP
>+	add $64, OUTP
>+	cmp $64, LEN
>+	jae .Lecb_dec_loop4		# len is unsigned (size_t): unsigned branch, matching the jb checks
>+	cmp $16, LEN
>+	jb .Lecb_dec_ret
>+.align 4
>+.Lecb_dec_loop1:
>+	movups (INP), STATE1
>+	call _aesni_dec1
>+	movups STATE1, (OUTP)
>+	sub $16, LEN
>+	add $16, INP
>+	add $16, OUTP
>+	cmp $16, LEN
>+	jae .Lecb_dec_loop1		# unsigned compare, see above
>+.Lecb_dec_ret:
>+#ifndef __x86_64__
>+	popl KLEN
>+	popl KEYP
>+	popl LEN
>+#endif
>+	ret
>+ENDPROC(aesni_ecb_dec)
>+
>+/* CBC-encrypt whole 16-byte blocks from src into dst; the last ciphertext block is written back to iv.
>+ * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
>+ *		      size_t len, u8 *iv)
>+ */
>+ENTRY(aesni_cbc_enc)
>+#ifndef __x86_64__
>+	pushl IVP
>+	pushl LEN
>+	pushl KEYP
>+	pushl KLEN
>+	movl 20(%esp), KEYP		# stack args sit above 4 saved registers + return address
>+	movl 24(%esp), OUTP
>+	movl 28(%esp), INP
>+	movl 32(%esp), LEN
>+	movl 36(%esp), IVP
>+#endif
>+	cmp $16, LEN
>+	jb .Lcbc_enc_ret		# less than one block: iv is left untouched
>+	mov 480(KEYP), KLEN		# key length
>+	movups (IVP), STATE	# load iv as initial state
>+.align 4
>+.Lcbc_enc_loop:
>+	movups (INP), IN	# load input
>+	pxor IN, STATE		# chain: plaintext XOR previous ciphertext (or iv)
>+	call _aesni_enc1
>+	movups STATE, (OUTP)	# store output
>+	sub $16, LEN
>+	add $16, INP
>+	add $16, OUTP
>+	cmp $16, LEN
>+	jae .Lcbc_enc_loop	# len is unsigned (size_t): unsigned branch, matching the jb check
>+	movups STATE, (IVP)	# hand the chaining value back to the caller
>+.Lcbc_enc_ret:
>+#ifndef __x86_64__
>+	popl KLEN
>+	popl KEYP
>+	popl LEN
>+	popl IVP
>+#endif
>+	ret
>+ENDPROC(aesni_cbc_enc)
>+
>+/* CBC-decrypt whole 16-byte blocks from src into dst; the last ciphertext block is written back to iv.
>+ * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
>+ *		      size_t len, u8 *iv)
>+ */
>+ENTRY(aesni_cbc_dec)
>+#ifndef __x86_64__
>+	pushl IVP
>+	pushl LEN
>+	pushl KEYP
>+	pushl KLEN
>+	movl 20(%esp), KEYP		# stack args sit above 4 saved registers + return address
>+	movl 24(%esp), OUTP
>+	movl 28(%esp), INP
>+	movl 32(%esp), LEN
>+	movl 36(%esp), IVP
>+#endif
>+	cmp $16, LEN
>+	jb .Lcbc_dec_just_ret		# less than one block: iv is left untouched
>+	mov 480(KEYP), KLEN		# key length
>+	add $240, KEYP			# switch to the decryption round keys
>+	movups (IVP), IV
>+	cmp $64, LEN
>+	jb .Lcbc_dec_loop1		# fewer than four blocks: single-block loop only
>+.align 4
>+.Lcbc_dec_loop4:
>+	movups (INP), IN1
>+	movaps IN1, STATE1
>+	movups 0x10(INP), IN2
>+	movaps IN2, STATE2
>+#ifdef __x86_64__
>+	movups 0x20(INP), IN3
>+	movaps IN3, STATE3
>+	movups 0x30(INP), IN4
>+	movaps IN4, STATE4
>+#else
>+	movups 0x20(INP), IN1		# 32-bit path reuses IN1/IN2; blocks 0 and 1 are reloaded below
>+	movaps IN1, STATE3
>+	movups 0x30(INP), IN2
>+	movaps IN2, STATE4
>+#endif
>+	call _aesni_dec4
>+	pxor IV, STATE1			# each plaintext = decrypted block XOR previous ciphertext
>+#ifdef __x86_64__
>+	pxor IN1, STATE2
>+	pxor IN2, STATE3
>+	pxor IN3, STATE4
>+	movaps IN4, IV			# last ciphertext block chains into the next iteration
>+#else
>+	pxor IN1, STATE4		# IN1 holds ciphertext block 2 here
>+	movaps IN2, IV			# IN2 holds ciphertext block 3 here
>+	movups (INP), IN1
>+	pxor IN1, STATE2
>+	movups 0x10(INP), IN2
>+	pxor IN2, STATE3
>+#endif
>+	movups STATE1, (OUTP)
>+	movups STATE2, 0x10(OUTP)
>+	movups STATE3, 0x20(OUTP)
>+	movups STATE4, 0x30(OUTP)
>+	sub $64, LEN
>+	add $64, INP
>+	add $64, OUTP
>+	cmp $64, LEN
>+	jae .Lcbc_dec_loop4		# len is unsigned (size_t): unsigned branch, matching the jb checks
>+	cmp $16, LEN
>+	jb .Lcbc_dec_ret
>+.align 4
>+.Lcbc_dec_loop1:
>+	movups (INP), IN
>+	movaps IN, STATE
>+	call _aesni_dec1
>+	pxor IV, STATE
>+	movups STATE, (OUTP)
>+	movaps IN, IV			# this ciphertext block chains into the next one
>+	sub $16, LEN
>+	add $16, INP
>+	add $16, OUTP
>+	cmp $16, LEN
>+	jae .Lcbc_dec_loop1		# unsigned compare, see above
>+.Lcbc_dec_ret:
>+	movups IV, (IVP)		# hand the chaining value back to the caller
>+.Lcbc_dec_just_ret:
>+#ifndef __x86_64__
>+	popl KLEN
>+	popl KEYP
>+	popl LEN
>+	popl IVP
>+#endif
>+	ret
>+ENDPROC(aesni_cbc_dec)
>+
>+#ifdef __x86_64__
>+.align 16
>+.Lbswap_mask:
>+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
>+
>+/*
>+ * _aesni_inc_init:	internal ABI
>+ *	setup registers used by _aesni_inc
>+ * input:
>+ *	IV
>+ * output:
>+ *	CTR:	== IV, in little endian
>+ *	TCTR_LOW: == lower qword of CTR
>+ *	INC:	== 1, in little endian
>+ *	BSWAP_MASK == endian swapping mask
>+ */
>+.align 4
>+_aesni_inc_init:
>+	//movaps .Lbswap_mask, BSWAP_MASK
>+	movaps .Lbswap_mask(%rip), BSWAP_MASK	# RIP-relative load of the byte-reversal mask
>+	movaps IV, CTR
>+	PSHUFB_XMM BSWAP_MASK CTR		# byte-reverse the IV copy into CTR
>+	mov $1, TCTR_LOW
>+	MOVQ_R64_XMM TCTR_LOW INC		# INC = 1 in the low qword
>+	MOVQ_R64_XMM CTR TCTR_LOW		# TCTR_LOW = low qword of CTR
>+	ret
>+ENDPROC(_aesni_inc_init)
>+
>+/*
>+ * _aesni_inc:		internal ABI
>+ *	Increase IV by 1, IV is in big endian
>+ * input:
>+ *	IV
>+ *	CTR:	== IV, in little endian
>+ *	TCTR_LOW: == lower qword of CTR
>+ *	INC:	== 1, in little endian
>+ *	BSWAP_MASK == endian swapping mask
>+ * output:
>+ *	IV:	Increase by 1
>+ * changed:
>+ *	CTR:	== output IV, in little endian
>+ *	TCTR_LOW: == lower qword of CTR
>+ */
>+.align 4
>+_aesni_inc:
>+	paddq INC, CTR			# add 1 to the low qword of CTR
>+	add $1, TCTR_LOW		# mirror the add in a GPR to obtain the carry flag
>+	jnc .Linc_low
>+	pslldq $8, INC			# low qword wrapped: move the 1 into the high qword
>+	paddq INC, CTR			# propagate the carry
>+	psrldq $8, INC			# restore INC to 1 in the low qword
>+.Linc_low:
>+	movaps CTR, IV
>+	PSHUFB_XMM BSWAP_MASK IV	# byte-reverse CTR back into IV
>+	ret
>+ENDPROC(_aesni_inc)
>+
>+/* CTR mode over whole 16-byte blocks: dst = src XOR E(counter); the next counter is written back to iv.
>+ * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
>+ *		      size_t len, u8 *iv)
>+ */
>+ENTRY(aesni_ctr_enc)
>+	cmp $16, LEN
>+	jb .Lctr_enc_just_ret		# less than one block: iv is left untouched
>+	mov 480(KEYP), KLEN		# key length
>+	movups (IVP), IV
>+	call _aesni_inc_init
>+	cmp $64, LEN
>+	jb .Lctr_enc_loop1		# fewer than four blocks: single-block loop only
>+.align 4
>+.Lctr_enc_loop4:
>+	movaps IV, STATE1		# four consecutive counter values become the cipher inputs
>+	call _aesni_inc
>+	movups (INP), IN1
>+	movaps IV, STATE2
>+	call _aesni_inc
>+	movups 0x10(INP), IN2
>+	movaps IV, STATE3
>+	call _aesni_inc
>+	movups 0x20(INP), IN3
>+	movaps IV, STATE4
>+	call _aesni_inc
>+	movups 0x30(INP), IN4
>+	call _aesni_enc4
>+	pxor IN1, STATE1		# keystream XOR input
>+	movups STATE1, (OUTP)
>+	pxor IN2, STATE2
>+	movups STATE2, 0x10(OUTP)
>+	pxor IN3, STATE3
>+	movups STATE3, 0x20(OUTP)
>+	pxor IN4, STATE4
>+	movups STATE4, 0x30(OUTP)
>+	sub $64, LEN
>+	add $64, INP
>+	add $64, OUTP
>+	cmp $64, LEN
>+	jae .Lctr_enc_loop4		# len is unsigned (size_t): unsigned branch, matching the jb checks
>+	cmp $16, LEN
>+	jb .Lctr_enc_ret
>+.align 4
>+.Lctr_enc_loop1:
>+	movaps IV, STATE
>+	call _aesni_inc
>+	movups (INP), IN
>+	call _aesni_enc1
>+	pxor IN, STATE
>+	movups STATE, (OUTP)
>+	sub $16, LEN
>+	add $16, INP
>+	add $16, OUTP
>+	cmp $16, LEN
>+	jae .Lctr_enc_loop1		# unsigned compare, see above
>+.Lctr_enc_ret:
>+	movups IV, (IVP)		# hand the next counter value back to the caller
>+.Lctr_enc_just_ret:
>+	ret
>+ENDPROC(aesni_ctr_enc)
>+
>+/*
>+ * _aesni_gf128mul_x_ble:		internal ABI
>+ *	Multiply IV by x in GF(2^128) for XTS IVs: both 64-bit halves are doubled
>+ *	and the bits carried out of each half are folded back in via the mask.
>+ * input:
>+ *	IV:	current IV
>+ *	GF128MUL_MASK == mask with 0x87 and 0x01
>+ * output:
>+ *	IV:	next IV
>+ * changed:
>+ *	CTR:	== temporary value (sign-extended carry bits, then masked)
>+ */
>+#define _aesni_gf128mul_x_ble() \
>+	pshufd $0x13, IV, CTR; \
>+	paddq IV, IV; \
>+	psrad $31, CTR; \
>+	pand GF128MUL_MASK, CTR; \
>+	pxor CTR, IV;
>+
>+/* XTS-process eight consecutive 16-byte blocks (two batches of four); the tweak after the eighth block is written back to iv.
>+ * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
>+ *			 bool enc, u8 *iv)
>+ */
>+ENTRY(aesni_xts_crypt8)
>+	cmpb $0, %cl			# flags: ZF set when enc == 0 (decrypt)
>+	movl $0, %ecx			# mov does not disturb the flags from cmpb
>+	movl $240, %r10d		# offset of the decryption schedule in ctx
>+	//leaq _aesni_enc4, %r11
>+	//leaq _aesni_dec4, %rax
>+	leaq _aesni_enc4(%rip), %r11
>+	leaq _aesni_dec4(%rip), %rax
>+	cmovel %r10d, %ecx		# decrypt: key offset 240
>+	cmoveq %rax, %r11		# decrypt: call _aesni_dec4 instead of _aesni_enc4
>+
>+	//movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
>+	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
>+	movups (IVP), IV
>+
>+	mov 480(KEYP), KLEN		# key length
>+	addq %rcx, KEYP			# select the encryption or decryption schedule
>+
>+	movdqa IV, STATE1
>+	movdqu 0x00(INP), INC
>+	pxor INC, STATE1		# pre-whiten: input XOR tweak
>+	movdqu IV, 0x00(OUTP)		# park the tweak in the output slot until after the cipher
>+
>+	_aesni_gf128mul_x_ble()
>+	movdqa IV, STATE2
>+	movdqu 0x10(INP), INC
>+	pxor INC, STATE2
>+	movdqu IV, 0x10(OUTP)
>+
>+	_aesni_gf128mul_x_ble()
>+	movdqa IV, STATE3
>+	movdqu 0x20(INP), INC
>+	pxor INC, STATE3
>+	movdqu IV, 0x20(OUTP)
>+
>+	_aesni_gf128mul_x_ble()
>+	movdqa IV, STATE4
>+	movdqu 0x30(INP), INC
>+	pxor INC, STATE4
>+	movdqu IV, 0x30(OUTP)
>+
>+	call *%r11			# cipher blocks 0-3
>+
>+	movdqu 0x00(OUTP), INC		# fetch the parked tweak
>+	pxor INC, STATE1		# post-whiten: cipher output XOR tweak
>+	movdqu STATE1, 0x00(OUTP)
>+
>+	_aesni_gf128mul_x_ble()
>+	movdqa IV, STATE1		# start preparing block 4 while finishing block 0-3 stores
>+	movdqu 0x40(INP), INC
>+	pxor INC, STATE1
>+	movdqu IV, 0x40(OUTP)
>+
>+	movdqu 0x10(OUTP), INC
>+	pxor INC, STATE2
>+	movdqu STATE2, 0x10(OUTP)
>+
>+	_aesni_gf128mul_x_ble()
>+	movdqa IV, STATE2
>+	movdqu 0x50(INP), INC
>+	pxor INC, STATE2
>+	movdqu IV, 0x50(OUTP)
>+
>+	movdqu 0x20(OUTP), INC
>+	pxor INC, STATE3
>+	movdqu STATE3, 0x20(OUTP)
>+
>+	_aesni_gf128mul_x_ble()
>+	movdqa IV, STATE3
>+	movdqu 0x60(INP), INC
>+	pxor INC, STATE3
>+	movdqu IV, 0x60(OUTP)
>+
>+	movdqu 0x30(OUTP), INC
>+	pxor INC, STATE4
>+	movdqu STATE4, 0x30(OUTP)
>+
>+	_aesni_gf128mul_x_ble()
>+	movdqa IV, STATE4
>+	movdqu 0x70(INP), INC
>+	pxor INC, STATE4
>+	movdqu IV, 0x70(OUTP)
>+
>+	_aesni_gf128mul_x_ble()
>+	movups IV, (IVP)		# tweak for the block after these eight
>+
>+	call *%r11			# cipher blocks 4-7
>+
>+	movdqu 0x40(OUTP), INC
>+	pxor INC, STATE1
>+	movdqu STATE1, 0x40(OUTP)
>+
>+	movdqu 0x50(OUTP), INC
>+	pxor INC, STATE2
>+	movdqu STATE2, 0x50(OUTP)
>+
>+	movdqu 0x60(OUTP), INC
>+	pxor INC, STATE3
>+	movdqu STATE3, 0x60(OUTP)
>+
>+	movdqu 0x70(OUTP), INC
>+	pxor INC, STATE4
>+	movdqu STATE4, 0x70(OUTP)
>+
>+	ret
>+ENDPROC(aesni_xts_crypt8)
>+
>+#endif
>diff --git a/third_party/aesni-intel/inst-intel.h b/third_party/aesni-intel/inst-intel.h
>new file mode 100644
>index 00000000000..95978be17e2
>--- /dev/null
>+++ b/third_party/aesni-intel/inst-intel.h
>@@ -0,0 +1,306 @@
>+/*
>+ * Generate .byte code for some instructions not supported by old
>+ * binutils.
>+ */
>+#ifndef X86_ASM_INST_H
>+#define X86_ASM_INST_H
>+
>+#define REG_NUM_INVALID		100
>+
>+#define REG_TYPE_R32		0
>+#define REG_TYPE_R64		1
>+#define REG_TYPE_XMM		2
>+#define REG_TYPE_INVALID	100
>+
>+	.macro R32_NUM opd r32	/* set symbol opd to the encoding number of 32-bit register r32 */
>+	\opd = REG_NUM_INVALID	/* default when r32 matches none of the names below */
>+	.ifc \r32,%eax
>+	\opd = 0
>+	.endif
>+	.ifc \r32,%ecx
>+	\opd = 1
>+	.endif
>+	.ifc \r32,%edx
>+	\opd = 2
>+	.endif
>+	.ifc \r32,%ebx
>+	\opd = 3
>+	.endif
>+	.ifc \r32,%esp
>+	\opd = 4
>+	.endif
>+	.ifc \r32,%ebp
>+	\opd = 5
>+	.endif
>+	.ifc \r32,%esi
>+	\opd = 6
>+	.endif
>+	.ifc \r32,%edi
>+	\opd = 7
>+	.endif
>+#if defined(CONFIG_X86_64) || defined(__x86_64__) /* also accept the compiler macro that the accompanying assembly tests */
>+	.ifc \r32,%r8d
>+	\opd = 8
>+	.endif
>+	.ifc \r32,%r9d
>+	\opd = 9
>+	.endif
>+	.ifc \r32,%r10d
>+	\opd = 10
>+	.endif
>+	.ifc \r32,%r11d
>+	\opd = 11
>+	.endif
>+	.ifc \r32,%r12d
>+	\opd = 12
>+	.endif
>+	.ifc \r32,%r13d
>+	\opd = 13
>+	.endif
>+	.ifc \r32,%r14d
>+	\opd = 14
>+	.endif
>+	.ifc \r32,%r15d
>+	\opd = 15
>+	.endif
>+#endif
>+	.endm
>+
>+	.macro R64_NUM opd r64	/* set symbol opd to the encoding number of 64-bit register r64 */
>+	\opd = REG_NUM_INVALID	/* default when r64 matches none of the names below */
>+#if defined(CONFIG_X86_64) || defined(__x86_64__) /* also accept the compiler macro that the accompanying assembly tests */
>+	.ifc \r64,%rax
>+	\opd = 0
>+	.endif
>+	.ifc \r64,%rcx
>+	\opd = 1
>+	.endif
>+	.ifc \r64,%rdx
>+	\opd = 2
>+	.endif
>+	.ifc \r64,%rbx
>+	\opd = 3
>+	.endif
>+	.ifc \r64,%rsp
>+	\opd = 4
>+	.endif
>+	.ifc \r64,%rbp
>+	\opd = 5
>+	.endif
>+	.ifc \r64,%rsi
>+	\opd = 6
>+	.endif
>+	.ifc \r64,%rdi
>+	\opd = 7
>+	.endif
>+	.ifc \r64,%r8
>+	\opd = 8
>+	.endif
>+	.ifc \r64,%r9
>+	\opd = 9
>+	.endif
>+	.ifc \r64,%r10
>+	\opd = 10
>+	.endif
>+	.ifc \r64,%r11
>+	\opd = 11
>+	.endif
>+	.ifc \r64,%r12
>+	\opd = 12
>+	.endif
>+	.ifc \r64,%r13
>+	\opd = 13
>+	.endif
>+	.ifc \r64,%r14
>+	\opd = 14
>+	.endif
>+	.ifc \r64,%r15
>+	\opd = 15
>+	.endif
>+#endif
>+	.endm
>+
>+	.macro XMM_NUM opd xmm	/* set symbol opd to the number (0-15) of XMM register xmm */
>+	\opd = REG_NUM_INVALID	/* default when xmm matches none of the names below */
>+	.ifc \xmm,%xmm0
>+	\opd = 0
>+	.endif
>+	.ifc \xmm,%xmm1
>+	\opd = 1
>+	.endif
>+	.ifc \xmm,%xmm2
>+	\opd = 2
>+	.endif
>+	.ifc \xmm,%xmm3
>+	\opd = 3
>+	.endif
>+	.ifc \xmm,%xmm4
>+	\opd = 4
>+	.endif
>+	.ifc \xmm,%xmm5
>+	\opd = 5
>+	.endif
>+	.ifc \xmm,%xmm6
>+	\opd = 6
>+	.endif
>+	.ifc \xmm,%xmm7
>+	\opd = 7
>+	.endif
>+	.ifc \xmm,%xmm8
>+	\opd = 8
>+	.endif
>+	.ifc \xmm,%xmm9
>+	\opd = 9
>+	.endif
>+	.ifc \xmm,%xmm10
>+	\opd = 10
>+	.endif
>+	.ifc \xmm,%xmm11
>+	\opd = 11
>+	.endif
>+	.ifc \xmm,%xmm12
>+	\opd = 12
>+	.endif
>+	.ifc \xmm,%xmm13
>+	\opd = 13
>+	.endif
>+	.ifc \xmm,%xmm14
>+	\opd = 14
>+	.endif
>+	.ifc \xmm,%xmm15
>+	\opd = 15
>+	.endif
>+	.endm
>+
>+	.macro REG_TYPE type reg	/* set symbol \type to the REG_TYPE_* class of the register named \reg */
>+	R32_NUM reg_type_r32 \reg
>+	R64_NUM reg_type_r64 \reg
>+	XMM_NUM reg_type_xmm \reg
>+	.if reg_type_r64 <> REG_NUM_INVALID	/* all three lookups were run above; the one that recognised the name decides */
>+	\type = REG_TYPE_R64
>+	.elseif reg_type_r32 <> REG_NUM_INVALID
>+	\type = REG_TYPE_R32
>+	.elseif reg_type_xmm <> REG_NUM_INVALID
>+	\type = REG_TYPE_XMM
>+	.else	/* no lookup recognised \reg */
>+	\type = REG_TYPE_INVALID
>+	.endif
>+	.endm
>+
>+	.macro PFX_OPD_SIZE	/* emit the 0x66 prefix byte that every instruction macro in this file starts with */
>+	.byte 0x66
>+	.endm
>+
>+	.macro PFX_REX opd1 opd2 W=0
>+	.if ((\opd1 | \opd2) & 8) || \W	/* a REX prefix is emitted only if a register number has bit 3 set or \W is non-zero */
>+	.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)	/* \opd1 bit 3 -> bit 0 (REX.B), \opd2 bit 3 -> bit 2 (REX.R), \W -> bit 3 (REX.W) */
>+	.endif
>+	.endm
>+
>+	.macro MODRM mod opd1 opd2	/* emit one ModRM byte; every caller in this file passes \mod = 0xc0 (register-direct) */
>+	.byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)	/* \opd1 low 3 bits -> bits 0-2 (r/m), \opd2 low 3 bits -> bits 3-5 (reg) */
>+	.endm
>+
>+	.macro PSHUFB_XMM xmm1 xmm2	/* hand-encode the register form of pshufb as raw bytes; presumably for assemblers lacking the mnemonic - confirm */
>+	XMM_NUM pshufb_opd1 \xmm1
>+	XMM_NUM pshufb_opd2 \xmm2
>+	PFX_OPD_SIZE
>+	PFX_REX pshufb_opd1 pshufb_opd2
>+	.byte 0x0f, 0x38, 0x00	/* with the 0x66 prefix: PSHUFB opcode */
>+	MODRM 0xc0 pshufb_opd1 pshufb_opd2	/* \xmm1 in r/m, \xmm2 in reg */
>+	.endm
>+
>+	.macro PCLMULQDQ imm8 xmm1 xmm2	/* hand-encode the register form of pclmulqdq as raw bytes */
>+	XMM_NUM clmul_opd1 \xmm1
>+	XMM_NUM clmul_opd2 \xmm2
>+	PFX_OPD_SIZE
>+	PFX_REX clmul_opd1 clmul_opd2
>+	.byte 0x0f, 0x3a, 0x44	/* with the 0x66 prefix: PCLMULQDQ opcode */
>+	MODRM 0xc0 clmul_opd1 clmul_opd2	/* \xmm1 in r/m, \xmm2 in reg */
>+	.byte \imm8	/* trailing immediate byte */
>+	.endm
>+
>+	.macro PEXTRD imm8 xmm gpr	/* hand-encode pextrd from an XMM register into a 32-bit GPR as raw bytes */
>+	R32_NUM extrd_opd1 \gpr
>+	XMM_NUM extrd_opd2 \xmm
>+	PFX_OPD_SIZE
>+	PFX_REX extrd_opd1 extrd_opd2
>+	.byte 0x0f, 0x3a, 0x16	/* with the 0x66 prefix: PEXTRD opcode */
>+	MODRM 0xc0 extrd_opd1 extrd_opd2	/* \gpr in r/m, \xmm in reg */
>+	.byte \imm8	/* trailing immediate byte */
>+	.endm
>+
>+	.macro AESKEYGENASSIST rcon xmm1 xmm2	/* hand-encode the register form of aeskeygenassist as raw bytes */
>+	XMM_NUM aeskeygen_opd1 \xmm1
>+	XMM_NUM aeskeygen_opd2 \xmm2
>+	PFX_OPD_SIZE
>+	PFX_REX aeskeygen_opd1 aeskeygen_opd2
>+	.byte 0x0f, 0x3a, 0xdf	/* with the 0x66 prefix: AESKEYGENASSIST opcode */
>+	MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2	/* \xmm1 in r/m, \xmm2 in reg */
>+	.byte \rcon	/* \rcon is emitted as the trailing immediate byte */
>+	.endm
>+
>+	.macro AESIMC xmm1 xmm2	/* hand-encode the register form of aesimc as raw bytes */
>+	XMM_NUM aesimc_opd1 \xmm1
>+	XMM_NUM aesimc_opd2 \xmm2
>+	PFX_OPD_SIZE
>+	PFX_REX aesimc_opd1 aesimc_opd2
>+	.byte 0x0f, 0x38, 0xdb	/* with the 0x66 prefix: AESIMC opcode */
>+	MODRM 0xc0 aesimc_opd1 aesimc_opd2	/* \xmm1 in r/m, \xmm2 in reg */
>+	.endm
>+
>+	.macro AESENC xmm1 xmm2	/* hand-encode the register form of aesenc as raw bytes */
>+	XMM_NUM aesenc_opd1 \xmm1
>+	XMM_NUM aesenc_opd2 \xmm2
>+	PFX_OPD_SIZE
>+	PFX_REX aesenc_opd1 aesenc_opd2
>+	.byte 0x0f, 0x38, 0xdc	/* with the 0x66 prefix: AESENC opcode */
>+	MODRM 0xc0 aesenc_opd1 aesenc_opd2	/* \xmm1 in r/m, \xmm2 in reg */
>+	.endm
>+
>+	.macro AESENCLAST xmm1 xmm2	/* hand-encode the register form of aesenclast as raw bytes */
>+	XMM_NUM aesenclast_opd1 \xmm1
>+	XMM_NUM aesenclast_opd2 \xmm2
>+	PFX_OPD_SIZE
>+	PFX_REX aesenclast_opd1 aesenclast_opd2
>+	.byte 0x0f, 0x38, 0xdd	/* with the 0x66 prefix: AESENCLAST opcode */
>+	MODRM 0xc0 aesenclast_opd1 aesenclast_opd2	/* \xmm1 in r/m, \xmm2 in reg */
>+	.endm
>+
>+	.macro AESDEC xmm1 xmm2	/* hand-encode the register form of aesdec as raw bytes */
>+	XMM_NUM aesdec_opd1 \xmm1
>+	XMM_NUM aesdec_opd2 \xmm2
>+	PFX_OPD_SIZE
>+	PFX_REX aesdec_opd1 aesdec_opd2
>+	.byte 0x0f, 0x38, 0xde	/* with the 0x66 prefix: AESDEC opcode */
>+	MODRM 0xc0 aesdec_opd1 aesdec_opd2	/* \xmm1 in r/m, \xmm2 in reg */
>+	.endm
>+
>+	.macro AESDECLAST xmm1 xmm2	/* hand-encode the register form of aesdeclast as raw bytes */
>+	XMM_NUM aesdeclast_opd1 \xmm1
>+	XMM_NUM aesdeclast_opd2 \xmm2
>+	PFX_OPD_SIZE
>+	PFX_REX aesdeclast_opd1 aesdeclast_opd2
>+	.byte 0x0f, 0x38, 0xdf	/* with the 0x66 prefix: AESDECLAST opcode */
>+	MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2	/* \xmm1 in r/m, \xmm2 in reg */
>+	.endm
>+
>+	.macro MOVQ_R64_XMM opd1 opd2	/* hand-encode 64-bit movq \opd1, \opd2 between a 64-bit GPR and an XMM register, in either direction */
>+	REG_TYPE movq_r64_xmm_opd1_type \opd1
>+	.if movq_r64_xmm_opd1_type == REG_TYPE_XMM	/* xmm -> r64: both opcodes take the GPR in ModRM.r/m and the XMM register in ModRM.reg */
>+	R64_NUM movq_r64_xmm_opd1 \opd2
>+	XMM_NUM movq_r64_xmm_opd2 \opd1
>+	.else	/* r64 -> xmm */
>+	R64_NUM movq_r64_xmm_opd1 \opd1
>+	XMM_NUM movq_r64_xmm_opd2 \opd2
>+	.endif
>+	PFX_OPD_SIZE
>+	PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1	/* W=1 selects the 64-bit form; GPR bit 3 -> REX.B, XMM bit 3 -> REX.R */
>+	.if movq_r64_xmm_opd1_type == REG_TYPE_XMM
>+	.byte 0x0f, 0x7e	/* 66 REX.W 0F 7E /r: movq r/m64, xmm */
>+	.else
>+	.byte 0x0f, 0x6e	/* 66 REX.W 0F 6E /r: movq xmm, r/m64 */
>+	.endif
>+	MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2	/* symbol ..._opd1 is always the GPR (r/m), ..._opd2 always the XMM register (reg) */
>+	.endm
>+#endif
>diff --git a/third_party/aesni-intel/wscript b/third_party/aesni-intel/wscript
>new file mode 100644
>index 00000000000..58afe0bd5bf
>--- /dev/null
>+++ b/third_party/aesni-intel/wscript
>@@ -0,0 +1,19 @@
>+#!/usr/bin/env python
>+
>+def configure(conf):
>+    # Enable the accelerated AES code only when the compiler accepts the
>+    # preprocess-as-assembler flags and the target machine is x86_64.
>+    if conf.CHECK_CFLAGS('-Wp,-E,-lang-asm', '') and conf.env['SYSTEM_UNAME_MACHINE'] == 'x86_64':
>+        conf.DEFINE('HAVE_AESNI_INTEL', 1)
>+        conf.DEFINE('HAVE_AES_ACC', 1)
>+
>+def build(bld):
>+    # Skip the library entirely when configure() did not enable it, so
>+    # the assembly source is never built on unsupported targets.
>+    if not bld.CONFIG_SET('HAVE_AESNI_INTEL'):
>+        return
>+
>+    bld.SAMBA_LIBRARY('aesni-intel',
>+                      source='aesni-intel_asm.c',
>+                      cflags='-Wp,-E,-lang-asm',
>+                      private_library=True)
>diff --git a/third_party/wscript b/third_party/wscript
>index d5b9df7bb96..49e57b7838d 100644
>--- a/third_party/wscript
>+++ b/third_party/wscript
>@@ -49,6 +49,7 @@ def configure(conf):
>         find_third_party_module(conf, module, package)
>     conf.RECURSE('popt')
>     conf.RECURSE('zlib')
>+    conf.RECURSE('aesni-intel')
> 
> 
> def build(bld):
>@@ -71,3 +72,4 @@ def build(bld):
>     bld.INSTALL_FILES('${PYTHONARCHDIR}/samba/third_party', 'empty_file', destname='__init__.py')
>     bld.RECURSE('zlib')
>     bld.RECURSE('popt')
>+    bld.RECURSE('aesni-intel')
>-- 
>2.11.0
>

Actions: View

Attachments on bug 13008: 13524 | 13525 | 13526 | 13527 | 13528 | 13529 | 13549 | 13550 | 13552 | 13557