Commit 6f6a6130 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

aes/asm/bsaes-*.pl: improve decrypt performance.

Improve decrypt performance by 10-20% depending on platform. Thanks
to Jussi Kivilinna for providing valuable hint. Also thanks to Ard
Biesheuvel.
parent 99fb2212
Loading
Loading
Loading
Loading
+64 −5
Original line number Diff line number Diff line
@@ -23,14 +23,14 @@
# to collect performance results, which for Cortex-A8 core are:
#
# encrypt	19.5 cycles per byte processed with 128-bit key
# decrypt	24.0 cycles per byte processed with 128-bit key
# decrypt	22.1 cycles per byte processed with 128-bit key
# key conv.	440  cycles per 128-bit key/0.18 of 8x block
#
# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 22.6,
# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
# which is [much] worse than anticipated (for further details see
# http://www.openssl.org/~appro/Snapdragon-S4.html).
#
# Cortex-A15 manages in 14.2/19.6 cycles [when integer-only code
# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
# manages in 20.0 cycles].
#
# When comparing to x86_64 results keep in mind that NEON unit is
@@ -377,6 +377,7 @@ sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16];	# optional
$code.=<<___;
	vext.8	@t[0], @x[0], @x[0], #12	@ x0 <<< 32
	vext.8	@t[1], @x[1], @x[1], #12
@@ -417,8 +418,9 @@ $code.=<<___;
	veor	@t[3], @t[3], @x[7]
	 vext.8	@x[6], @x[2], @x[2], #8
	veor	@x[7], @t[1], @t[5]
___
$code.=<<___ if (!$inv);
	veor	@x[2], @t[0], @t[4]

	veor	@x[4], @x[4], @t[3]
	veor	@x[5], @x[5], @t[7]
	veor	@x[3], @x[3], @t[6]
@@ -426,9 +428,18 @@ $code.=<<___;
	veor	@x[6], @x[6], @t[2]
	 @ vmov	@x[7], @t[1]
___
$code.=<<___ if ($inv);
	veor	@t[3], @t[3], @x[4]
	veor	@x[5], @x[5], @t[7]
	veor	@x[2], @x[3], @t[6]
	veor	@x[3], @t[0], @t[4]
	veor	@x[4], @x[6], @t[2]
	vmov	@x[6], @t[3]
	 @ vmov	@x[7], @t[1]
___
}

sub InvMixColumns {
sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];

@@ -581,6 +592,54 @@ $code.=<<___;
___
}

sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |

$code.=<<___;
	@ multiplication by 0x05-0x00-0x04-0x00
	vext.8	@t[0], @x[0], @x[0], #8
	vext.8	@t[6], @x[6], @x[6], #8
	vext.8	@t[7], @x[7], @x[7], #8
	veor	@t[0], @t[0], @x[0]
	vext.8	@t[1], @x[1], @x[1], #8
	veor	@t[6], @t[6], @x[6]
	vext.8	@t[2], @x[2], @x[2], #8
	veor	@t[7], @t[7], @x[7]
	vext.8	@t[3], @x[3], @x[3], #8
	veor	@t[1], @t[1], @x[1]
	vext.8	@t[4], @x[4], @x[4], #8
	veor	@t[2], @t[2], @x[2]
	vext.8	@t[5], @x[5], @x[5], #8
	veor	@t[3], @t[3], @x[3]
	veor	@t[4], @t[4], @x[4]
	veor	@t[5], @t[5], @x[5]

	 veor	@x[0], @x[0], @t[6]
	 veor	@x[1], @x[1], @t[6]
	 veor	@x[2], @x[2], @t[0]
	 veor	@x[4], @x[4], @t[2]
	 veor	@x[3], @x[3], @t[1]
	 veor	@x[1], @x[1], @t[7]
	 veor	@x[2], @x[2], @t[7]
	 veor	@x[4], @x[4], @t[6]
	 veor	@x[5], @x[5], @t[3]
	 veor	@x[3], @x[3], @t[6]
	 veor	@x[6], @x[6], @t[4]
	 veor	@x[4], @x[4], @t[7]
	 veor	@x[5], @x[5], @t[7]
	 veor	@x[7], @x[7], @t[5]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}

sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
+66 −5
Original line number Diff line number Diff line
@@ -83,9 +83,9 @@
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:
#
# Core 2	11.0
# Nehalem	9.16
# Atom		20.9
# Core 2	9.83
# Nehalem	7.74
# Atom		18.9 (estimated, not measured yet)
#
# November 2011.
#
@@ -456,6 +456,7 @@ sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16];	# optional
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
@@ -497,7 +498,8 @@ $code.=<<___;
	pxor	@t[4], @t[0]
	 pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]

___
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
@@ -505,9 +507,20 @@ $code.=<<___;
	pxor	@t[2], @x[6]
	 movdqa	@t[1], @x[7]
___
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	 movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	 movdqa	@t[6], @x[2]
	 movdqa	@t[1], @x[7]
	 movdqa	@x[6], @x[4]
	 movdqa	@t[3], @x[6]
___
}

sub InvMixColumns {
sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];

@@ -661,6 +674,54 @@ $code.=<<___;
___
}

sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |

$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	 pxor	@t[6], @x[0]
	 pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	 pxor	@t[0], @x[2]
	 pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	 pxor	@t[7], @x[1]
	 pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	 pxor	@t[7], @x[2]
	 pxor	@t[6], @x[3]
	 pxor	@t[6], @x[4]
	 pxor	@t[3], @x[5]
	 pxor	@t[4], @x[6]
	 pxor	@t[7], @x[4]
	 pxor	@t[7], @x[5]
	 pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}

sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];