Commit 91dbdc63 authored by Andy Polyakov
Browse files

sha/asm/keccak1600-avx2.pl: remodel register usage.



This gives much more freedom to rearrange instructions. This is an
unoptimized version, provided for reference. Basically, you need
to compare it to the initial commit 29724d0e
to figure out the key difference.

Reviewed-by: Rich Salz <rsalz@openssl.org>
parent 74df8c4c
Loading
Loading
Loading
Loading
+105 −109
Original line number Diff line number Diff line
@@ -111,16 +111,10 @@ my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3], # [0][0..4]
#
#			r=1088(*)
#
# Haswell		8.9/+8%
# Skylake		7.9/+19%
# Ryzen			17(**)
# Haswell		9.5
# Skylake		8.8
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	coefficient in comparison to scalar keccak1600-x86_64.pl.
# (**)	It's expected that Ryzen performs poorly, because instruction
#	issue rate is limited to two AVX2 instructions per cycle and
#	in addition vpblendd is reportedly bound to specific port.
#	Obviously this code path should not be executed on Ryzen.
# (*)	Corresponds to SHA3-256.

# Temporary registers: @T[0..8] name %ymm7 through %ymm15.
my @T = map { '%ymm' . $_ } 7 .. 15;
# $C14, $C00, $D00 and $D14 are the last four entries of @T
# (%ymm12..%ymm15), so they share registers with @T[5..8].
my ($C14, $C00, $D00, $D14) = @T[5 .. 8];
@@ -140,135 +134,137 @@ __KeccakF1600:
.align	32
.Loop_avx2:
	######################################### Theta
	vpshufd		\$0b01001110,$A20,$C00
	vpxor		$A31,$A01,$C14
	vpxor		$A41,$A21,@T[0]
	vpxor		$A11,$C14,$C14
	vpxor		@T[0],$C14,$C14		# C[1..4]

	vpermq		\$0b11111111,$C14,@T[3]
	vpermq		\$0b10010011,$C14,@T[4]

	vpxor		$A01,$A31,$C14
	vpxor		$A21,$C14,$C14
	vpxor		$A41,$C14,$C14
	vpxor		$A11,$C14,$C14		# C[1..4]
	vpermq		\$0b10110001,$A20,$C00
	vpxor		$A20,$C00,$C00
	vpermq		\$0b01001110,$C00,@T[0]
	vpxor		$A00,$C00,$C00
	vpxor		@T[0],$C00,$C00		# C[0..0]

	vpsrlq		\$63,$C14,@T[1]
	vpaddq		$C14,$C14,@T[2]
	vpor		@T[2],@T[1],@T[1]	# ROL64(C[1..4],1)
	vpaddq		$C14,$C14,@T[3]
	vpor		@T[3],@T[1],@T[1]	# ROL64(C[1..4],1)

	vpermq		\$0b00111001,@T[1],$D14
	vpxor		@T[3],@T[1],$D00
	vpsrlq		\$63,$C00,@T[0]
	vpaddq		$C00,$C00,@T[2]
	vpor		@T[2],@T[0],@T[0]	# ROL64(C[0..0],1)

	vpxor		$A00,$C00,$C00
	vpxor		@T[0],$C00,$C00		# C[0..0]
	vpermq		\$0b00000000,@T[1],$D00	
	vpermq		\$0b11111111,$C14,@T[3]
	vpxor		@T[3],$D00,$D00		# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpsrlq		\$63,$C00,@T[0]
	vpaddq		$C00,$C00,@T[1]
	vpor		@T[0],@T[1],@T[1]	# ROL64(C[0..0],1)
	vpermq		\$0b00111001,@T[1],$D14
	vpblendd	\$0b11000000,@T[0],$D14,$D14
	vpermq		\$0b10010011,$C14,@T[2]
	vpblendd	\$0b00000011,$C00,@T[2],@T[2]
	vpxor		@T[2],$D14,$D14		# D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]

	vpermq		\$0b00000000,$D00,$D00	# D[0..0] = ROL64(C[1],1) ^ C[4]
	vpxor		$D00,$A20,$A20		# ^= D[0..0]
	vpxor		$D00,$A00,$A00		# ^= D[0..0]
	vpxor		$D00,$A20,$A20		# ^= D[0..0]
	vpxor		$D14,$A01,$A01		# ^= D[1..4]
	vpxor		$D14,$A31,$A31		# ^= D[1..4]
	vpxor		$D14,$A21,$A21		# ^= D[1..4]
	vpxor		$D14,$A41,$A41		# ^= D[1..4]
	vpxor		$D14,$A11,$A11		# ^= D[1..4]

	vpblendd	\$0b11000000,@T[1],$D14,$D14
	vpblendd	\$0b00000011,$C00,@T[4],@T[4]
	vpxor		@T[4],$D14,$D14		# D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]

	######################################### Rho + Pi + pre-Chi shuffle
	######################################### Rho
	vpsllvq		0*32-96(%r8),$A20,@T[0]
	vpsrlvq		0*32-96(%r9),$A20,$A20
	vpor		$A20,@T[0],@T[0]	# $A20
	vpor		@T[0],$A20,$A20

	vpsllvq		1*32-96(%r8),$A01,@T[1]
	vpsrlvq		1*32-96(%r9),$A01,$A01
	vpor		@T[1],$A01,$A01

	 vpxor		$D14,$A31,$A31		# ^= D[1..4]
	vpsllvq		2*32-96(%r8),$A31,@T[2]
	vpsrlvq		2*32-96(%r9),$A31,$A31
	vpor		$A31,@T[2],@T[2]	# $A31
	vpor		@T[2],$A31,$A31

	 vpxor		$D14,$A21,$A21		# ^= D[1..4]
	vpsllvq		3*32-96(%r8),$A21,@T[3]
	vpsrlvq		3*32-96(%r9),$A21,$A21
	vpor		$A21,@T[3],@T[3]	# $A21
	vpor		@T[3],$A21,$A21

	 vpermq		\$0b10001101,@T[0],$A31	# $A20 -> $A31
	 vpermq		\$0b10001101,@T[2],$A21	# $A31 -> $A21
	 vpxor		$D14,$A41,$A41		# ^= D[1..4]
	vpsllvq		4*32-96(%r8),$A41,@T[4]
	vpsrlvq		4*32-96(%r9),$A41,$A41
	vpor		@T[4],$A41,$A41

	 vpxor		$D14,$A01,$A01		# ^= D[1..4]
	 vpxor		$D14,$A11,$T[6]		# ^= D[1..4]
	vpsllvq		1*32-96(%r8),$A01,@T[1]
	vpsrlvq		1*32-96(%r9),$A01,$A01
	vpor		$A41,@T[4],@T[4]	# $A41
	vpor		@T[1],$A01,$A20		# $A01 -> $A20
	vpsllvq		5*32-96(%r8),$A11,@T[5]
	vpsrlvq		5*32-96(%r9),$A11,$A11
	vpor		@T[5],$A11,$A11

	 vpermq		\$0b00011011,@T[3],$A41	# $A21 -> $A41
	 vpermq		\$0b01110010,@T[4],$A11	# $A41 -> $A11
	vpsllvq		5*32-96(%r8),$T[6],@T[5]
	vpsrlvq		5*32-96(%r9),@T[6],@T[6]
	vpor		@T[5],@T[6],$A01	# $A11 -> $A01
	######################################### Pi + pre-Chi shuffle
	vpermq		\$0b01110010,$A41,@T[6]	# vpermq \$0b00011011,$A41,$A11
	vpermq		\$0b00011011,$A21,@T[5]	# vpermq \$0b01110010,$A21,$A41
	vpermq		\$0b10001101,$A31,@T[4]	# vpermq \$0b10001101,$A31,$A21
	vpermq		\$0b10001101,$A20,@T[3]	# vpermq \$0b01110010,$A20,$A31
	vmovdqa		$A01,@T[2]
	vmovdqa		$A11,@T[1]

	######################################### Chi
	vpsrldq		\$8,$A01,@T[0]
	vpandn		@T[0],$A01,@T[0]	# tgting  [0][0]
	vpermq		\$0b00000000,@T[1],@T[0]	# [0][1] [0][1] [0][1] [0][1]
	vpermq		\$0b01010101,@T[1],@T[7]	# [0][2] [0][2] [0][2] [0][2]
	vpandn		@T[7],@T[0],@T[0]	# tgting  [0][0] [0][0] [0][0] [0][0]

	vpermq		\$0b00111001,$A01,@T[1]		# [0][1] [0][4] [0][3] [0][2]
	vpermq		\$0b00011110,$A01,@T[8]		# [0][1] [0][2] [0][4] [0][3]
	vpblendd	\$0b11000000,$A00,@T[1],@T[1]	# [0][0] [0][4] [0][3] [0][2]
	vpermq		\$0b00111001,@T[1],$A01		# [0][1] [0][4] [0][3] [0][2]
	vpermq		\$0b00011110,@T[1],@T[8]	# [0][1] [0][2] [0][4] [0][3]
	vpblendd	\$0b11000000,$A00,$A01,$A01	# [0][0] [0][4] [0][3] [0][2]
	vpblendd	\$0b00110000,$A00,@T[8],@T[8]	# [0][1] [0][0] [0][4] [0][3]
	vpxor		@T[0],$A00,$A00		# broadcasted below
	vpandn		@T[8],@T[1],@T[1]	# tgting  [0][4] [0][3] [0][2] [0][1]

	vpblendd	\$0b00001100,$A41,$A21, @T[2]	#               [4][1] [2][1]
	vpblendd	\$0b00001100,$A21,$A11, @T[4]	#               [4][2] [2][2]
	vpblendd	\$0b00110000,$A11,@T[2],@T[2]	#        [1][1] [4][1] [2][1]
	vpblendd	\$0b00110000,$A31,@T[4],@T[4]	#        [1][2] [4][2] [2][2]
	vpblendd	\$0b11000000,$A31,@T[2],@T[2]	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	\$0b11000000,$A41,@T[4],@T[4]	# [3][2] [1][2] [4][2] [2][2]
	vpandn		@T[4],@T[2],@T[2]	# tgting  [3][0] [1][0] [4][0] [2][0]

	vpblendd	\$0b00001100,$A11,$A20, @T[3]	#               [4][4] [2][0]
	vpblendd	\$0b00001100,$A20,$A21, @T[5]	#               [4][0] [2][1]
	vpblendd	\$0b00110000,$A21,@T[3],@T[3]	#        [1][3] [4][4] [2][0]
	vpblendd	\$0b00110000,$A41,@T[5],@T[5]	#        [1][4] [4][0] [2][1]
	vpblendd	\$0b11000000,$A41,@T[3],@T[3]	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	\$0b11000000,$A11,@T[5],@T[5]	# [3][3] [1][4] [4][0] [2][1]
	vpandn		@T[5],@T[3],@T[3]	# tgting  [3][1] [1][2] [4][3] [2][4]
	vpxor		$A31,@T[3],@T[3]

	vpblendd	\$0b00001100,$A21,$A31, @T[5]	#               [4][2] [2][4]
	vpblendd	\$0b00001100,$A31,$A20, @T[6]	#               [4][3] [2][0]
	vpblendd	\$0b00110000,$A20,@T[5],@T[5]	#        [1][0] [4][2] [2][4]
	vpblendd	\$0b00110000,$A11,@T[6],@T[6]	#        [1][1] [4][3] [2][0]
	vpblendd	\$0b11000000,$A11,@T[5],@T[5]	# [3][3] [1][0] [4][2] [2][4]
	vpblendd	\$0b11000000,$A21,@T[6],@T[6]	# [3][4] [1][1] [4][3] [2][0]
	vpandn		@T[6],@T[5],@T[5]	# tgting  [3][2] [1][4] [4][1] [2][3]
	vpxor		$A41,@T[5],@T[5]

	vpblendd	\$0b00001100,$A20,$A41, @T[6]	#               [4][0] [2][3]
	vpblendd	\$0b00001100,$A41,$A31, @T[7]	#               [4][1] [2][4]
	vpblendd	\$0b00110000,$A31,@T[6],@T[6]	#        [1][2] [4][0] [2][3]
	vpblendd	\$0b00110000,$A21,@T[7],@T[7]	#        [1][3] [4][1] [2][4]
	vpblendd	\$0b11000000,$A21,@T[6],@T[6]	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	\$0b11000000,$A20,@T[7],@T[7]	# [3][0] [1][3] [4][1] [2][4]
	vpblendd	\$0b00001100,$A31,$A41, @T[4]	#        [1][4] [4][3]
	vpblendd	\$0b11000000,$A31,$A41, @T[8]	# [3][1]               [2][3]
	vpandn		@T[7],@T[6],@T[6]	# tgting  [3][3] [1][1] [4][4] [2][2]
	 vpermq		\$0b00011011,@T[3],$A31	######### post-Chi shuffle
	 vpermq		\$0b10001101,@T[5],$A41
	vpxor		$A11,@T[6],@T[6]
	 vpermq		\$0b00000000,$A00,$A00	# broadcast A[0][0]

	vpblendd	\$0b00000011,$A11,@T[4],@T[4]	#        [1][4] [4][3] [2][2]
	vpblendd	\$0b00001100,$A11,@T[8],@T[8]	# [3][1]        [4][4] [2][3]
	 vpermq		\$0b01110010,@T[6],$A11
	vpblendd	\$0b11000000,$A20,@T[4],@T[4]	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	\$0b00110000,$A20,@T[8],@T[8]	# [3][1] [1][0] [4][4] [2][3]
	vpandn		@T[8],@T[4],@T[4]	# tgting  [3][4] [1][3] [4][2] [2][1]
	vpandn		@T[8],$A01,$A01		# tgting  [0][4] [0][3] [0][2] [0][1]

	vpblendd	\$0b00001100,@T[5],@T[4],$A20	#               [4][1] [2][1]
	vpblendd	\$0b00110000,@T[6],$A20,$A20	#        [1][1] [4][1] [2][1]
	vpblendd	\$0b11000000,@T[3],$A20,$A20	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[6],@T[7]	#               [4][2] [2][2]
	vpblendd	\$0b00110000,@T[3],@T[7],@T[7]	#        [1][2] [4][2] [2][2]
	vpblendd	\$0b11000000,@T[5],@T[7],@T[7]	# [3][2] [1][2] [4][2] [2][2]
	vpandn		@T[7],$A20,$A20		# tgting  [3][0] [1][0] [4][0] [2][0]

	vpblendd	\$0b00001100,@T[6],@T[2],$A31	#               [4][4] [2][0]
	vpblendd	\$0b00110000,@T[4],$A31,$A31	#        [1][3] [4][4] [2][0]
	vpblendd	\$0b11000000,@T[5],$A31,$A31	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	\$0b00001100,@T[2],@T[4],@T[8]	#               [4][0] [2][1]
	vpblendd	\$0b00110000,@T[5],@T[8],@T[8]	#        [1][4] [4][0] [2][1]
	vpblendd	\$0b11000000,@T[6],@T[8],@T[8]	# [3][3] [1][4] [4][0] [2][1]
	vpandn		@T[8],$A31,$A31		# tgting  [3][1] [1][2] [4][3] [2][4]

	vpblendd	\$0b00001100,@T[3],@T[6],$A21	#               [4][3] [2][2]
	vpblendd	\$0b00110000,@T[5],$A21,$A21	#        [1][4] [4][3] [2][2]
	vpblendd	\$0b11000000,@T[2],$A21,$A21	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	\$0b00001100,@T[6],@T[5],@T[7]	#               [4][4] [2][3]
	vpblendd	\$0b00110000,@T[2],@T[7],@T[7]	#        [1][0] [4][4] [2][3]
	vpblendd	\$0b11000000,@T[3],@T[7],@T[7]	# [3][1] [1][0] [4][4] [2][3]
	vpandn		@T[7],$A21,$A21		# tgting  [3][4] [1][3] [4][2] [2][1]

	vpblendd	\$0b00001100,@T[4],@T[3],$A41	#               [4][2] [2][4]
	vpblendd	\$0b00110000,@T[2],$A41,$A41	#        [1][0] [4][2] [2][4]
	vpblendd	\$0b11000000,@T[6],$A41,$A41	# [3][3] [1][0] [4][2] [2][4]
	vpblendd	\$0b00001100,@T[3],@T[2],@T[8]	#               [4][3] [2][0]
	vpblendd	\$0b00110000,@T[6],@T[8],@T[8]	#        [1][1] [4][3] [2][0]
	vpblendd	\$0b11000000,@T[4],@T[8],@T[8]	# [3][4] [1][1] [4][3] [2][0]
	vpandn		@T[8],$A41,$A41		# tgting  [3][2] [1][4] [4][1] [2][3]

	vpblendd	\$0b00001100,@T[2],@T[5],$A11	#               [4][0] [2][3]
	vpblendd	\$0b00110000,@T[3],$A11,$A11	#        [1][2] [4][0] [2][3]
	vpblendd	\$0b11000000,@T[4],$A11,$A11	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	\$0b00001100,@T[5],@T[3],@T[7]	#               [4][1] [2][4]
	vpblendd	\$0b00110000,@T[4],@T[7],@T[7]	#        [1][3] [4][1] [2][4]
	vpblendd	\$0b11000000,@T[2],@T[7],@T[7]	# [3][0] [1][3] [4][1] [2][4]
	vpandn		@T[7],$A11,$A11		# tgting  [3][3] [1][1] [4][4] [2][2]

	vpxor		@T[2],$A20,$A20
	vpxor		@T[0],$A00,$A00
	vpxor		@T[1],$A01,$A01
	vpxor		@T[2],$A20,$A20
	vpxor		@T[3],$A31,$A31
	vpxor		@T[4],$A21,$A21
	vpxor		@T[5],$A41,$A41
	vpxor		@T[6],$A11,$A11

	vpermq		\$0b00011011,$A31,$A31	# post-Chi shuffle
	vpermq		\$0b10001101,$A41,$A41
	vpermq		\$0b01110010,$A11,$A11

	######################################### Iota
	vpxor		(%r10),$A00,$A00