Commit e3c79f0f authored by Andy Polyakov

sha/asm/keccak1600-avx512.pl: improve performance by 17%.



Improvement is the result of combining data layout ideas from the
Keccak Code Package and the initial version of this module.

Hardware used for benchmarking courtesy of Atos, experiments run by
Romain Dolbeau <romain.dolbeau@atos.net>. Kudos!

Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Reviewed-by: Rich Salz <rsalz@openssl.org>
parent e0de4dd5
+278 −176
@@ -20,28 +20,60 @@
# Below code is KECCAK_1X_ALT implementation (see sha/keccak1600.c).
# Pretty straightforward, the only "magic" is data layout in registers.
# It's impossible to have one that is optimal for every step, hence
# it's changing as algorithm progresses. Data is saved in order that
# benefits Chi, but at the same time is easily convertible to order
# that benefits Theta. Conversion from Chi layout to Theta is
# explicit and reverse one is kind of fused with Pi...
# it's changing as algorithm progresses. Data is saved in linear order,
# but in-register order morphs between rounds. Even rounds take in
# linear layout, and odd rounds - transposed, or "vertically-shaped"...
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#			r=1088(*)
#
# Knights Landing	8.9
# Skylake-X		6.7
# Knights Landing	7.6
# Skylake-X		5.7
#
# (*)	Corresponds to SHA3-256.
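#
# (For scale: SHA3-256 absorbs r=1088 bits, i.e. 136 bytes, per
# Keccak-f[1600] call, so e.g. 5.7 cycles per processed byte amounts
# to roughly 5.7*136 ~ 775 cycles per permutation.)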

########################################################################
# Coordinates below correspond to those in sha/keccak1600.c. Layout
# suitable for Chi is one with y coordinates aligned column-wise. Trick
# is to add regular shift to x coordinate, so that Chi can still be
# performed with as little as 7 instructions, yet be converted to layout
# suitable for Theta with intra-register permutations alone. Here is
# "magic" layout for Chi (with pre-Theta shuffle):
# Below code is combination of two ideas. One is taken from Keccak Code
# Package, hereafter KCP, and another one from initial version of this
# module. What is common is observation that Pi's input and output are
# "mostly transposed", i.e. if input is aligned by x coordinate, then
# output is [mostly] aligned by y. Both versions, KCP and predecessor,
# were trying to use one of them from round to round, which resulted in
# some kind of transposition in each round. This version still does
# transpose data, but only every second round. Another essential factor
# is that KCP transposition has to be performed with instructions that
# turned out to be rather expensive on Knights Landing, both latency- and
# throughput-wise. Not to mention that some of them have to depend on
# each other. On the other hand initial version of this module was
# relying heavily on blend instructions. There were lots of them,
# resulting in higher instruction count, yet it performed better on
# Knights Landing, because processor can execute pair of them each
# cycle and they have minimal latency. This module is an attempt to
# bring best parts together:-)
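#
# For reference, Pi in the A[y][x] indexing of sha/keccak1600.c can be
# sketched in scalar C as below (C99, uint64_t from <stdint.h>; the
# function and argument names are illustrative, not taken from that
# file):
#
#	/* output row y, column x takes input row x, column (x + 3*y) % 5 */
#	static void pi(uint64_t A[5][5], const uint64_t B[5][5])
#	{
#	    for (int y = 0; y < 5; y++)
#	        for (int x = 0; x < 5; x++)
#	            A[y][x] = B[x][(x + 3*y) % 5];
#	}
#
# Output column x is therefore input row x reordered within itself,
# which is the "mostly transposed" relation referred to above.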
#
# Coordinates below correspond to those in sha/keccak1600.c. Input
# layout is straight linear:
#
# [0][4] [0][3] [0][2] [0][1] [0][0]
# [1][4] [1][3] [1][2] [1][1] [1][0]
# [2][4] [2][3] [2][2] [2][1] [2][0]
# [3][4] [3][3] [3][2] [3][1] [3][0]
# [4][4] [4][3] [4][2] [4][1] [4][0]
#
# It's perfect for Theta, while Pi is reduced to intra-register
# permutations which yield layout perfect for Chi:
#
# [4][0] [3][0] [2][0] [1][0] [0][0]
# [4][1] [3][1] [2][1] [1][1] [0][1]
# [4][2] [3][2] [2][2] [1][2] [0][2]
# [4][3] [3][3] [2][3] [1][3] [0][3]
# [4][4] [3][4] [2][4] [1][4] [0][4]
#
# Now instead of performing full transposition and feeding it to next
# identical round, we perform kind of diagonal transposition to layout
# from initial version of this module, and make it suitable for Theta:
#
# [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
# [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
@@ -49,53 +81,52 @@
# [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
# [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
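#
# Each ">a.b.c.d.e>" arrow above stands for one vpermq with an index
# vector such as those in the theta_perm and pi0_perm tables at the end
# of the file; the digits appear to list, left to right, the source lane
# feeding destination lanes 4 down to 0. A scalar model of the qword
# permute (names illustrative only):
#
#	/* dst[i] = src[idx[i] & 7] for each of the 8 qword lanes */
#	static void vpermq_model(uint64_t dst[8], const uint64_t idx[8],
#	                         const uint64_t src[8])
#	{
#	    for (int i = 0; i < 8; i++)
#	        dst[i] = src[idx[i] & 7];
#	}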
#
# Layout suitable to Theta has x coordinates aligned column-wise
# [it's interleaved with Pi indices transformation for reference]:
# Now intra-register permutations yield initial [almost] straight
# linear layout:
#
# [4][4] [3][3] [2][2] [1][1] [0][0]	$A00
# [4][4] [3][3] [2][2] [1][1] [0][0]
##[0][4] [0][3] [0][2] [0][1] [0][0]
# [3][4] [2][3] [1][2] [0][1] [4][0]	$A01
# [3][4] [2][3] [1][2] [0][1] [4][0]
##[2][3] [2][2] [2][1] [2][0] [2][4]
# [2][4] [1][3] [0][2] [4][1] [3][0]	$A02
# [2][4] [1][3] [0][2] [4][1] [3][0]
##[4][2] [4][1] [4][0] [4][4] [4][3]
# [1][4] [0][3] [4][2] [3][1] [2][0]	$A03
# [1][4] [0][3] [4][2] [3][1] [2][0]
##[1][1] [1][0] [1][4] [1][3] [1][2]
# [0][4] [4][3] [3][2] [2][1] [1][0]	$A04
# [0][4] [4][3] [3][2] [2][1] [1][0]
##[3][0] [3][4] [3][3] [3][2] [3][1]
#
# Pi itself is performed by blending above data and finally shuffling it
# to original Chi layout:
#
# [1][1] [2][2] [3][3] [4][4] [0][0]>1.2.3.4.0>[4][4] [3][3] [2][2] [1][1] [0][0]
# [2][3] [3][4] [4][0] [0][1] [1][2]>2.3.4.0.1>[4][0] [3][4] [2][3] [1][2] [0][1]
# [3][0] [4][1] [0][2] [1][3] [2][4]>3.4.0.1.2>[4][1] [3][0] [2][4] [1][3] [0][2]
# [4][2] [0][3] [1][4] [2][0] [3][1]>4.0.1.2.3>[4][2] [3][1] [2][0] [1][4] [0][3]
# [0][4] [1][0] [2][1] [3][2] [4][3]>0.1.2.3.4>[4][3] [3][2] [2][1] [1][0] [0][4]
# This means that odd round Chi is performed in less suitable layout,
# with a number of additional permutations. But overall it turned out to be
# a win. Permutations are fastest possible on Knights Landing and they
# are laid down to be independent of each other. In essence I traded
# 20 blend instructions for 3 permutations. The result is 13% faster
# than KCP on Skylake-X, and >40% on Knights Landing.
#
# As implied, data is loaded in Chi layout. Digits in variables' names
# represent rightmost coordinates of loaded data chunk:

my ($A00,	# [4][4] [3][3] [2][2] [1][1] [0][0]
    $A01,	# [4][0] [3][4] [2][3] [1][2] [0][1]
    $A02,	# [4][1] [3][0] [2][4] [1][3] [0][2]
    $A03,	# [4][2] [3][1] [2][0] [1][4] [0][3]
    $A04) =	# [4][3] [3][2] [2][1] [1][0] [0][4]
# As implied, data is loaded in straight linear order. Digits in
# variables' names represent coordinates of right-most element of
# loaded data chunk:

my ($A00,	# [0][4] [0][3] [0][2] [0][1] [0][0]
    $A10,	# [1][4] [1][3] [1][2] [1][1] [1][0]
    $A20,	# [2][4] [2][3] [2][2] [2][1] [2][0]
    $A30,	# [3][4] [3][3] [3][2] [3][1] [3][0]
    $A40) =	# [4][4] [4][3] [4][2] [4][1] [4][0]
    map("%zmm$_",(0..4));

# We also need to map the magic order into offsets within structure:

my @A_jagged = ([0,0], [1,0], [2,0], [3,0], [4,0],
		[4,1], [0,1], [1,1], [2,1], [3,1],
		[3,2], [4,2], [0,2], [1,2], [2,2],
		[2,3], [3,3], [4,3], [0,3], [1,3],
		[1,4], [2,4], [3,4], [4,4], [0,4]);
   @A_jagged_in  = map(8*($$_[0]*8+$$_[1]), @A_jagged);	# ... and now linear
   @A_jagged_out = map(8*($$_[0]*5+$$_[1]), @A_jagged);	# ... and now linear
my @A_jagged = ([0,0], [0,1], [0,2], [0,3], [0,4],
		[1,0], [1,1], [1,2], [1,3], [1,4],
		[2,0], [2,1], [2,2], [2,3], [2,4],
		[3,0], [3,1], [3,2], [3,3], [3,4],
		[4,0], [4,1], [4,2], [4,3], [4,4]);
   @A_jagged = map(8*($$_[0]*8+$$_[1]), @A_jagged);	# ... and now linear
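
# For illustration: with the list above, lane i of the flat 5x5 state
# (i = 5*y + x, row-major) lands at byte offset 8*(8*y + x) in the
# transfer area, since each register row spans 8 qwords even though only
# 5 of them carry state. This is the offset the absorb loop uses when it
# scatters input lanes onto the stack. A sketch (function name
# hypothetical):
#
#	/* byte offset of lane i within the 64-byte-per-row transfer area */
#	static size_t jagged_offset(int i)
#	{
#	    int y = i / 5, x = i % 5;
#	    return 8 * (8*y + x);	/* flat state uses 8*(5*y + x) */
#	}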

my @T       = map("%zmm$_",(5..7,16..17));
my @Chi     = map("%zmm$_",(18..22));
my @Theta   = map("%zmm$_",(33,23..26));	# invalid @Theta[0] is not typo
my @Rhotate = map("%zmm$_",(27..31));
my @T        = map("%zmm$_",(5..12));
my @Theta    = map("%zmm$_",(33,13..16));	# invalid @Theta[0] is not a typo
my @Pi0      = map("%zmm$_",(17..21));
my @Rhotate0 = map("%zmm$_",(22..26));
my @Rhotate1 = map("%zmm$_",(27..31));

my ($C00,$D00) = @T[0..1];
my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6));
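
# The round code below leans on a few AVX-512 primitives; per 64-bit
# lane they compute the following (a scalar sketch for reference, not
# part of the generated code):
#
#	/* vpternlogq $0x96,c,b,a : a ^= b ^ c    (Theta's parity XORs)
#	 * vpternlogq $0xD2,c,b,a : a ^= ~b & c   (Chi's non-linear step)
#	 * vprolvq     r,a,d      : d[i] = ROL64(a[i], r[i])   (Rho)    */
#	static uint64_t ternlog_96(uint64_t a, uint64_t b, uint64_t c)
#	{ return a ^ (b ^ c); }
#	static uint64_t ternlog_d2(uint64_t a, uint64_t b, uint64_t c)
#	{ return a ^ (~b & c); }
#
# In scalar terms Theta is C[x] = A[0][x]^A[1][x]^A[2][x]^A[3][x]^A[4][x],
# D[x] = C[x-1] ^ ROL64(C[x+1],1), A[y][x] ^= D[x], and Chi is
# A[y][x] ^= ~A[y][x+1] & A[y][x+2] (indices mod 5), hence single
# vpternlogq instructions cover both the XOR and the AND-NOT-XOR steps.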
@@ -107,82 +138,136 @@ $code.=<<___;
.align	32
__KeccakF1600:
	lea		iotas(%rip),%r10
	mov		\$24,%eax
	mov		\$12,%eax
	jmp		.Loop_avx512

.align	32
.Loop_avx512:
	######################################### Theta
	#vpermq		$A00,@Theta[0],$A00	# doesn't actually change order
	vpermq		$A01,@Theta[1],$A01
	vpermq		$A02,@Theta[2],$A02
	vpermq		$A03,@Theta[3],$A03
	vpermq		$A04,@Theta[4],$A04

	######################################### Theta, even round
	vmovdqa64	$A00,@T[0]		# put aside original A00
	vpternlogq	\$0x96,$A02,$A01,$A00	# and use it as "C00"
	vpternlogq	\$0x96,$A04,$A03,$A00
	vpternlogq	\$0x96,$A20,$A10,$A00	# and use it as "C00"
	vpternlogq	\$0x96,$A40,$A30,$A00

	vprolq		\$1,$A00,$D00
	vpermq		$A00,@Theta[1],$A00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$A00,$D00,@T[0]	# T[0] is original A00
	vpternlogq	\$0x96,$A00,$D00,$A01
	vpternlogq	\$0x96,$A00,$D00,$A02
	vpternlogq	\$0x96,$A00,$D00,$A03
	vpternlogq	\$0x96,$A00,$D00,$A04
	vpternlogq	\$0x96,$A00,$D00,$A10
	vpternlogq	\$0x96,$A00,$D00,$A20
	vpternlogq	\$0x96,$A00,$D00,$A30
	vpternlogq	\$0x96,$A00,$D00,$A40

	######################################### Rho
	vprolvq		@Rhotate[0],@T[0],$A00	# T[0] is original A00
	vprolvq		@Rhotate[1],$A01,$A01
	vprolvq		@Rhotate[2],$A02,$A02
	vprolvq		@Rhotate[3],$A03,$A03
	vprolvq		@Rhotate[4],$A04,$A04
	vprolvq		@Rhotate0[0],@T[0],$A00	# T[0] is original A00
	vprolvq		@Rhotate0[1],$A10,$A10
	vprolvq		@Rhotate0[2],$A20,$A20
	vprolvq		@Rhotate0[3],$A30,$A30
	vprolvq		@Rhotate0[4],$A40,$A40

	######################################### Pi
	vpblendmq	$A02,$A00,@{T[0]}{$k00010}
	vpblendmq	$A00,$A03,@{T[1]}{$k00010}
	vpblendmq	$A03,$A01,@{T[2]}{$k00010}
	vpblendmq	$A01,$A04,@{T[3]}{$k00010}
	vpblendmq	$A04,$A02,@{T[4]}{$k00010}

	vpblendmq	$A04,@T[0],@{T[0]}{$k00100}
	vpblendmq	$A02,@T[1],@{T[1]}{$k00100}
	vpblendmq	$A00,@T[2],@{T[2]}{$k00100}
	vpblendmq	$A03,@T[3],@{T[3]}{$k00100}
	vpblendmq	$A01,@T[4],@{T[4]}{$k00100}

	vpblendmq	$A01,@T[0],@{T[0]}{$k01000}
	vpblendmq	$A04,@T[1],@{T[1]}{$k01000}
	vpblendmq	$A02,@T[2],@{T[2]}{$k01000}
	vpblendmq	$A00,@T[3],@{T[3]}{$k01000}
	vpblendmq	$A03,@T[4],@{T[4]}{$k01000}

	vpblendmq	$A03,@T[0],@{T[0]}{$k10000}
	vpblendmq	$A01,@T[1],@{T[1]}{$k10000}
	vpblendmq	$A04,@T[2],@{T[2]}{$k10000}
	vpblendmq	$A02,@T[3],@{T[3]}{$k10000}
	vpblendmq	$A00,@T[4],@{T[4]}{$k10000}

	vpermq		@T[0],@Chi[0],$A00
	vpermq		@T[1],@Chi[1],$A01
	vpermq		@T[2],@Chi[2],$A02
	vpermq		@T[3],@Chi[3],$A03
	vpermq		@T[4],@Chi[4],$A04
	vpermq		$A00,@Pi0[0],$A00
	vpermq		$A10,@Pi0[1],$A10
	vpermq		$A20,@Pi0[2],$A20
	vpermq		$A30,@Pi0[3],$A30
	vpermq		$A40,@Pi0[4],$A40

	######################################### Chi
	vmovdqa64	$A00,@T[0]
	vpternlogq	\$0xD2,$A02,$A01,$A00
	vmovdqa64	$A01,@T[1]
	vpternlogq	\$0xD2,$A03,$A02,$A01
	vpternlogq	\$0xD2,$A04,$A03,$A02
	vpternlogq	\$0xD2,@T[0],$A04,$A03
	vpternlogq	\$0xD2,@T[1],@T[0],$A04
	vmovdqa64	$A10,@T[1]
	vpternlogq	\$0xD2,$A20,$A10,$A00
	vpternlogq	\$0xD2,$A30,$A20,$A10
	vpternlogq	\$0xD2,$A40,$A30,$A20
	vpternlogq	\$0xD2,@T[0],$A40,$A30
	vpternlogq	\$0xD2,@T[1],@T[0],$A40

	######################################### Iota
	vpxorq		(%r10),$A00,${A00}{$k00001}
	lea		8(%r10),%r10
	lea		16(%r10),%r10

	######################################### Harmonize rounds
	vpblendmq	$A20,$A10,@{T[1]}{$k00010}
	vpblendmq	$A30,$A20,@{T[2]}{$k00010}
	vpblendmq	$A40,$A30,@{T[3]}{$k00010}
	 vpblendmq	$A10,$A00,@{T[0]}{$k00010}
	vpblendmq	$A00,$A40,@{T[4]}{$k00010}

	vpblendmq	$A30,@T[1],@{T[1]}{$k00100}
	vpblendmq	$A40,@T[2],@{T[2]}{$k00100}
	 vpblendmq	$A20,@T[0],@{T[0]}{$k00100}
	vpblendmq	$A00,@T[3],@{T[3]}{$k00100}
	vpblendmq	$A10,@T[4],@{T[4]}{$k00100}

	vpblendmq	$A40,@T[1],@{T[1]}{$k01000}
	 vpblendmq	$A30,@T[0],@{T[0]}{$k01000}
	vpblendmq	$A00,@T[2],@{T[2]}{$k01000}
	vpblendmq	$A10,@T[3],@{T[3]}{$k01000}
	vpblendmq	$A20,@T[4],@{T[4]}{$k01000}

	vpblendmq	$A40,@T[0],@{T[0]}{$k10000}
	vpblendmq	$A00,@T[1],@{T[1]}{$k10000}
	vpblendmq	$A10,@T[2],@{T[2]}{$k10000}
	vpblendmq	$A20,@T[3],@{T[3]}{$k10000}
	vpblendmq	$A30,@T[4],@{T[4]}{$k10000}

	#vpermq		@T[0],@Theta[0],$A00	# doesn't actually change order
	vpermq		@T[1],@Theta[1],$A10
	vpermq		@T[2],@Theta[2],$A20
	vpermq		@T[3],@Theta[3],$A30
	vpermq		@T[4],@Theta[4],$A40

	######################################### Theta, odd round
	vmovdqa64	$T[0],$A00		# real A00
	vpternlogq	\$0x96,$A20,$A10,$C00	# C00 is @T[0]'s alias
	vpternlogq	\$0x96,$A40,$A30,$C00

	vprolq		\$1,$C00,$D00
	vpermq		$C00,@Theta[1],$C00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$C00,$D00,$A00
	vpternlogq	\$0x96,$C00,$D00,$A30
	vpternlogq	\$0x96,$C00,$D00,$A10
	vpternlogq	\$0x96,$C00,$D00,$A40
	vpternlogq	\$0x96,$C00,$D00,$A20

	######################################### Rho
	vprolvq		@Rhotate1[0],$A00,$A00
	vprolvq		@Rhotate1[3],$A30,@T[1]
	vprolvq		@Rhotate1[1],$A10,@T[2]
	vprolvq		@Rhotate1[4],$A40,@T[3]
	vprolvq		@Rhotate1[2],$A20,@T[4]

	 vpermq		$A00,@Theta[4],@T[5]
	 vpermq		$A00,@Theta[3],@T[6]

	######################################### Iota
	vpxorq		-8(%r10),$A00,${A00}{$k00001}

	######################################### Pi
	vpermq		@T[1],@Theta[2],$A10
	vpermq		@T[2],@Theta[4],$A20
	vpermq		@T[3],@Theta[1],$A30
	vpermq		@T[4],@Theta[3],$A40

	######################################### Chi
	vpternlogq	\$0xD2,@T[6],@T[5],$A00

	vpermq		@T[1],@Theta[1],@T[7]
	#vpermq		@T[1],@Theta[0],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[7],$A10

	vpermq		@T[2],@Theta[3],@T[0]
	vpermq		@T[2],@Theta[2],@T[2]
	vpternlogq	\$0xD2,@T[2],@T[0],$A20

	#vpermq		@T[3],@Theta[0],@T[3]
	vpermq		@T[3],@Theta[4],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[3],$A30

	vpermq		@T[4],@Theta[2],@T[0]
	vpermq		@T[4],@Theta[1],@T[4]
	vpternlogq	\$0xD2,@T[4],@T[0],$A40

	dec		%eax
	jnz		.Loop_avx512
@@ -208,8 +293,6 @@ SHA3_absorb:
	lea	96($inp),$inp
	lea	128(%rsp),%r9

	vzeroupper

	lea		theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
@@ -226,24 +309,30 @@ SHA3_absorb:
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate[0]
	vmovdqa64	64*6(%r8),@Rhotate[1]
	vmovdqa64	64*7(%r8),@Rhotate[2]
	vmovdqa64	64*8(%r8),@Rhotate[3]
	vmovdqa64	64*9(%r8),@Rhotate[4]
	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*10(%r8),@Chi[0]
	vmovdqa64	64*11(%r8),@Chi[1]
	vmovdqa64	64*12(%r8),@Chi[2]
	vmovdqa64	64*13(%r8),@Chi[3]
	vmovdqa64	64*14(%r8),@Chi[4]
	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vpxorq		@T[0],@T[0],@T[0]
	vmovdqu64	40*1-96($A_flat),${A01}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A02}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A03}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A04}{$k11111}{z}
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

	vmovdqa64	@T[0],0*64-128(%r9)	# zero transfer area on stack
	vmovdqa64	@T[0],1*64-128(%r9)
@@ -263,7 +352,7 @@ ___
for(my $i=0; $i<25; $i++) {
$code.=<<___
	mov	8*$i-96($inp),%r8
	mov	%r8,$A_jagged_in[$i]-128(%r9)
	mov	%r8,$A_jagged[$i]-128(%r9)
	dec	%eax
	jz	.Labsorved_avx512
___
@@ -273,10 +362,10 @@ $code.=<<___;
	lea	($inp,$bsz),$inp

	vpxorq	64*0-128(%r9),$A00,$A00
	vpxorq	64*1-128(%r9),$A01,$A01
	vpxorq	64*2-128(%r9),$A02,$A02
	vpxorq	64*3-128(%r9),$A03,$A03
	vpxorq	64*4-128(%r9),$A04,$A04
	vpxorq	64*1-128(%r9),$A10,$A10
	vpxorq	64*2-128(%r9),$A20,$A20
	vpxorq	64*3-128(%r9),$A30,$A30
	vpxorq	64*4-128(%r9),$A40,$A40

	call	__KeccakF1600

@@ -285,10 +374,10 @@ $code.=<<___;
.align	32
.Ldone_absorb_avx512:
	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A01,40*1-96($A_flat){$k11111}
	vmovdqu64	$A02,40*2-96($A_flat){$k11111}
	vmovdqu64	$A03,40*3-96($A_flat){$k11111}
	vmovdqu64	$A04,40*4-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	vzeroupper

@@ -307,8 +396,6 @@ SHA3_squeeze:
	cmp	$bsz,$len
	jbe	.Lno_output_extension_avx512

	vzeroupper

	lea		theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
@@ -325,65 +412,72 @@ SHA3_squeeze:
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate[0]
	vmovdqa64	64*6(%r8),@Rhotate[1]
	vmovdqa64	64*7(%r8),@Rhotate[2]
	vmovdqa64	64*8(%r8),@Rhotate[3]
	vmovdqa64	64*9(%r8),@Rhotate[4]
	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*10(%r8),@Chi[0]
	vmovdqa64	64*11(%r8),@Chi[1]
	vmovdqa64	64*12(%r8),@Chi[2]
	vmovdqa64	64*13(%r8),@Chi[3]
	vmovdqa64	64*14(%r8),@Chi[4]
	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vmovdqu64	40*1-96($A_flat),${A01}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A02}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A03}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A04}{$k11111}{z}
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

.Lno_output_extension_avx512:
	shr	\$3,$bsz
	lea	-96($A_flat),%r9
	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512

.align	32
.Loop_squeeze_avx512:
	mov	@A_jagged_out[$i]-96($A_flat),%r8
___
for (my $i=0; $i<25; $i++) {
$code.=<<___;
	sub	\$8,$len
	jc	.Ltail_squeeze_avx512
	cmp	\$8,$len
	jb	.Ltail_squeeze_avx512

	mov	(%r9),%r8
	lea	8(%r9),%r9
	mov	%r8,($out)
	lea	8($out),$out
	je	.Ldone_squeeze_avx512
	dec	%eax
	je	.Lextend_output_avx512
	mov	@A_jagged_out[$i+1]-96($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx512:
	sub	\$8,$len		# len -= 8
	jz	.Ldone_squeeze_avx512

	sub	\$1,%rax		# bsz--
	jnz	.Loop_squeeze_avx512

	#vpermq		@Theta[4],@Theta[4],@Theta[3]
	#vpermq		@Theta[3],@Theta[4],@Theta[2]
	#vpermq		@Theta[3],@Theta[3],@Theta[1]

	call		__KeccakF1600

	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A01,40*1-96($A_flat){$k11111}
	vmovdqu64	$A02,40*2-96($A_flat){$k11111}
	vmovdqu64	$A03,40*3-96($A_flat){$k11111}
	vmovdqu64	$A04,40*4-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	lea	-96($A_flat),%r9
	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512


.Ltail_squeeze_avx512:
	add	\$8,$len
.Loop_tail_avx512:
	mov	%r8b,($out)
	lea	1($out),$out
	shr	\$8,%r8
	dec	$len
	jnz	.Loop_tail_avx512
	mov	%r9,%rsi
	mov	$out,%rdi
	mov	$len,%rcx
	.byte	0xf3,0xa4		# rep movsb

.Ldone_squeeze_avx512:
	vzeroupper
@@ -400,19 +494,27 @@ theta_perm:
	.quad	2, 3, 4, 0, 1, 5, 6, 7
	.quad	1, 2, 3, 4, 0, 5, 6, 7

rhotates:
rhotates1:
	.quad	0,  44, 43, 21, 14, 0, 0, 0	# [0][0] [1][1] [2][2] [3][3] [4][4]
	.quad	18, 1,  6,  25, 8,  0, 0, 0	# [4][0] [0][1] [1][2] [2][3] [3][4]
	.quad	41, 2,	62, 55, 39, 0, 0, 0	# [3][0] [4][1] [0][2] [1][3] [2][4]
	.quad	3,  45, 61, 28, 20, 0, 0, 0	# [2][0] [3][1] [4][2] [0][3] [1][4]
	.quad	36, 10, 15, 56, 27, 0, 0, 0	# [1][0] [2][1] [3][2] [4][3] [0][4]

chi_perm:
	.quad	0, 4, 3, 2, 1, 5, 6, 7
	.quad	1, 0, 4, 3, 2, 5, 6, 7
	.quad	2, 1, 0, 4, 3, 5, 6, 7
	.quad	3, 2, 1, 0, 4, 5, 6, 7
	.quad	4, 3, 2, 1, 0, 5, 6, 7
rhotates0:
	.quad	 0,  1, 62, 28, 27, 0, 0, 0
	.quad	36, 44,  6, 55, 20, 0, 0, 0
	.quad	 3, 10, 43, 25, 39, 0, 0, 0
	.quad	41, 45, 15, 21,  8, 0, 0, 0
	.quad	18,  2, 61, 56, 14, 0, 0, 0

pi0_perm:
	.quad	0, 3, 1, 4, 2, 5, 6, 7
	.quad	1, 4, 2, 0, 3, 5, 6, 7
	.quad	2, 0, 3, 1, 4, 5, 6, 7
	.quad	3, 1, 4, 2, 0, 5, 6, 7
	.quad	4, 2, 0, 3, 1, 5, 6, 7


iotas:
	.quad	0x0000000000000001