Commit 367c5527 authored by Andy Polyakov
Browse files

sha/asm/keccak1600-armv4.pl: add NEON code path.



Reviewed-by: Rich Salz <rsalz@openssl.org>
parent 56676f87
Loading
Loading
Loading
Loading
+530 −20
Original line number Diff line number Diff line
@@ -17,25 +17,31 @@
#
# June 2017.
#
# This is KECCAK_1X variant (see keccak1600.c) with bit interleaving.
# How does it compare to Keccak Code Package? It's as fast, but several
# times smaller, and is endian- and ISA-neutral. ISA neutrality means
# that minimum ISA requirement is ARMv4, yet it can be assembled even
# as ARMv7 Thumb-2.
# Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
# interleaving. How does it compare to Keccak Code Package? It's as
# fast, but several times smaller, and is endian- and ISA-neutral. ISA
# neutrality means that minimum ISA requirement is ARMv4, yet it can
# be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
# register layout taken from Keccak Code Package. It's also as fast,
# in fact faster by 10-15% on some processors, and endian-neutral.
#
########################################################################
# Numbers are cycles per processed byte accounting even for input bit
# interleaving.
# Numbers are cycles per processed byte. Non-NEON results account even
# for input bit interleaving [which takes ~1/4-1/3 of time].
#
#		r=1600(*)	r=1024
#		r=1600(*),NEON		r=1088(**),NEON
#
# Cortex-A7	71/+180%	103
# Cortex-A8	48/+290%	69
# Cortex-A15	34/+210%	49
# Cortex-A5	80/+220%, 24		110,       36
# Cortex-A7	71/+180%, 23		99,        34
# Cortex-A8	48/+290%, 20		67,        30
# Cortex-A9	48/+290%, 17		66,        26
# Cortex-A15	34/+210%, 12		47,        18
# Snapdragon S4	44/+230%, 16		59,        24
#
# (*)	Not used in real life, meaningful as estimate for single sponge
#	operation performance. Numbers after slash are improvement over
#	compiler-generated KECCAK_1X reference code.
# (*)	Not used in real life, meaningful as estimate for single absorb
#	operation performance. Percentage after slash is improvement
#	over compiler-generated KECCAK_1X reference code.
# (**)	Corresponds to SHA3-256, 8KB message size.

my @C = map("r$_",(0..9));
my @E = map("r$_",(10..12,14));
@@ -72,9 +78,9 @@ $code.=<<___;
.code	32
#endif

.type	iotas,%object
.type	iotas32, %object
.align	5
iotas:
iotas32:
	.long	0x00000001, 0x00000000
	.long	0x00000000, 0x00000089
	.long	0x00000000, 0x8000008b
@@ -99,7 +105,7 @@ iotas:
	.long	0x00000000, 0x80000088
	.long	0x00000001, 0x00008000
	.long	0x00000000, 0x80008082
.size	iotas,.-iotas
.size	iotas32,.-iotas32

.type	KeccakF1600_int, %function
.align	5
@@ -254,7 +260,7 @@ KeccakF1600_enter:
	 ldr	@E[2],[sp,#324]			@ load counter
	eor	@C[2],@C[2],@E[0]
	ror	@C[4],@E[3],#32-22
	 adr	@E[3],iotas
	 adr	@E[3],iotas32
	eor	@C[3],@C[3],@E[1]
	ror	@C[2],@C[2],#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
	 add	@E[3],@E[3],@E[2]
@@ -791,11 +797,515 @@ SHA3_squeeze:
.Lsqueeze_done:
	ldmia	sp!,{r4-r10,pc}
.size	SHA3_squeeze,.-SHA3_squeeze
.asciz	"Keccak-1600 absorb and squeeze for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
}

$code.=<<___;
.fpu	neon

.type	iotas64, %object
.align 5
iotas64:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas64,.-iotas64

.type	KeccakF1600_neon, %function
.align	5
KeccakF1600_neon:
	add	r1, r0, #16
	adr	r2, iotas64
	mov	r3, #24			@ loop counter
	b	.Loop_neon

.align	4
.Loop_neon:
	@ Theta
	vst1.64		{q4},  [r0:64]		@ offload A[0..1][4]
	veor		q13, q0,  q5		@ A[0..1][0]^A[2..3][0]
	vst1.64		{d18}, [r1:64]		@ offload A[2][4]
	veor		q14, q1,  q6		@ A[0..1][1]^A[2..3][1]
	veor		q15, q2,  q7		@ A[0..1][2]^A[2..3][2]
	veor		d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
	veor		d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
	veor		q14, q3,  q8		@ A[0..1][3]^A[2..3][3]
	veor		q4,  q4,  q9		@ A[0..1][4]^A[2..3][4]
	veor		d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
	veor		d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
	veor		d25, d8,  d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
	veor		q13, q13, q10		@ C[0..1]^=A[4][0..1]
	veor		q14, q15, q11		@ C[2..3]^=A[4][2..3]
	veor		d25, d25, d24		@ C[4]^=A[4][4]

	vadd.u64	q4,  q13, q13		@ C[0..1]<<1
	vadd.u64	q15, q14, q14		@ C[2..3]<<1
	vadd.u64	d18, d25, d25		@ C[4]<<1
	vsri.u64	q4,  q13, #63		@ ROL64(C[0..1],1)
	vsri.u64	q15, q14, #63		@ ROL64(C[2..3],1)
	vsri.u64	d18, d25, #63		@ ROL64(C[4],1)
	veor		d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
	veor		q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
	veor		d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
	veor		d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)

	veor		d0,  d0,  d25		@ A[0][0] ^= C[4]
	veor		d1,  d1,  d25		@ A[1][0] ^= C[4]
	veor		d10, d10, d25		@ A[2][0] ^= C[4]
	veor		d11, d11, d25		@ A[3][0] ^= C[4]
	veor		d20, d20, d25		@ A[4][0] ^= C[4]

	veor		d2,  d2,  d26		@ A[0][1] ^= D[1]
	veor		d3,  d3,  d26		@ A[1][1] ^= D[1]
	veor		d12, d12, d26		@ A[2][1] ^= D[1]
	veor		d13, d13, d26		@ A[3][1] ^= D[1]
	veor		d21, d21, d26		@ A[4][1] ^= D[1]
	vmov		d26, d27

	veor		d6,  d6,  d28		@ A[0][3] ^= C[2]
	veor		d7,  d7,  d28		@ A[1][3] ^= C[2]
	veor		d16, d16, d28		@ A[2][3] ^= C[2]
	veor		d17, d17, d28		@ A[3][3] ^= C[2]
	veor		d23, d23, d28		@ A[4][3] ^= C[2]
	vld1.64		{q4},  [r0:64]		@ restore A[0..1][4]
	vmov		d28, d29

	vld1.64		{d18}, [r1:64]		@ restore A[2][4]
	veor		q2,  q2,  q13		@ A[0..1][2] ^= D[2]
	veor		q7,  q7,  q13		@ A[2..3][2] ^= D[2]
	veor		d22, d22, d27		@ A[4][2]    ^= D[2]

	veor		q4,  q4,  q14		@ A[0..1][4] ^= C[3]
	veor		q9,  q9,  q14		@ A[2..3][4] ^= C[3]
	veor		d24, d24, d29		@ A[4][4]    ^= C[3]

	@ Rho + Pi
	vmov		d26, d2			@ C[1] = A[0][1]
	vshl.u64	d2,  d3,  #44
	vmov		d27, d4			@ C[2] = A[0][2]
	vshl.u64	d4,  d14, #43
	vmov		d28, d6			@ C[3] = A[0][3]
	vshl.u64	d6,  d17, #21
	vmov		d29, d8			@ C[4] = A[0][4]
	vshl.u64	d8,  d24, #14
	vsri.u64	d2,  d3,  #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
	vsri.u64	d4,  d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
	vsri.u64	d6,  d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
	vsri.u64	d8,  d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])

	vshl.u64	d3,  d9,  #20
	vshl.u64	d14, d16, #25
	vshl.u64	d17, d15, #15
	vshl.u64	d24, d21, #2
	vsri.u64	d3,  d9,  #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
	vsri.u64	d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
	vsri.u64	d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
	vsri.u64	d24, d21, #64-2		@ A[4][4] = ROL64(A[4][1], rhotates[4][1])

	vshl.u64	d9,  d22, #61
	@ vshl.u64	d16, d19, #8
	vshl.u64	d15, d12, #10
	vshl.u64	d21, d7,  #55
	vsri.u64	d9,  d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
	vext.8		d16, d19, d19, #8-1	@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
	vsri.u64	d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
	vsri.u64	d21, d7,  #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])

	vshl.u64	d22, d18, #39
	@ vshl.u64	d19, d23, #56
	vshl.u64	d12, d5,  #6
	vshl.u64	d7,  d13, #45
	vsri.u64	d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
	vext.8		d19, d23, d23, #8-7	@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
	vsri.u64	d12, d5,  #64-6		@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
	vsri.u64	d7,  d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])

	vshl.u64	d18, d20, #18
	vshl.u64	d23, d11, #41
	vshl.u64	d5,  d10, #3
	vshl.u64	d13, d1,  #36
	vsri.u64	d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
	vsri.u64	d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
	vsri.u64	d5,  d10, #64-3		@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
	vsri.u64	d13, d1,  #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])

	vshl.u64	d1,  d28, #28
	vshl.u64	d10, d26, #1
	vshl.u64	d11, d29, #27
	vshl.u64	d20, d27, #62
	vsri.u64	d1,  d28, #64-28	@ A[1][0] = ROL64(C[3],    rhotates[0][3])
	vsri.u64	d10, d26, #64-1		@ A[2][0] = ROL64(C[1],    rhotates[0][1])
	vsri.u64	d11, d29, #64-27	@ A[3][0] = ROL64(C[4],    rhotates[0][4])
	vsri.u64	d20, d27, #64-62	@ A[4][0] = ROL64(C[2],    rhotates[0][2])

	@ Chi + Iota
	vbic		q13, q2,  q1
	vbic		q14, q3,  q2
	vbic		q15, q4,  q3
	veor		q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
	veor		q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
	veor		q2,  q2,  q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vst1.64		{q13}, [r0:64]		@ offload A[0..1][0]
	vbic		q13, q0,  q4
	vbic		q15, q1,  q0
	vmov		q1,  q14		@ A[0..1][1]
	veor		q3,  q3,  q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	veor		q4,  q4,  q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vbic		q13, q7,  q6
	vmov		q0,  q5			@ A[2..3][0]
	vbic		q14, q8,  q7
	vmov		q15, q6			@ A[2..3][1]
	veor		q5,  q5,  q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vbic		q13, q9,  q8
	veor		q6,  q6,  q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vbic		q14, q0,  q9
	veor		q7,  q7,  q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vbic		q13, q15, q0
	veor		q8,  q8,  q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vmov		q14, q10		@ A[4][0..1]
	veor		q9,  q9,  q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vld1.64		d25, [r2:64]!		@ Iota[i++]
	vbic		d26, d22, d21
	vbic		d27, d23, d22
	vld1.64		{q0}, [r0:64]		@ restore A[0..1][0]
	veor		d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
	vbic		d26, d24, d23
	veor		d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
	vbic		d27, d28, d24
	veor		d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
	vbic		d26, d29, d28
	veor		d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
	veor		d0,  d0,  d25		@ A[0][0] ^= Iota[i]
	veor		d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])

	subs	r3, r3, #1
	bne	.Loop_neon

	bx	lr
.size	KeccakF1600_neon,.-KeccakF1600_neon

.global	SHA3_absorb_neon
.type	SHA3_absorb_neon, %function
.align	5
SHA3_absorb_neon:
	stmdb	sp!, {r4-r6,lr}
	vstmdb	sp!, {d8-d15}

	mov	r4, r1			@ inp
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz

	vld1.32	{d0}, [r0:64]!		@ A[0][0]
	vld1.32	{d2}, [r0:64]!		@ A[0][1]
	vld1.32	{d4}, [r0:64]!		@ A[0][2]
	vld1.32	{d6}, [r0:64]!		@ A[0][3]
	vld1.32	{d8}, [r0:64]!		@ A[0][4]

	vld1.32	{d1}, [r0:64]!		@ A[1][0]
	vld1.32	{d3}, [r0:64]!		@ A[1][1]
	vld1.32	{d5}, [r0:64]!		@ A[1][2]
	vld1.32	{d7}, [r0:64]!		@ A[1][3]
	vld1.32	{d9}, [r0:64]!		@ A[1][4]

	vld1.32	{d10}, [r0:64]!		@ A[2][0]
	vld1.32	{d12}, [r0:64]!		@ A[2][1]
	vld1.32	{d14}, [r0:64]!		@ A[2][2]
	vld1.32	{d16}, [r0:64]!		@ A[2][3]
	vld1.32	{d18}, [r0:64]!		@ A[2][4]

	vld1.32	{d11}, [r0:64]!		@ A[3][0]
	vld1.32	{d13}, [r0:64]!		@ A[3][1]
	vld1.32	{d15}, [r0:64]!		@ A[3][2]
	vld1.32	{d17}, [r0:64]!		@ A[3][3]
	vld1.32	{d19}, [r0:64]!		@ A[3][4]

	vld1.32	{d20-d23}, [r0:64]!	@ A[4][0..3]
	vld1.32	{d24}, [r0:64]		@ A[4][4]
	sub	r0, r0, #24*8		@ rewind
	b	.Loop_absorb_neon

.align	4
.Loop_absorb_neon:
	subs	r12, r5, r6		@ len - bsz
	blo	.Labsorbed_neon
	mov	r5, r12

	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
	cmp	r6, #8*2
	veor	d0, d0, d31		@ A[0][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d2, d2, d31		@ A[0][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*4
	veor	d4, d4, d31		@ A[0][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d6, d6, d31		@ A[0][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31},[r4]!
	cmp	r6, #8*6
	veor	d8, d8, d31		@ A[0][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d1, d1, d31		@ A[1][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*8
	veor	d3, d3, d31		@ A[1][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d5, d5, d31		@ A[1][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*10
	veor	d7, d7, d31		@ A[1][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d9, d9, d31		@ A[1][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*12
	veor	d10, d10, d31		@ A[2][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d12, d12, d31		@ A[2][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*14
	veor	d14, d14, d31		@ A[2][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d16, d16, d31		@ A[2][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*16
	veor	d18, d18, d31		@ A[2][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d11, d11, d31		@ A[3][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*18
	veor	d13, d13, d31		@ A[3][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d15, d15, d31		@ A[3][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*20
	veor	d17, d17, d31		@ A[3][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d19, d19, d31		@ A[3][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*22
	veor	d20, d20, d31		@ A[4][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d21, d21, d31		@ A[4][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*24
	veor	d22, d22, d31		@ A[4][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d23, d23, d31		@ A[4][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d24, d24, d31		@ A[4][4] ^= *inp++

.Lprocess_neon:
	bl	KeccakF1600_neon
	b 	.Loop_absorb_neon

.align	4
.Labsorbed_neon:
	vst1.32	{d0}, [r0:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0:64]!
	vst1.32	{d4}, [r0:64]!
	vst1.32	{d6}, [r0:64]!
	vst1.32	{d8}, [r0:64]!

	vst1.32	{d1}, [r0:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0:64]!
	vst1.32	{d5}, [r0:64]!
	vst1.32	{d7}, [r0:64]!
	vst1.32	{d9}, [r0:64]!

	vst1.32	{d10}, [r0:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0:64]!
	vst1.32	{d14}, [r0:64]!
	vst1.32	{d16}, [r0:64]!
	vst1.32	{d18}, [r0:64]!

	vst1.32	{d11}, [r0:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0:64]!
	vst1.32	{d15}, [r0:64]!
	vst1.32	{d17}, [r0:64]!
	vst1.32	{d19}, [r0:64]!

	vst1.32	{d20-d23}, [r0:64]!	@ A[4][0..4]
	vst1.32	{d24}, [r0:64]

	mov	r0, r5			@ return value
	vldmia	sp!, {d8-d15}
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_absorb_neon,.-SHA3_absorb_neon

.global	SHA3_squeeze_neon
.type	SHA3_squeeze_neon, %function
.align	5
SHA3_squeeze_neon:
	stmdb	sp!, {r4-r6,lr}

	mov	r4, r1			@ out
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz
	mov	r12, r0			@ A_flat
	mov	r14, r3			@ bsz
	b	.Loop_squeeze_neon

.align	4
.Loop_squeeze_neon:
	cmp	r5, #8
	blo	.Lsqueeze_neon_tail
	vld1.32	{d0}, [r12]!
	vst1.8	{d0}, [r4]!		@ endian-neutral store

	subs	r5, r5, #8		@ len -= 8
	beq	.Lsqueeze_neon_done

	subs	r14, r14, #8		@ bsz -= 8
	bhi	.Loop_squeeze_neon

	vstmdb	sp!,  {d8-d15}

	vld1.32	{d0}, [r0:64]!		@ A[0][0..4]
	vld1.32	{d2}, [r0:64]!
	vld1.32	{d4}, [r0:64]!
	vld1.32	{d6}, [r0:64]!
	vld1.32	{d8}, [r0:64]!

	vld1.32	{d1}, [r0:64]!		@ A[1][0..4]
	vld1.32	{d3}, [r0:64]!
	vld1.32	{d5}, [r0:64]!
	vld1.32	{d7}, [r0:64]!
	vld1.32	{d9}, [r0:64]!

	vld1.32	{d10}, [r0:64]!		@ A[2][0..4]
	vld1.32	{d12}, [r0:64]!
	vld1.32	{d14}, [r0:64]!
	vld1.32	{d16}, [r0:64]!
	vld1.32	{d18}, [r0:64]!

	vld1.32	{d11}, [r0:64]!		@ A[3][0..4]
	vld1.32	{d13}, [r0:64]!
	vld1.32	{d15}, [r0:64]!
	vld1.32	{d17}, [r0:64]!
	vld1.32	{d19}, [r0:64]!

	vld1.32	{d20-d23}, [r0:64]!	@ A[4][0..4]
	vld1.32	{d24}, [r0:64]
	sub	r0, r0, #24*8		@ rewind

	bl	KeccakF1600_neon

	mov	r12, r0			@ A_flat
	vst1.32	{d0}, [r0:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0:64]!
	vst1.32	{d4}, [r0:64]!
	vst1.32	{d6}, [r0:64]!
	vst1.32	{d8}, [r0:64]!

	vst1.32	{d1}, [r0:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0:64]!
	vst1.32	{d5}, [r0:64]!
	vst1.32	{d7}, [r0:64]!
	vst1.32	{d9}, [r0:64]!

	vst1.32	{d10}, [r0:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0:64]!
	vst1.32	{d14}, [r0:64]!
	vst1.32	{d16}, [r0:64]!
	vst1.32	{d18}, [r0:64]!

	vst1.32	{d11}, [r0:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0:64]!
	vst1.32	{d15}, [r0:64]!
	vst1.32	{d17}, [r0:64]!
	vst1.32	{d19}, [r0:64]!

	vst1.32	{d20-d23}, [r0:64]!	@ A[4][0..4]
	mov	r14, r6			@ bsz
	vst1.32	{d24}, [r0:64]
	mov	r0,  r12		@ rewind

	vldmia	sp!, {d8-d15}
	b	.Loop_squeeze_neon

.align	4
.Lsqueeze_neon_tail:
	ldmia	r12, {r2,r3}
	cmp	r5, #2
	strb	r2, [r4],#1		@ endian-neutral store
	lsr	r2, r2, #8
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	beq	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	cmp	r5, #4
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	beq	.Lsqueeze_neon_done

	strb	r3, [r4], #1
	lsr	r3, r3, #8
	cmp	r5, #6
	blo	.Lsqueeze_neon_done
	strb	r3, [r4], #1
	lsr	r3, r3, #8
	beq	.Lsqueeze_neon_done
	strb	r3, [r4], #1

.Lsqueeze_neon_done:
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_squeeze_neon,.-SHA3_squeeze_neon
.asciz	"Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

print $code;

close STDOUT; # enforce flush