Commit 2d22e080 authored by Andy Polyakov
Browse files

ARM assembler pack: reschedule instructions for dual-issue pipeline.

The modest improvement coefficients indicate that the code already had
some parallelism, so there was not much room for further improvement. Special
thanks to Ted Krovetz for benchmarking the code with such patience.
parent 0852f90c
Loading
Loading
Loading
Loading
+194 −199
Original line number Diff line number Diff line
@@ -16,12 +16,17 @@
# allows to merge logical or arithmetic operation with shift or rotate
# in one instruction and emit combined result every cycle. The module
# is endian-neutral. The performance is ~42 cycles/byte for 128-bit
# key.
# key [on single-issue Xscale PXA250 core].

# May 2007.
#
# AES_set_[en|de]crypt_key is added.

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 12% improvement on
# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

@@ -167,24 +172,24 @@ AES_encrypt:
	ldrb	$t2,[$rounds,#1]
	ldrb	$t3,[$rounds,#0]
	orr	$s0,$s0,$t1,lsl#8
	orr	$s0,$s0,$t2,lsl#16
	orr	$s0,$s0,$t3,lsl#24
	ldrb	$s1,[$rounds,#7]
	orr	$s0,$s0,$t2,lsl#16
	ldrb	$t1,[$rounds,#6]
	orr	$s0,$s0,$t3,lsl#24
	ldrb	$t2,[$rounds,#5]
	ldrb	$t3,[$rounds,#4]
	orr	$s1,$s1,$t1,lsl#8
	orr	$s1,$s1,$t2,lsl#16
	orr	$s1,$s1,$t3,lsl#24
	ldrb	$s2,[$rounds,#11]
	orr	$s1,$s1,$t2,lsl#16
	ldrb	$t1,[$rounds,#10]
	orr	$s1,$s1,$t3,lsl#24
	ldrb	$t2,[$rounds,#9]
	ldrb	$t3,[$rounds,#8]
	orr	$s2,$s2,$t1,lsl#8
	orr	$s2,$s2,$t2,lsl#16
	orr	$s2,$s2,$t3,lsl#24
	ldrb	$s3,[$rounds,#15]
	orr	$s2,$s2,$t2,lsl#16
	ldrb	$t1,[$rounds,#14]
	orr	$s2,$s2,$t3,lsl#24
	ldrb	$t2,[$rounds,#13]
	ldrb	$t3,[$rounds,#12]
	orr	$s3,$s3,$t1,lsl#8
@@ -199,24 +204,24 @@ AES_encrypt:
	mov	$t3,$s0,lsr#8
	strb	$t1,[$rounds,#0]
	strb	$t2,[$rounds,#1]
	strb	$t3,[$rounds,#2]
	strb	$s0,[$rounds,#3]
	mov	$t1,$s1,lsr#24
	strb	$t3,[$rounds,#2]
	mov	$t2,$s1,lsr#16
	strb	$s0,[$rounds,#3]
	mov	$t3,$s1,lsr#8
	strb	$t1,[$rounds,#4]
	strb	$t2,[$rounds,#5]
	strb	$t3,[$rounds,#6]
	strb	$s1,[$rounds,#7]
	mov	$t1,$s2,lsr#24
	strb	$t3,[$rounds,#6]
	mov	$t2,$s2,lsr#16
	strb	$s1,[$rounds,#7]
	mov	$t3,$s2,lsr#8
	strb	$t1,[$rounds,#8]
	strb	$t2,[$rounds,#9]
	strb	$t3,[$rounds,#10]
	strb	$s2,[$rounds,#11]
	mov	$t1,$s3,lsr#24
	strb	$t3,[$rounds,#10]
	mov	$t2,$s3,lsr#16
	strb	$s2,[$rounds,#11]
	mov	$t3,$s3,lsr#8
	strb	$t1,[$rounds,#12]
	strb	$t2,[$rounds,#13]
@@ -233,141 +238,137 @@ AES_encrypt:
.align	2
_armv4_AES_encrypt:
	str	lr,[sp,#-4]!		@ push lr
	ldr	$t1,[$key],#16
	ldr	$t2,[$key,#-12]
	ldr	$t3,[$key,#-8]
	ldr	$i1,[$key,#-4]
	ldr	$rounds,[$key,#240-16]
	ldmia	$key!,{$t1-$i1}
	eor	$s0,$s0,$t1
	ldr	$rounds,[$key,#240-16]
	eor	$s1,$s1,$t2
	eor	$s2,$s2,$t3
	eor	$s3,$s3,$i1
	sub	$rounds,$rounds,#1
	mov	lr,#255

.Lenc_loop:
	and	$i1,lr,$s0
	and	$i2,lr,$s0,lsr#8
	and	$i3,lr,$s0,lsr#16
	and	$i1,lr,$s0
	mov	$s0,$s0,lsr#24
.Lenc_loop:
	ldr	$t1,[$tbl,$i1,lsl#2]	@ Te3[s0>>0]
	ldr	$s0,[$tbl,$s0,lsl#2]	@ Te0[s0>>24]
	ldr	$t2,[$tbl,$i2,lsl#2]	@ Te2[s0>>8]
	ldr	$t3,[$tbl,$i3,lsl#2]	@ Te1[s0>>16]

	and	$i1,lr,$s1,lsr#16	@ i0
	ldr	$t2,[$tbl,$i2,lsl#2]	@ Te2[s0>>8]
	and	$i2,lr,$s1
	ldr	$t3,[$tbl,$i3,lsl#2]	@ Te1[s0>>16]
	and	$i3,lr,$s1,lsr#8
	ldr	$s0,[$tbl,$s0,lsl#2]	@ Te0[s0>>24]
	mov	$s1,$s1,lsr#24

	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te1[s1>>16]
	ldr	$s1,[$tbl,$s1,lsl#2]	@ Te0[s1>>24]
	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te3[s1>>0]
	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te2[s1>>8]
	eor	$s0,$s0,$i1,ror#8
	eor	$s1,$s1,$t1,ror#24
	eor	$t2,$t2,$i2,ror#8
	eor	$t3,$t3,$i3,ror#8

	ldr	$s1,[$tbl,$s1,lsl#2]	@ Te0[s1>>24]
	and	$i1,lr,$s2,lsr#8	@ i0
	eor	$t2,$t2,$i2,ror#8
	and	$i2,lr,$s2,lsr#16	@ i1
	eor	$t3,$t3,$i3,ror#8
	and	$i3,lr,$s2
	mov	$s2,$s2,lsr#24
	eor	$s1,$s1,$t1,ror#24
	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te2[s2>>8]
	mov	$s2,$s2,lsr#24

	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te1[s2>>16]
	ldr	$s2,[$tbl,$s2,lsl#2]	@ Te0[s2>>24]
	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te3[s2>>0]
	eor	$s0,$s0,$i1,ror#16
	eor	$s1,$s1,$i2,ror#8
	eor	$s2,$s2,$t2,ror#16
	eor	$t3,$t3,$i3,ror#16

	ldr	$s2,[$tbl,$s2,lsl#2]	@ Te0[s2>>24]
	and	$i1,lr,$s3		@ i0
	eor	$s1,$s1,$i2,ror#8
	and	$i2,lr,$s3,lsr#8	@ i1
	eor	$t3,$t3,$i3,ror#16
	and	$i3,lr,$s3,lsr#16	@ i2
	mov	$s3,$s3,lsr#24
	eor	$s2,$s2,$t2,ror#16
	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te3[s3>>0]
	mov	$s3,$s3,lsr#24

	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te2[s3>>8]
	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te1[s3>>16]
	ldr	$s3,[$tbl,$s3,lsl#2]	@ Te0[s3>>24]
	eor	$s0,$s0,$i1,ror#24
	ldr	$s3,[$tbl,$s3,lsl#2]	@ Te0[s3>>24]
	eor	$s1,$s1,$i2,ror#16
	ldr	$i1,[$key],#16
	eor	$s2,$s2,$i3,ror#8
	ldr	$t1,[$key,#-12]
	eor	$s3,$s3,$t3,ror#8

	ldr	$t1,[$key],#16
	ldr	$t2,[$key,#-12]
	ldr	$t3,[$key,#-8]
	ldr	$i1,[$key,#-4]
	eor	$s0,$s0,$t1
	eor	$s1,$s1,$t2
	eor	$s2,$s2,$t3
	eor	$s3,$s3,$i1
	ldr	$t2,[$key,#-8]
	eor	$s0,$s0,$i1
	ldr	$t3,[$key,#-4]
	and	$i1,lr,$s0
	eor	$s1,$s1,$t1
	and	$i2,lr,$s0,lsr#8
	eor	$s2,$s2,$t2
	and	$i3,lr,$s0,lsr#16
	eor	$s3,$s3,$t3
	mov	$s0,$s0,lsr#24

	subs	$rounds,$rounds,#1
	bne	.Lenc_loop

	add	$tbl,$tbl,#2

	and	$i1,lr,$s0
	and	$i2,lr,$s0,lsr#8
	and	$i3,lr,$s0,lsr#16
	mov	$s0,$s0,lsr#24
	ldrb	$t1,[$tbl,$i1,lsl#2]	@ Te4[s0>>0]
	ldrb	$s0,[$tbl,$s0,lsl#2]	@ Te4[s0>>24]
	ldrb	$t2,[$tbl,$i2,lsl#2]	@ Te4[s0>>8]
	ldrb	$t3,[$tbl,$i3,lsl#2]	@ Te4[s0>>16]

	and	$i1,lr,$s1,lsr#16	@ i0
	ldrb	$t2,[$tbl,$i2,lsl#2]	@ Te4[s0>>8]
	and	$i2,lr,$s1
	ldrb	$t3,[$tbl,$i3,lsl#2]	@ Te4[s0>>16]
	and	$i3,lr,$s1,lsr#8
	ldrb	$s0,[$tbl,$s0,lsl#2]	@ Te4[s0>>24]
	mov	$s1,$s1,lsr#24

	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s1>>16]
	ldrb	$s1,[$tbl,$s1,lsl#2]	@ Te4[s1>>24]
	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s1>>0]
	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s1>>8]
	eor	$s0,$i1,$s0,lsl#8
	eor	$s1,$t1,$s1,lsl#24
	eor	$t2,$i2,$t2,lsl#8
	eor	$t3,$i3,$t3,lsl#8

	ldrb	$s1,[$tbl,$s1,lsl#2]	@ Te4[s1>>24]
	and	$i1,lr,$s2,lsr#8	@ i0
	eor	$t2,$i2,$t2,lsl#8
	and	$i2,lr,$s2,lsr#16	@ i1
	eor	$t3,$i3,$t3,lsl#8
	and	$i3,lr,$s2
	mov	$s2,$s2,lsr#24
	eor	$s1,$t1,$s1,lsl#24
	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s2>>8]
	mov	$s2,$s2,lsr#24

	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s2>>16]
	ldrb	$s2,[$tbl,$s2,lsl#2]	@ Te4[s2>>24]
	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s2>>0]
	eor	$s0,$i1,$s0,lsl#8
	eor	$s1,$s1,$i2,lsl#16
	eor	$s2,$t2,$s2,lsl#24
	eor	$t3,$i3,$t3,lsl#8

	ldrb	$s2,[$tbl,$s2,lsl#2]	@ Te4[s2>>24]
	and	$i1,lr,$s3		@ i0
	eor	$s1,$s1,$i2,lsl#16
	and	$i2,lr,$s3,lsr#8	@ i1
	eor	$t3,$i3,$t3,lsl#8
	and	$i3,lr,$s3,lsr#16	@ i2
	mov	$s3,$s3,lsr#24
	eor	$s2,$t2,$s2,lsl#24
	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s3>>0]
	mov	$s3,$s3,lsr#24

	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s3>>8]
	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s3>>16]
	ldrb	$s3,[$tbl,$s3,lsl#2]	@ Te4[s3>>24]
	eor	$s0,$i1,$s0,lsl#8
	ldrb	$s3,[$tbl,$s3,lsl#2]	@ Te4[s3>>24]
	ldr	$i1,[$key,#0]
	eor	$s1,$s1,$i2,lsl#8
	ldr	$t1,[$key,#4]
	eor	$s2,$s2,$i3,lsl#16
	ldr	$t2,[$key,#8]
	eor	$s3,$t3,$s3,lsl#24
	ldr	$t3,[$key,#12]

	ldr	lr,[sp],#4		@ pop lr
	ldr	$t1,[$key,#0]
	ldr	$t2,[$key,#4]
	ldr	$t3,[$key,#8]
	ldr	$i1,[$key,#12]
	eor	$s0,$s0,$t1
	eor	$s1,$s1,$t2
	eor	$s2,$s2,$t3
	eor	$s3,$s3,$i1
	eor	$s0,$s0,$i1
	eor	$s1,$s1,$t1
	eor	$s2,$s2,$t2
	eor	$s3,$s3,$t3

	sub	$tbl,$tbl,#2
	mov	pc,lr			@ return
	ldr	pc,[sp],#4		@ pop and return
.size	_armv4_AES_encrypt,.-_armv4_AES_encrypt

.global AES_set_encrypt_key
@@ -402,31 +403,31 @@ AES_set_encrypt_key:
	ldrb	$t2,[$rounds,#1]
	ldrb	$t3,[$rounds,#0]
	orr	$s0,$s0,$t1,lsl#8
	orr	$s0,$s0,$t2,lsl#16
	orr	$s0,$s0,$t3,lsl#24
	ldrb	$s1,[$rounds,#7]
	orr	$s0,$s0,$t2,lsl#16
	ldrb	$t1,[$rounds,#6]
	orr	$s0,$s0,$t3,lsl#24
	ldrb	$t2,[$rounds,#5]
	ldrb	$t3,[$rounds,#4]
	orr	$s1,$s1,$t1,lsl#8
	orr	$s1,$s1,$t2,lsl#16
	orr	$s1,$s1,$t3,lsl#24
	ldrb	$s2,[$rounds,#11]
	orr	$s1,$s1,$t2,lsl#16
	ldrb	$t1,[$rounds,#10]
	orr	$s1,$s1,$t3,lsl#24
	ldrb	$t2,[$rounds,#9]
	ldrb	$t3,[$rounds,#8]
	orr	$s2,$s2,$t1,lsl#8
	orr	$s2,$s2,$t2,lsl#16
	orr	$s2,$s2,$t3,lsl#24
	ldrb	$s3,[$rounds,#15]
	orr	$s2,$s2,$t2,lsl#16
	ldrb	$t1,[$rounds,#14]
	orr	$s2,$s2,$t3,lsl#24
	ldrb	$t2,[$rounds,#13]
	ldrb	$t3,[$rounds,#12]
	orr	$s3,$s3,$t1,lsl#8
	orr	$s3,$s3,$t2,lsl#16
	orr	$s3,$s3,$t3,lsl#24
	str	$s0,[$key],#16
	orr	$s3,$s3,$t2,lsl#16
	str	$s1,[$key,#-12]
	orr	$s3,$s3,$t3,lsl#24
	str	$s2,[$key,#-8]
	str	$s3,[$key,#-4]

@@ -440,27 +441,26 @@ AES_set_encrypt_key:
.L128_loop:
	and	$t2,lr,$s3,lsr#24
	and	$i1,lr,$s3,lsr#16
	and	$i2,lr,$s3,lsr#8
	and	$i3,lr,$s3
	ldrb	$t2,[$tbl,$t2]
	and	$i2,lr,$s3,lsr#8
	ldrb	$i1,[$tbl,$i1]
	and	$i3,lr,$s3
	ldrb	$i2,[$tbl,$i2]
	ldrb	$i3,[$tbl,$i3]
	ldr	$t1,[$t3],#4			@ rcon[i++]
	orr	$t2,$t2,$i1,lsl#24
	ldrb	$i3,[$tbl,$i3]
	orr	$t2,$t2,$i2,lsl#16
	ldr	$t1,[$t3],#4			@ rcon[i++]
	orr	$t2,$t2,$i3,lsl#8
	eor	$t2,$t2,$t1
	eor	$s0,$s0,$t2			@ rk[4]=rk[0]^...
	eor	$s1,$s1,$s0			@ rk[5]=rk[1]^rk[4]
	eor	$s2,$s2,$s1			@ rk[6]=rk[2]^rk[5]
	eor	$s3,$s3,$s2			@ rk[7]=rk[3]^rk[6]
	str	$s0,[$key],#16
	eor	$s2,$s2,$s1			@ rk[6]=rk[2]^rk[5]
	str	$s1,[$key,#-12]
	eor	$s3,$s3,$s2			@ rk[7]=rk[3]^rk[6]
	str	$s2,[$key,#-8]
	str	$s3,[$key,#-4]

	subs	$rounds,$rounds,#1
	str	$s3,[$key,#-4]
	bne	.L128_loop
	sub	r2,$key,#176
	b	.Ldone
@@ -471,16 +471,16 @@ AES_set_encrypt_key:
	ldrb	$t2,[$rounds,#17]
	ldrb	$t3,[$rounds,#16]
	orr	$i2,$i2,$t1,lsl#8
	orr	$i2,$i2,$t2,lsl#16
	orr	$i2,$i2,$t3,lsl#24
	ldrb	$i3,[$rounds,#23]
	orr	$i2,$i2,$t2,lsl#16
	ldrb	$t1,[$rounds,#22]
	orr	$i2,$i2,$t3,lsl#24
	ldrb	$t2,[$rounds,#21]
	ldrb	$t3,[$rounds,#20]
	orr	$i3,$i3,$t1,lsl#8
	orr	$i3,$i3,$t2,lsl#16
	orr	$i3,$i3,$t3,lsl#24
	str	$i2,[$key],#8
	orr	$i3,$i3,$t3,lsl#24
	str	$i3,[$key,#-4]

	teq	lr,#192
@@ -494,27 +494,26 @@ AES_set_encrypt_key:
.L192_loop:
	and	$t2,lr,$i3,lsr#24
	and	$i1,lr,$i3,lsr#16
	and	$i2,lr,$i3,lsr#8
	and	$i3,lr,$i3
	ldrb	$t2,[$tbl,$t2]
	and	$i2,lr,$i3,lsr#8
	ldrb	$i1,[$tbl,$i1]
	and	$i3,lr,$i3
	ldrb	$i2,[$tbl,$i2]
	ldrb	$i3,[$tbl,$i3]
	ldr	$t1,[$t3],#4			@ rcon[i++]
	orr	$t2,$t2,$i1,lsl#24
	ldrb	$i3,[$tbl,$i3]
	orr	$t2,$t2,$i2,lsl#16
	ldr	$t1,[$t3],#4			@ rcon[i++]
	orr	$t2,$t2,$i3,lsl#8
	eor	$i3,$t2,$t1
	eor	$s0,$s0,$i3			@ rk[6]=rk[0]^...
	eor	$s1,$s1,$s0			@ rk[7]=rk[1]^rk[6]
	eor	$s2,$s2,$s1			@ rk[8]=rk[2]^rk[7]
	eor	$s3,$s3,$s2			@ rk[9]=rk[3]^rk[8]
	str	$s0,[$key],#24
	eor	$s2,$s2,$s1			@ rk[8]=rk[2]^rk[7]
	str	$s1,[$key,#-20]
	eor	$s3,$s3,$s2			@ rk[9]=rk[3]^rk[8]
	str	$s2,[$key,#-16]
	str	$s3,[$key,#-12]

	subs	$rounds,$rounds,#1
	str	$s3,[$key,#-12]
	subeq	r2,$key,#216
	beq	.Ldone

@@ -532,16 +531,16 @@ AES_set_encrypt_key:
	ldrb	$t2,[$rounds,#25]
	ldrb	$t3,[$rounds,#24]
	orr	$i2,$i2,$t1,lsl#8
	orr	$i2,$i2,$t2,lsl#16
	orr	$i2,$i2,$t3,lsl#24
	ldrb	$i3,[$rounds,#31]
	orr	$i2,$i2,$t2,lsl#16
	ldrb	$t1,[$rounds,#30]
	orr	$i2,$i2,$t3,lsl#24
	ldrb	$t2,[$rounds,#29]
	ldrb	$t3,[$rounds,#28]
	orr	$i3,$i3,$t1,lsl#8
	orr	$i3,$i3,$t2,lsl#16
	orr	$i3,$i3,$t3,lsl#24
	str	$i2,[$key],#8
	orr	$i3,$i3,$t3,lsl#24
	str	$i3,[$key,#-4]

	mov	$rounds,#14
@@ -553,52 +552,51 @@ AES_set_encrypt_key:
.L256_loop:
	and	$t2,lr,$i3,lsr#24
	and	$i1,lr,$i3,lsr#16
	and	$i2,lr,$i3,lsr#8
	and	$i3,lr,$i3
	ldrb	$t2,[$tbl,$t2]
	and	$i2,lr,$i3,lsr#8
	ldrb	$i1,[$tbl,$i1]
	and	$i3,lr,$i3
	ldrb	$i2,[$tbl,$i2]
	ldrb	$i3,[$tbl,$i3]
	ldr	$t1,[$t3],#4			@ rcon[i++]
	orr	$t2,$t2,$i1,lsl#24
	ldrb	$i3,[$tbl,$i3]
	orr	$t2,$t2,$i2,lsl#16
	ldr	$t1,[$t3],#4			@ rcon[i++]
	orr	$t2,$t2,$i3,lsl#8
	eor	$i3,$t2,$t1
	eor	$s0,$s0,$i3			@ rk[8]=rk[0]^...
	eor	$s1,$s1,$s0			@ rk[9]=rk[1]^rk[8]
	eor	$s2,$s2,$s1			@ rk[10]=rk[2]^rk[9]
	eor	$s3,$s3,$s2			@ rk[11]=rk[3]^rk[10]
	str	$s0,[$key],#32
	eor	$s2,$s2,$s1			@ rk[10]=rk[2]^rk[9]
	str	$s1,[$key,#-28]
	eor	$s3,$s3,$s2			@ rk[11]=rk[3]^rk[10]
	str	$s2,[$key,#-24]
	str	$s3,[$key,#-20]

	subs	$rounds,$rounds,#1
	str	$s3,[$key,#-20]
	subeq	r2,$key,#256
	beq	.Ldone

	and	$t2,lr,$s3
	and	$i1,lr,$s3,lsr#8
	and	$i2,lr,$s3,lsr#16
	and	$i3,lr,$s3,lsr#24
	ldrb	$t2,[$tbl,$t2]
	and	$i2,lr,$s3,lsr#16
	ldrb	$i1,[$tbl,$i1]
	and	$i3,lr,$s3,lsr#24
	ldrb	$i2,[$tbl,$i2]
	ldrb	$i3,[$tbl,$i3]
	orr	$t2,$t2,$i1,lsl#8
	ldrb	$i3,[$tbl,$i3]
	orr	$t2,$t2,$i2,lsl#16
	ldr	$t1,[$key,#-48]
	orr	$t2,$t2,$i3,lsl#24

	ldr	$t1,[$key,#-48]
	ldr	$i1,[$key,#-44]
	ldr	$i2,[$key,#-40]
	ldr	$i3,[$key,#-36]
	eor	$t1,$t1,$t2			@ rk[12]=rk[4]^...
	ldr	$i3,[$key,#-36]
	eor	$i1,$i1,$t1			@ rk[13]=rk[5]^rk[12]
	eor	$i2,$i2,$i1			@ rk[14]=rk[6]^rk[13]
	eor	$i3,$i3,$i2			@ rk[15]=rk[7]^rk[14]
	str	$t1,[$key,#-16]
	eor	$i2,$i2,$i1			@ rk[14]=rk[6]^rk[13]
	str	$i1,[$key,#-12]
	eor	$i3,$i3,$i2			@ rk[15]=rk[7]^rk[14]
	str	$i2,[$key,#-8]
	str	$i3,[$key,#-4]
	b	.L256_loop
@@ -819,24 +817,24 @@ AES_decrypt:
	ldrb	$t2,[$rounds,#1]
	ldrb	$t3,[$rounds,#0]
	orr	$s0,$s0,$t1,lsl#8
	orr	$s0,$s0,$t2,lsl#16
	orr	$s0,$s0,$t3,lsl#24
	ldrb	$s1,[$rounds,#7]
	orr	$s0,$s0,$t2,lsl#16
	ldrb	$t1,[$rounds,#6]
	orr	$s0,$s0,$t3,lsl#24
	ldrb	$t2,[$rounds,#5]
	ldrb	$t3,[$rounds,#4]
	orr	$s1,$s1,$t1,lsl#8
	orr	$s1,$s1,$t2,lsl#16
	orr	$s1,$s1,$t3,lsl#24
	ldrb	$s2,[$rounds,#11]
	orr	$s1,$s1,$t2,lsl#16
	ldrb	$t1,[$rounds,#10]
	orr	$s1,$s1,$t3,lsl#24
	ldrb	$t2,[$rounds,#9]
	ldrb	$t3,[$rounds,#8]
	orr	$s2,$s2,$t1,lsl#8
	orr	$s2,$s2,$t2,lsl#16
	orr	$s2,$s2,$t3,lsl#24
	ldrb	$s3,[$rounds,#15]
	orr	$s2,$s2,$t2,lsl#16
	ldrb	$t1,[$rounds,#14]
	orr	$s2,$s2,$t3,lsl#24
	ldrb	$t2,[$rounds,#13]
	ldrb	$t3,[$rounds,#12]
	orr	$s3,$s3,$t1,lsl#8
@@ -851,24 +849,24 @@ AES_decrypt:
	mov	$t3,$s0,lsr#8
	strb	$t1,[$rounds,#0]
	strb	$t2,[$rounds,#1]
	strb	$t3,[$rounds,#2]
	strb	$s0,[$rounds,#3]
	mov	$t1,$s1,lsr#24
	strb	$t3,[$rounds,#2]
	mov	$t2,$s1,lsr#16
	strb	$s0,[$rounds,#3]
	mov	$t3,$s1,lsr#8
	strb	$t1,[$rounds,#4]
	strb	$t2,[$rounds,#5]
	strb	$t3,[$rounds,#6]
	strb	$s1,[$rounds,#7]
	mov	$t1,$s2,lsr#24
	strb	$t3,[$rounds,#6]
	mov	$t2,$s2,lsr#16
	strb	$s1,[$rounds,#7]
	mov	$t3,$s2,lsr#8
	strb	$t1,[$rounds,#8]
	strb	$t2,[$rounds,#9]
	strb	$t3,[$rounds,#10]
	strb	$s2,[$rounds,#11]
	mov	$t1,$s3,lsr#24
	strb	$t3,[$rounds,#10]
	mov	$t2,$s3,lsr#16
	strb	$s2,[$rounds,#11]
	mov	$t3,$s3,lsr#8
	strb	$t1,[$rounds,#12]
	strb	$t2,[$rounds,#13]
@@ -885,146 +883,143 @@ AES_decrypt:
.align	2
_armv4_AES_decrypt:
	str	lr,[sp,#-4]!		@ push lr
	ldr	$t1,[$key],#16
	ldr	$t2,[$key,#-12]
	ldr	$t3,[$key,#-8]
	ldr	$i1,[$key,#-4]
	ldr	$rounds,[$key,#240-16]
	ldmia	$key!,{$t1-$i1}
	eor	$s0,$s0,$t1
	ldr	$rounds,[$key,#240-16]
	eor	$s1,$s1,$t2
	eor	$s2,$s2,$t3
	eor	$s3,$s3,$i1
	sub	$rounds,$rounds,#1
	mov	lr,#255

.Ldec_loop:
	and	$i1,lr,$s0,lsr#16
	and	$i2,lr,$s0,lsr#8
	and	$i3,lr,$s0
	mov	$s0,$s0,lsr#24
.Ldec_loop:
	ldr	$t1,[$tbl,$i1,lsl#2]	@ Td1[s0>>16]
	ldr	$s0,[$tbl,$s0,lsl#2]	@ Td0[s0>>24]
	ldr	$t2,[$tbl,$i2,lsl#2]	@ Td2[s0>>8]
	ldr	$t3,[$tbl,$i3,lsl#2]	@ Td3[s0>>0]

	and	$i1,lr,$s1		@ i0
	ldr	$t2,[$tbl,$i2,lsl#2]	@ Td2[s0>>8]
	and	$i2,lr,$s1,lsr#16
	ldr	$t3,[$tbl,$i3,lsl#2]	@ Td3[s0>>0]
	and	$i3,lr,$s1,lsr#8
	ldr	$s0,[$tbl,$s0,lsl#2]	@ Td0[s0>>24]
	mov	$s1,$s1,lsr#24

	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td3[s1>>0]
	ldr	$s1,[$tbl,$s1,lsl#2]	@ Td0[s1>>24]
	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td1[s1>>16]
	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td2[s1>>8]
	eor	$s0,$s0,$i1,ror#24
	eor	$s1,$s1,$t1,ror#8
	eor	$t2,$i2,$t2,ror#8
	eor	$t3,$i3,$t3,ror#8

	ldr	$s1,[$tbl,$s1,lsl#2]	@ Td0[s1>>24]
	and	$i1,lr,$s2,lsr#8	@ i0
	eor	$t2,$i2,$t2,ror#8
	and	$i2,lr,$s2		@ i1
	eor	$t3,$i3,$t3,ror#8
	and	$i3,lr,$s2,lsr#16
	mov	$s2,$s2,lsr#24
	eor	$s1,$s1,$t1,ror#8
	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td2[s2>>8]
	mov	$s2,$s2,lsr#24

	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td3[s2>>0]
	ldr	$s2,[$tbl,$s2,lsl#2]	@ Td0[s2>>24]
	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td1[s2>>16]
	eor	$s0,$s0,$i1,ror#16
	eor	$s1,$s1,$i2,ror#24
	eor	$s2,$s2,$t2,ror#8
	eor	$t3,$i3,$t3,ror#8

	ldr	$s2,[$tbl,$s2,lsl#2]	@ Td0[s2>>24]
	and	$i1,lr,$s3,lsr#16	@ i0
	eor	$s1,$s1,$i2,ror#24
	and	$i2,lr,$s3,lsr#8	@ i1
	eor	$t3,$i3,$t3,ror#8
	and	$i3,lr,$s3		@ i2
	mov	$s3,$s3,lsr#24
	eor	$s2,$s2,$t2,ror#8
	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td1[s3>>16]
	mov	$s3,$s3,lsr#24

	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td2[s3>>8]
	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td3[s3>>0]
	ldr	$s3,[$tbl,$s3,lsl#2]	@ Td0[s3>>24]
	eor	$s0,$s0,$i1,ror#8
	ldr	$s3,[$tbl,$s3,lsl#2]	@ Td0[s3>>24]
	eor	$s1,$s1,$i2,ror#16
	eor	$s2,$s2,$i3,ror#24
	ldr	$i1,[$key],#16
	eor	$s3,$s3,$t3,ror#8

	ldr	$t1,[$key],#16
	ldr	$t2,[$key,#-12]
	ldr	$t3,[$key,#-8]
	ldr	$i1,[$key,#-4]
	eor	$s0,$s0,$t1
	eor	$s1,$s1,$t2
	eor	$s2,$s2,$t3
	eor	$s3,$s3,$i1
	ldr	$t1,[$key,#-12]
	ldr	$t2,[$key,#-8]
	eor	$s0,$s0,$i1
	ldr	$t3,[$key,#-4]
	and	$i1,lr,$s0,lsr#16
	eor	$s1,$s1,$t1
	and	$i2,lr,$s0,lsr#8
	eor	$s2,$s2,$t2
	and	$i3,lr,$s0
	eor	$s3,$s3,$t3
	mov	$s0,$s0,lsr#24

	subs	$rounds,$rounds,#1
	bne	.Ldec_loop

	add	$tbl,$tbl,#1024

	ldr	$t1,[$tbl,#0]		@ prefetch Td4
	ldr	$t2,[$tbl,#32]
	ldr	$t3,[$tbl,#64]
	ldr	$i1,[$tbl,#96]
	ldr	$i2,[$tbl,#128]
	ldr	$i3,[$tbl,#160]
	ldr	$t1,[$tbl,#192]
	ldr	$t2,[$tbl,#224]
	ldr	$t2,[$tbl,#0]		@ prefetch Td4
	ldr	$t3,[$tbl,#32]
	ldr	$t1,[$tbl,#64]
	ldr	$t2,[$tbl,#96]
	ldr	$t3,[$tbl,#128]
	ldr	$t1,[$tbl,#160]
	ldr	$t2,[$tbl,#192]
	ldr	$t3,[$tbl,#224]

	and	$i1,lr,$s0,lsr#16
	and	$i2,lr,$s0,lsr#8
	and	$i3,lr,$s0
	ldrb	$s0,[$tbl,$s0,lsr#24]	@ Td4[s0>>24]
	ldrb	$s0,[$tbl,$s0]		@ Td4[s0>>24]
	ldrb	$t1,[$tbl,$i1]		@ Td4[s0>>16]
	ldrb	$t2,[$tbl,$i2]		@ Td4[s0>>8]
	ldrb	$t3,[$tbl,$i3]		@ Td4[s0>>0]

	and	$i1,lr,$s1		@ i0
	ldrb	$t2,[$tbl,$i2]		@ Td4[s0>>8]
	and	$i2,lr,$s1,lsr#16
	ldrb	$t3,[$tbl,$i3]		@ Td4[s0>>0]
	and	$i3,lr,$s1,lsr#8

	ldrb	$i1,[$tbl,$i1]		@ Td4[s1>>0]
	ldrb	$s1,[$tbl,$s1,lsr#24]	@ Td4[s1>>24]
	ldrb	$i2,[$tbl,$i2]		@ Td4[s1>>16]
	ldrb	$i3,[$tbl,$i3]		@ Td4[s1>>8]
	eor	$s0,$i1,$s0,lsl#24
	ldrb	$i3,[$tbl,$i3]		@ Td4[s1>>8]
	eor	$s1,$t1,$s1,lsl#8
	eor	$t2,$t2,$i2,lsl#8
	eor	$t3,$t3,$i3,lsl#8

	and	$i1,lr,$s2,lsr#8	@ i0
	eor	$t2,$t2,$i2,lsl#8
	and	$i2,lr,$s2		@ i1
	and	$i3,lr,$s2,lsr#16
	eor	$t3,$t3,$i3,lsl#8
	ldrb	$i1,[$tbl,$i1]		@ Td4[s2>>8]
	and	$i3,lr,$s2,lsr#16

	ldrb	$i2,[$tbl,$i2]		@ Td4[s2>>0]
	ldrb	$s2,[$tbl,$s2,lsr#24]	@ Td4[s2>>24]
	ldrb	$i3,[$tbl,$i3]		@ Td4[s2>>16]
	eor	$s0,$s0,$i1,lsl#8
	ldrb	$i3,[$tbl,$i3]		@ Td4[s2>>16]
	eor	$s1,$i2,$s1,lsl#16
	eor	$s2,$t2,$s2,lsl#16
	eor	$t3,$t3,$i3,lsl#16

	and	$i1,lr,$s3,lsr#16	@ i0
	eor	$s2,$t2,$s2,lsl#16
	and	$i2,lr,$s3,lsr#8	@ i1
	and	$i3,lr,$s3		@ i2
	eor	$t3,$t3,$i3,lsl#16
	ldrb	$i1,[$tbl,$i1]		@ Td4[s3>>16]
	and	$i3,lr,$s3		@ i2

	ldrb	$i2,[$tbl,$i2]		@ Td4[s3>>8]
	ldrb	$i3,[$tbl,$i3]		@ Td4[s3>>0]
	ldrb	$s3,[$tbl,$s3,lsr#24]	@ Td4[s3>>24]
	eor	$s0,$s0,$i1,lsl#16
	ldr	$i1,[$key,#0]
	eor	$s1,$s1,$i2,lsl#8
	ldr	$t1,[$key,#4]
	eor	$s2,$i3,$s2,lsl#8
	ldr	$t2,[$key,#8]
	eor	$s3,$t3,$s3,lsl#24
	ldr	$t3,[$key,#12]

	ldr	lr,[sp],#4		@ pop lr
	ldr	$t1,[$key,#0]
	ldr	$t2,[$key,#4]
	ldr	$t3,[$key,#8]
	ldr	$i1,[$key,#12]
	eor	$s0,$s0,$t1
	eor	$s1,$s1,$t2
	eor	$s2,$s2,$t3
	eor	$s3,$s3,$i1
	eor	$s0,$s0,$i1
	eor	$s1,$s1,$t1
	eor	$s2,$s2,$t2
	eor	$s3,$s3,$t3

	sub	$tbl,$tbl,#1024
	mov	pc,lr			@ return
	ldr	pc,[sp],#4		@ pop and return
.size	_armv4_AES_decrypt,.-_armv4_AES_decrypt
.asciz	"AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
+18 −12
Original line number Diff line number Diff line
@@ -19,6 +19,12 @@
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# Note about "528B" variant. In the ARM case it makes less sense to
# implement it for the following reasons:
#
@@ -123,12 +129,12 @@ gcm_ghash_4bit:

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	add	$Thh,$Htbl,$nhi
	ldrb	$nlo,[$inp,#14]

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	mov	$nhi,$nhi,lsl#1
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
@@ -139,15 +145,15 @@ gcm_ghash_4bit:
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	eor	$nlo,$nlo,$nhi
	eor	$Zhh,$Zhh,$Tll,lsl#16
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16

.Loop:
	add	$Thh,$Htbl,$nlo,lsl#4
	subs	$cnt,$cnt,#1
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	and	$nlo,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	add	$nlo,$nlo,$nlo
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
@@ -161,22 +167,22 @@ gcm_ghash_4bit:

	add	$Thh,$Htbl,$nhi
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	ldrplb	$nhi,[$Xi,$cnt]
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrplb	$nhi,[$Xi,$cnt]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	eorpl	$nlo,$nlo,$nhi
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	eor	$Zhh,$Thh,$Zhh,lsr#4
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop

	ldr	$len,[sp,#32]		@ re-load $len/end
@@ -212,7 +218,7 @@ gcm_gmult_4bit:
	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	mov	$nhi,$nhi,lsl#1
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
@@ -228,8 +234,8 @@ gcm_gmult_4bit:
.Loop2:
	add	$Thh,$Htbl,$nlo,lsl#4
	subs	$cnt,$cnt,#1
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	and	$nlo,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	add	$nlo,$nlo,$nlo
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
@@ -243,8 +249,8 @@ gcm_gmult_4bit:

	add	$Thh,$Htbl,$nhi
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
@@ -255,8 +261,8 @@ gcm_gmult_4bit:
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	andpl	$nhi,$nlo,#0xf0
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop2
___
	&Zsmash();
+18 −13
Original line number Diff line number Diff line
@@ -11,7 +11,12 @@

# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte.
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -52,27 +57,27 @@ $code.=<<___ if ($i<16);
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	str	$T1,[sp,#`$i%16`*4]
	mov	$t0,$e,ror#$Sigma1[0]
	str	$T1,[sp,#`$i%16`*4]
	eor	$t0,$t0,$e,ror#$Sigma1[1]
	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
	add	$T1,$T1,$t0
	eor	$t1,$f,$g
	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
	and	$t1,$t1,$e
	add	$T1,$T1,$t0
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	add	$T1,$T1,$t1
	add	$T1,$T1,$h
	add	$T1,$T1,$t2
	mov	$h,$a,ror#$Sigma0[0]
	add	$T1,$T1,$t1
	eor	$h,$h,$a,ror#$Sigma0[1]
	add	$T1,$T1,$t2
	eor	$h,$h,$a,ror#$Sigma0[2]		@ Sigma0(a)
	orr	$t0,$a,$b
	and	$t0,$t0,$c
	and	$t1,$a,$b
	and	$t0,$t0,$c
	add	$h,$h,$T1
	orr	$t0,$t0,$t1			@ Maj(a,b,c)
	add	$h,$h,$t0
	add	$d,$d,$T1
	add	$h,$h,$T1
	add	$h,$h,$t0
___
}

@@ -83,16 +88,16 @@ $code.=<<___;
	ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	ldr	$t2,[sp,#`($i+14)%16`*4]
	ldr	$T1,[sp,#`($i+0)%16`*4]
	ldr	$inp,[sp,#`($i+9)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	ldr	$inp,[sp,#`($i+9)%16`*4]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	mov	$t1,$t2,ror#$sigma1[0]
	add	$T1,$T1,$t0
	eor	$t1,$t1,$t2,ror#$sigma1[1]
	add	$T1,$T1,$inp
	eor	$t1,$t1,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
	add	$T1,$T1,$t0
	add	$T1,$T1,$t1
	add	$T1,$T1,$inp
___
	&BODY_00_15(@_);
}
+17 −13
Original line number Diff line number Diff line
@@ -10,7 +10,13 @@
# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte. 
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# Byte order [in]dependence. =========================================
#
@@ -73,33 +79,31 @@ $code.=<<___;
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3		@ T += h

	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi
	str	$Elo,[sp,#$Eoff+0]
	str	$Ehi,[sp,#$Eoff+4]
	str	$Alo,[sp,#$Aoff+0]
	str	$Ahi,[sp,#$Aoff+4]

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	eor	$t1,$t1,$t3		@ Ch(e,f,g)

	ldr	$t2,[$Ktbl,#4]		@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#0]		@ K[i].hi
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo