Commit ce24d2ed authored by Andy Polyakov's avatar Andy Polyakov
Browse files

aes/asm/vpaes-ppc.pl: eliminate overhung stores in misaligned cases.

parent 0ee05b73
Loading
Loading
Loading
Loading
+136 −62
Original line number Diff line number Diff line
@@ -337,24 +337,27 @@ Lenc_entry:
	addi	$inp, $inp, 15		# 15 is not a typo
	 ?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key	# prepare for unaligned access
	 vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp	# redundant in aligned case
	 ?vperm	$outmask, v7, $outmask, $outperm
	 lvx	$outhead, 0, $out
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_encrypt_core

	andi.	r8, $out, 15
	li	r9, 16
	beq	Lenc_out_aligned

	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
	vmr	$outhead, v0
	stvx	v1, 0, $out
	addi	$out, $out, 15		# 15 is not a typo
	########
	mtctr	r9
Lenc_out_unaligned:
	stvebx	v0, 0, $out
	addi	$out, $out, 1
	bdnz	Lenc_out_unaligned
	b	Lenc_done

	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask
	stvx	v1, 0, $out
.align	4
Lenc_out_aligned:
	stvx	v0, 0, $out
Lenc_done:

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
@@ -566,24 +569,27 @@ Ldec_entry:
	addi	$inp, $inp, 15		# 15 is not a typo
	 ?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key
	 vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp	# redundant in aligned case
	 ?vperm	$outmask, v7, $outmask, $outperm
	 lvx	$outhead, 0, $out
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_decrypt_core

	andi.	r8, $out, 15
	li	r9, 16
	beq	Ldec_out_aligned

	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
	vmr	$outhead, v0
	stvx	v1, 0, $out
	addi	$out, $out, 15		# 15 is not a typo
	########
	mtctr	r9
Ldec_out_unaligned:
	stvebx	v0, 0, $out
	addi	$out, $out, 1
	bdnz	Ldec_out_unaligned
	b	Ldec_done

	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask
	stvx	v1, 0, $out
.align	4
Ldec_out_aligned:
	stvx	v0, 0, $out
Ldec_done:

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
@@ -658,11 +664,11 @@ Ldec_entry:
	$PUSH	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)

	and	r30, r5, r9		# copy length&-16
	andi.	r9, $out, 15		# is $out aligned?
	mr	r5, r6			# copy pointer to key
	mr	r31, r7			# copy pointer to iv
	blt	Lcbc_abort
	cmpwi	r8, 0			# test direction
	li	r6, -1
	mcrf	cr1, cr0		# put aside $out alignment flag
	mr	r7, r12			# copy vrsave
	mtspr	256, r6			# preserve all AltiVec registers

@@ -672,6 +678,7 @@ Ldec_entry:
	lvx	v25, r9, r31
	?vperm	v24, v24, v25, $inpperm

	cmpwi	r8, 0			# test direction
	neg	r8, $inp		# prepare for unaligned access
	 vxor	v7, v7, v7
	?lvsl	$keyperm, 0, $key
@@ -681,13 +688,37 @@ Ldec_entry:
	lvx	$inptail, 0, $inp
	 ?vperm	$outmask, v7, $outmask, $outperm
	addi	$inp, $inp, 15		# 15 is not a typo
	 lvx	$outhead, 0, $out

	beq	Lcbc_decrypt

	bl	_vpaes_encrypt_preheat
	li	r0, 16

	beq	cr1, Lcbc_enc_loop	# $out is aligned

	vmr	v0, $inptail
	lvx	$inptail, 0, $inp
	addi	$inp, $inp, 16
	?vperm	v0, v0, $inptail, $inpperm
	vxor	v0, v0, v24		# ^= iv

	bl	_vpaes_encrypt_core

	andi.	r8, $out, 15
	vmr	v24, v0			# put aside iv
	sub	r9, $out, r8
	vperm	$outhead, v0, v0, $outperm	# rotate right/left

Lcbc_enc_head:
	stvebx	$outhead, r8, r9
	cmpwi	r8, 15
	addi	r8, r8, 1
	bne	Lcbc_enc_head

	sub.	r30, r30, r0		# len -= 16
	addi	$out, $out, 16
	beq	Lcbc_unaligned_done

Lcbc_enc_loop:
	vmr	v0, $inptail
	lvx	$inptail, 0, $inp
@@ -713,6 +744,32 @@ Lcbc_decrypt:
	bl	_vpaes_decrypt_preheat
	li	r0, 16

	beq	cr1, Lcbc_dec_loop	# $out is aligned

	vmr	v0, $inptail
	lvx	$inptail, 0, $inp
	addi	$inp, $inp, 16
	?vperm	v0, v0, $inptail, $inpperm
	vmr	v25, v0			# put aside input

	bl	_vpaes_decrypt_core

	andi.	r8, $out, 15
	vxor	v0, v0, v24		# ^= iv
	vmr	v24, v25
	sub	r9, $out, r8
	vperm	$outhead, v0, v0, $outperm	# rotate right/left

Lcbc_dec_head:
	stvebx	$outhead, r8, r9
	cmpwi	r8, 15
	addi	r8, r8, 1
	bne	Lcbc_dec_head

	sub.	r30, r30, r0		# len -= 16
	addi	$out, $out, 16
	beq	Lcbc_unaligned_done

Lcbc_dec_loop:
	vmr	v0, $inptail
	lvx	$inptail, 0, $inp
@@ -733,23 +790,29 @@ Lcbc_dec_loop:
	bne	Lcbc_dec_loop

Lcbc_done:
	addi	$out, $out, -1
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask
	stvx	v1, 0, $out

	beq	cr1, Lcbc_write_iv	# $out is aligned

Lcbc_unaligned_done:
	andi.	r8, $out, 15
	sub	$out, $out, r8
	li	r9, 0
Lcbc_tail:
	stvebx	$outhead, r9, $out
	addi	r9, r9, 1
	cmpw	r9, r8
	bne	Lcbc_tail

Lcbc_write_iv:
	neg	r8, r31			# write [potentially unaligned] iv
	li	r10, 4
	?lvsl	$outperm, 0, r8
	li	r6, 15
	vnor	$outmask, v7, v7	# 0xff..ff
	?vperm	$outmask, v7, $outmask, $outperm
	lvx	$outhead, 0, r31
	li	r11, 8
	li	r12, 12
	vperm	v24, v24, v24, $outperm	# rotate right/left
	vsel	v0, $outhead, v24, $outmask
	lvx	v1, r6, r31
	stvx	v0, 0, r31
	vsel	v1, v24, v1, $outmask
	stvx	v1, r6, r31
	stvewx	v24, 0, r31		# ivp is at least 32-bit aligned
	stvewx	v24, r10, r31
	stvewx	v24, r11, r31
	stvewx	v24, r12, r31

	mtspr	256, r7			# restore vrsave
	li	r10,`15+6*$SIZE_T`
@@ -872,18 +935,21 @@ _vpaes_schedule_core:

	# encrypting, output zeroth round key after transform
	li	r8, 0x30		# mov	\$0x30,%r8d
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
	li	r9, 4
	li	r10, 8
	li	r11, 12

	?lvsr	$outperm, 0, $out	# prepare for unaligned access
	vnor	$outmask, v9, v9	# 0xff..ff
	lvx	$outhead, 0, $out
	?vperm	$outmask, v9, $outmask, $outperm

	#stvx	v0, 0, $out		# vmovdqu	%xmm0,	(%rdx)
	vperm	v1, v0, v0, $outperm	# rotate right/left
	vsel	v2, $outhead, v1, $outmask
	vmr	$outhead, v1
	stvx	v2, 0, $out
	vperm	$outhead, v0, v0, $outperm	# rotate right/left
	stvewx	$outhead, 0, $out	# some are superfluous
	stvewx	$outhead, r9, $out
	stvewx	$outhead, r10, $out
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
	stvewx	$outhead, r11, $out
	b	Lschedule_go

Lschedule_am_decrypting:
@@ -893,20 +959,24 @@ Lschedule_am_decrypting:
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
	# decrypting, output zeroth round key after shiftrows
	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),	%xmm1
	li	r9, 4
	li	r10, 8
	li	r11, 12
	vperm	v4, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3

	neg	r0, $out		# prepare for unaligned access
	?lvsl	$outperm, 0, r0
	addi	$out, $out, 15		# 15 is not typo
	vnor	$outmask, v9, v9	# 0xff..ff
	lvx	$outhead, 0, $out
	?vperm	$outmask, $outmask, v9, $outperm

	#stvx	v4, 0, $out		# vmovdqu	%xmm3,	(%rdx)
	vperm	v4, v4, v4, $outperm	# rotate right/left
	vsel	v2, $outhead, v4, $outmask
	vmr	$outhead, v4
	stvx	v2, 0, $out
	vperm	$outhead, v4, v4, $outperm	# rotate right/left
	stvewx	$outhead, 0, $out	# some are superfluous
	stvewx	$outhead, r9, $out
	stvewx	$outhead, r10, $out
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
	stvewx	$outhead, r11, $out
	addi	$out, $out, 15		# 15 is not typo
	xori	r8, r8, 0x30		# xor	\$0x30, %r8

Lschedule_go:
@@ -1038,14 +1108,15 @@ Lschedule_mangle_last:

	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
	vperm	v0, v0, v0, $outperm	# rotate right/left
	li	r10, 4
	vsel	v2, $outhead, v0, $outmask
	vmr	$outhead, v0
	li	r11, 8
	stvx	v2, 0, $out

	addi	$out, $out, 15		# 15 is not typo
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask
	stvx	v1, 0, $out
	li	r12, 12
	stvewx	v0, 0, $out		# some (or all) are redundant
	stvewx	v0, r10, $out
	stvewx	v0, r11, $out
	stvewx	v0, r12, $out
	b	Lschedule_mangle_done

.align	4
@@ -1057,15 +1128,18 @@ Lschedule_mangle_last_dec:
	bl	_vpaes_schedule_transform	# output transform

	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
	addi	r9, $out, -15		# -15 is not typo
	vperm	v0, v0, v0, $outperm	# rotate right/left
	li	r10, 4
	vsel	v2, $outhead, v0, $outmask
	vmr	$outhead, v0
	li	r11, 8
	stvx	v2, 0, $out
	li	r12, 12
	stvewx	v0, 0, r9		# some (or all) are redundant
	stvewx	v0, r10, r9
	stvewx	v0, r11, r9
	stvewx	v0, r12, r9

	addi	$out, $out, -15		# -15 is not typo
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask
	stvx	v1, 0, $out

Lschedule_mangle_done:
	mtlr	r7