aesni-x86_64.pl: optimize CTR even further. (1bc4d009) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/aes/asm/aesni-x86_64.pl

+85 −25

Original line number	Original line	Diff line number	Diff line
	@@ -130,7 +130,7 @@
	# Further data for other parallelizable modes:		# Further data for other parallelizable modes:
	#		#
	# CBC decrypt 1.16 0.93 0.93		# CBC decrypt 1.16 0.93 0.93
	# CTR 1.14 0.91 0.90		# CTR 1.14 0.91 0.86
	#		#
	# Well, given 3x column it's probably inappropriate to call the limit		# Well, given 3x column it's probably inappropriate to call the limit
	# asymptotic, if it can be surpassed, isn't it? What happens there?		# asymptotic, if it can be surpassed, isn't it? What happens there?
	@@ -160,7 +160,7 @@
	######################################################################		######################################################################
	# For reference, AMD Bulldozer spends 5.77 cycles per byte processed		# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
	# with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70		# with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70
	# in ECB, 0.76 in CTR, 0.95 in XTS... This means that aes[enc|dec]		# in ECB, 0.73 in CTR, 0.95 in XTS... This means that aes[enc|dec]
	# instruction latency is 9 cycles and that they can be issued every		# instruction latency is 9 cycles and that they can be issued every
	# cycle.		# cycle.

	@@ -1062,38 +1062,38 @@ $code.=<<___;
	$movkey ($key_),$rndkey0		$movkey ($key_),$rndkey0
	shr \$1,$rounds		shr \$1,$rounds
	shr \$1,$rnds_		shr \$1,$rnds_
			movdqa $rndkey0,$inout0
			movdqa $rndkey0,$inout1
			movdqa $rndkey0,$inout2
			movdqa $rndkey0,$inout3
			movdqa $rndkey0,$inout4
			movdqa $rndkey0,$inout5
			movdqa $rndkey0,$inout6
			movdqa $rndkey0,$inout7
			$movkey 16($key_),$rndkey1
	sub \$8,$len		sub \$8,$len
	jmp .Lctr32_loop8		jmp .Lctr32_loop8

	.align 16		.align 16
	.Lctr32_loop8:		.Lctr32_loop8:
	$movkey 16($key_),$rndkey1
	movdqa $rndkey0,$inout0
	movdqa $rndkey0,$inout1
	pxor $ivec,$inout0		pxor $ivec,$inout0
	paddb $one,$ivec		paddb $one,$ivec
	movdqa $rndkey0,$inout2
	aesenc $rndkey1,$inout0		aesenc $rndkey1,$inout0
	pxor $ivec,$inout1		pxor $ivec,$inout1
	paddb $one,$ivec		paddb $one,$ivec
	lea 32($key_),$key		lea 32($key_),$key
	movdqa $rndkey0,$inout3
	aesenc $rndkey1,$inout1		aesenc $rndkey1,$inout1
	pxor $ivec,$inout2		pxor $ivec,$inout2
	paddb $one,$ivec		paddb $one,$ivec
	movdqa $rndkey0,$inout4
	aesenc $rndkey1,$inout2		aesenc $rndkey1,$inout2
	pxor $ivec,$inout3		pxor $ivec,$inout3
	paddb $one,$ivec		paddb $one,$ivec
	movdqa $rndkey0,$inout5
	aesenc $rndkey1,$inout3		aesenc $rndkey1,$inout3
	pxor $ivec,$inout4		pxor $ivec,$inout4
	paddb $one,$ivec		paddb $one,$ivec
	movdqa $rndkey0,$inout6
	aesenc $rndkey1,$inout4		aesenc $rndkey1,$inout4
	pxor $ivec,$inout5		pxor $ivec,$inout5
	paddb $one,$ivec		paddb $one,$ivec
	movdqa $rndkey0,$inout7
	aesenc $rndkey1,$inout5		aesenc $rndkey1,$inout5
	pxor $ivec,$inout6		pxor $ivec,$inout6
	paddb $one,$ivec		paddb $one,$ivec
	@@ -1104,37 +1104,97 @@ $code.=<<___;
	dec $rounds		dec $rounds
	aesenc $rndkey1,$inout7		aesenc $rndkey1,$inout7
	$movkey 16($key),$rndkey1		$movkey 16($key),$rndkey1

			aesenc $rndkey0,$inout0
			aesenc $rndkey0,$inout1
			lea 32($key),$key
			aesenc $rndkey0,$inout2
	movups ($inp),$in0 # load input		movups ($inp),$in0 # load input
			aesenc $rndkey0,$inout3
	movups 0x10($inp),$in1		movups 0x10($inp),$in1
			aesenc $rndkey0,$inout4
	movups 0x20($inp),$in2		movups 0x20($inp),$in2
			aesenc $rndkey0,$inout5
	movups 0x30($inp),$in3		movups 0x30($inp),$in3
			aesenc $rndkey0,$inout6
			movups 0x40($inp),$one
			aesenc $rndkey0,$inout7
			$movkey ($key),$rndkey0

	call .Lenc_loop8_enter		.Lctr32_enc_loop8:
			aesenc $rndkey1,$inout0
			aesenc $rndkey1,$inout1
			dec $rounds
			aesenc $rndkey1,$inout2
			aesenc $rndkey1,$inout3
			aesenc $rndkey1,$inout4
			aesenc $rndkey1,$inout5
			aesenc $rndkey1,$inout6
			aesenc $rndkey1,$inout7
			$movkey 16($key),$rndkey1

	xorps $in0,$inout0 # xor		aesenc $rndkey0,$inout0
	movups 0x40($inp),$in0		aesenc $rndkey0,$inout1
	xorps $in1,$inout1		lea 32($key),$key
	movups 0x50($inp),$in1		aesenc $rndkey0,$inout2
	xorps $in2,$inout2		aesenc $rndkey0,$inout3
	movups 0x60($inp),$in2		aesenc $rndkey0,$inout4
	xorps $in3,$inout3		aesenc $rndkey0,$inout5
	movups 0x70($inp),$in3		aesenc $rndkey0,$inout6
			aesenc $rndkey0,$inout7
			$movkey ($key),$rndkey0
			jnz .Lctr32_enc_loop8

			aesenc $rndkey1,$inout0
			pxor $rndkey0,$in0
			aesenc $rndkey1,$inout1
			pxor $rndkey0,$in1
			aesenc $rndkey1,$inout2
			pxor $rndkey0,$in2
			aesenc $rndkey1,$inout3
			pxor $rndkey0,$in3
			aesenc $rndkey1,$inout4
			pxor $rndkey0,$one
			aesenc $rndkey1,$inout5
			aesenc $rndkey1,$inout6
			aesenc $rndkey1,$inout7
			movdqu 0x50($inp),$rndkey1
			aesenclast $in0,$inout0
			movdqu 0x60($inp),$in0
			pxor $rndkey0,$rndkey1
			aesenclast $in1,$inout1
			movdqu 0x70($inp),$in1
			pxor $rndkey0,$in0
			aesenclast $in2,$inout2
			pxor $rndkey0,$in1
			$movkey ($key_),$rndkey0
			aesenclast $in3,$inout3
	lea 0x80($inp),$inp		lea 0x80($inp),$inp
	xorps $in0,$inout4		aesenclast $one,$inout4
			movdqa .Lincrement1(%rip),$one
			aesenclast $rndkey1,$inout5
			$movkey 16($key_),$rndkey1
			aesenclast $in0,$inout6
			aesenclast $in1,$inout7

	movups $inout0,($out) # store output		movups $inout0,($out) # store output
	xorps $in1,$inout5		movdqa $rndkey0,$inout0
	movups $inout1,0x10($out)		movups $inout1,0x10($out)
	xorps $in2,$inout6		movdqa $rndkey0,$inout1
	movups $inout2,0x20($out)		movups $inout2,0x20($out)
	xorps $in3,$inout7		movdqa $rndkey0,$inout2
	movups $inout3,0x30($out)		movups $inout3,0x30($out)
			movdqa $rndkey0,$inout3
	movups $inout4,0x40($out)		movups $inout4,0x40($out)
			movdqa $rndkey0,$inout4
	movups $inout5,0x50($out)		movups $inout5,0x50($out)
			movdqa $rndkey0,$inout5
	movups $inout6,0x60($out)		movups $inout6,0x60($out)
			movdqa $rndkey0,$inout6
	movups $inout7,0x70($out)		movups $inout7,0x70($out)
			movdqa $rndkey0,$inout7
	lea 0x80($out),$out		lea 0x80($out),$out

	$movkey ($key_),$rndkey0
	mov $rnds_,$rounds		mov $rnds_,$rounds
	sub \$8,$len		sub \$8,$len
	jnc .Lctr32_loop8		jnc .Lctr32_loop8