Various minor updates to AES assembler modules. (96b0f6c1) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/aes/asm/aes-586.pl

+31 −34

Original line number	Diff line number	Diff line
		@@ -2,8 +2,9 @@
		#
		# ====================================================================
		# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
		# project. Rights for redistribution and usage in source and binary
		# forms are granted according to the OpenSSL license.
		# project. The module is, however, dual licensed under OpenSSL and
		# CRYPTOGAMS licenses depending on where you obtain it. For further
		# details see http://www.openssl.org/~appro/cryptogams/.
		# ====================================================================
		#
		# Version 4.3.
		@@ -105,6 +106,7 @@
		# P4 56[60] 84[100] 23
		# AMD K8 48[44] 70[79] 18
		# PIII 41[50] 61[91] 24
		# Core 2 32[38] 45[70] 18.5
		# Pentium 120 160 77
		#
		# Version 4.1 switches to compact S-box even in key schedule setup.
		@@ -184,7 +186,8 @@
		# The current implementation accesses all cache lines within a ~50-cycle
		# window, which is actually less than the RDTSC latency on Intel P4!

		push(@INC,"perlasm","../../perlasm");
		$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
		push(@INC,"${dir}","${dir}../../perlasm");
		require "x86asm.pl";

		&asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
		@@ -474,11 +477,10 @@ sub enctransform()
		&mov ($acc,$s[$i]);
		&and ($acc,0x80808080);
		&mov ($tmp,$acc);
		&mov ($r2,$s[$i]);
		&shr ($tmp,7);
		&and ($r2,0x7f7f7f7f);
		&lea ($r2,&DWP(0,$s[$i],$s[$i]));
		&sub ($acc,$tmp);
		&lea ($r2,&DWP(0,$r2,$r2));
		&and ($r2,0xfefefefe);
		&and ($acc,0x1b1b1b1b);
		&mov ($tmp,$s[$i]);
		&xor ($acc,$r2); # r2
		@@ -1273,54 +1275,51 @@ sub dectransform()
		&mov ($acc,$s[$i]);
		&and ($acc,0x80808080);
		&mov ($tmp,$acc);
		&mov ($tp2,$s[$i]);
		&shr ($tmp,7);
		&and ($tp2,0x7f7f7f7f);
		&lea ($tp2,&DWP(0,$s[$i],$s[$i]));
		&sub ($acc,$tmp);
		&add ($tp2,$tp2);
		&and ($tp2,0xfefefefe);
		&and ($acc,0x1b1b1b1b);
		&xor ($acc,$tp2);
		&mov ($tp2,$acc);

		&and ($acc,0x80808080);
		&mov ($tmp,$acc);
		&mov ($tp4,$tp2);
		&xor ($tp2,$s[$i]); # tp2^tp1
		&shr ($tmp,7);
		&and ($tp4,0x7f7f7f7f);
		&lea ($tp4,&DWP(0,$tp2,$tp2));
		&sub ($acc,$tmp);
		&add ($tp4,$tp4);
		&and ($tp4,0xfefefefe);
		&and ($acc,0x1b1b1b1b);
		&xor ($tp2,$s[$i]); # tp2^tp1
		&xor ($acc,$tp4);
		&mov ($tp4,$acc);

		&and ($acc,0x80808080);
		&mov ($tmp,$acc);
		&mov ($tp8,$tp4);
		&xor ($tp4,$s[$i]); # tp4^tp1
		&shr ($tmp,7);
		&and ($tp8,0x7f7f7f7f);
		&lea ($tp8,&DWP(0,$tp4,$tp4));
		&sub ($acc,$tmp);
		&add ($tp8,$tp8);
		&and ($tp8,0xfefefefe);
		&and ($acc,0x1b1b1b1b);
		&xor ($tp4,$s[$i]); # tp4^tp1
		&rotl ($s[$i],8); # = ROTATE(tp1,8)
		&xor ($tp8,$acc);

		&xor ($s[$i],$tp2);
		&xor ($tp2,$tp8);
		&xor ($s[$i],$tp4);
		&rotl ($tp2,24);
		&xor ($s[$i],$tp4);
		&xor ($tp4,$tp8);
		&xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
		&rotl ($tp4,16);
		&xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
		&xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
		&rotl ($tp8,8);
		&xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
		&xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
		&xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)

		&mov ($s[0],$__s0) if($i==2); #prefetch $s0
		&mov ($s[1],$__s1) if($i==3); #prefetch $s1
		&mov ($s[2],$__s2) if($i==1);
		&xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)

		&mov ($s[3],$__s3) if($i==1);
		&mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2);
		}
		@@ -2872,35 +2871,32 @@ sub deckey()
		&mov ($acc,$tp1);
		&and ($acc,0x80808080);
		&mov ($tmp,$acc);
		&mov ($tp2,$tp1);
		&shr ($tmp,7);
		&and ($tp2,0x7f7f7f7f);
		&lea ($tp2,&DWP(0,$tp1,$tp1));
		&sub ($acc,$tmp);
		&add ($tp2,$tp2);
		&and ($tp2,0xfefefefe);
		&and ($acc,0x1b1b1b1b);
		&xor ($acc,$tp2);
		&mov ($tp2,$acc);

		&and ($acc,0x80808080);
		&mov ($tmp,$acc);
		&mov ($tp4,$tp2);
		&xor ($tp2,$tp1); # tp2^tp1
		&shr ($tmp,7);
		&and ($tp4,0x7f7f7f7f);
		&lea ($tp4,&DWP(0,$tp2,$tp2));
		&sub ($acc,$tmp);
		&add ($tp4,$tp4);
		&and ($tp4,0xfefefefe);
		&and ($acc,0x1b1b1b1b);
		&xor ($tp2,$tp1); # tp2^tp1
		&xor ($acc,$tp4);
		&mov ($tp4,$acc);

		&and ($acc,0x80808080);
		&mov ($tmp,$acc);
		&mov ($tp8,$tp4);
		&xor ($tp4,$tp1); # tp4^tp1
		&shr ($tmp,7);
		&and ($tp8,0x7f7f7f7f);
		&lea ($tp8,&DWP(0,$tp4,$tp4));
		&xor ($tp4,$tp1); # tp4^tp1
		&sub ($acc,$tmp);
		&add ($tp8,$tp8);
		&and ($tp8,0xfefefefe);
		&and ($acc,0x1b1b1b1b);
		&rotl ($tp1,8); # = ROTATE(tp1,8)
		&xor ($tp8,$acc);
		@@ -2992,5 +2988,6 @@ sub deckey()

		&xor ("eax","eax"); # return success
		&function_end("AES_set_decrypt_key");
		&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");

		&asm_finish();

crypto/aes/asm/aes-ppc.pl

+3 −3

Original line number	Diff line number	Diff line
		@@ -12,9 +12,9 @@
		# ppc_AES_[en\|de]crypt perform at 18 cycles per byte processed with
		# 128-bit key, which is ~40% better than 64-bit code generated by gcc
		# 4.0. But these are not the ones currently used! Their "compact"
		# counterparts are, for security reason. ppc_AES_crypt_compact runs at
		# 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - at 1/3
		# of ppc_AES_decrypt.
		# counterparts are, for security reasons. ppc_AES_encrypt_compact runs
		# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact -
		# at 1/3 of ppc_AES_decrypt.

		$output = shift;

crypto/aes/asm/aes-s390x.pl

+13 −36

Original line number	Diff line number	Diff line
		@@ -738,14 +738,8 @@ AES_set_encrypt_key:
		tmhl %r0,`0x8000>>2`
		jz .Lekey_internal

		l $t1,0($inp) # just copy 128 bits...
		l $t2,4($inp)
		l $bits,8($inp)
		l $inp,12($inp)
		st $t1,0($key)
		st $t2,4($key)
		st $bits,8($key)
		st $inp,12($key)
		lmg $t1,$t2,0($inp) # just copy 128 bits...
		stmg $t1,$t2,0($key)
		lghi $t1,10
		st $t1,236($key) # ... postpone key setup
		st $t1,240($key)
		@@ -754,7 +748,7 @@ AES_set_encrypt_key:

		.align 16
		.Lekey_internal:
		stmg %r6,%r13,48($sp) # all volatile regs, but $ra!
		stmg %r6,%r13,48($sp) # all non-volatile regs

		bras $tbl,1f
		1: aghi $tbl,AES_Te+2048-.
		@@ -949,7 +943,7 @@ AES_set_encrypt_key:
		.align 16
		AES_set_decrypt_key:
		stg $key,32($sp) # I rely on AES_set_encrypt_key to
		stg $ra,112($sp) # save [other] volatile registers!
		stg $ra,112($sp) # save non-volatile registers!
		bras $ra,AES_set_encrypt_key
		lg $key,32($sp)
		lg $ra,112($sp)
		@@ -963,14 +957,8 @@ AES_set_decrypt_key:
		c $t1,236($key)
		je .Lgo

		l $t1,0($key) # just copy 128 bits otherwise
		l $t2,4($key)
		l $t3,8($key)
		l $bits,12($key)
		st $t1,160($key)
		st $t2,164($key)
		st $t3,168($key)
		st $bits,172($key)
		lmg $t1,$t2,0($key) # just copy 128 bits otherwise
		stmg $t1,$t2,160($key)
		lghi %r2,0
		br $ra

		@@ -983,27 +971,16 @@ AES_set_decrypt_key:
		lg $ra,40($sp)

		.Lgo: llgf $rounds,240($key)
		lghi $i1,0
		la $i1,0($key)
		sllg $i2,$rounds,4
		la $i2,0($i2,$key)
		srl $rounds,1

		.align 8
		.Linv: l $s0,0($i1,$key)
		l $s1,4($i1,$key)
		l $s2,8($i1,$key)
		l $s3,12($i1,$key)
		l $t1,0($i2,$key)
		l $t2,4($i2,$key)
		l $t3,8($i2,$key)
		l $i3,12($i2,$key)
		st $s0,0($i2,$key)
		st $s1,4($i2,$key)
		st $s2,8($i2,$key)
		st $s3,12($i2,$key)
		st $t1,0($i1,$key)
		st $t2,4($i1,$key)
		st $t3,8($i1,$key)
		st $i3,12($i1,$key)
		.Linv: lmg $s0,$s1,0($i1)
		lmg $s2,$s3,0($i2)
		stmg $s0,$s1,0($i2)
		stmg $s2,$s3,0($i1)
		aghi $i1,16
		aghi $i2,-16
		brct $rounds,.Linv
		@@ -1070,7 +1047,7 @@ $code.=<<___;
		la $key,4($key)
		brct $rounds,.Lmix

		lmg %r6,%r13,48($sp)# this was saved by AES_set_encrypt_key!
		lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
		lghi %r2,0
		br $ra
		.size AES_set_decrypt_key,.-AES_set_decrypt_key