Loading crypto/aes/asm/aes-586.pl +31 −34 Original line number Diff line number Diff line Loading @@ -2,8 +2,9 @@ # # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL # project. Rights for redistribution and usage in source and binary # forms are granted according to the OpenSSL license. # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # Version 4.3. Loading Loading @@ -105,6 +106,7 @@ # P4 56[60] 84[100] 23 # AMD K8 48[44] 70[79] 18 # PIII 41[50] 61[91] 24 # Core 2 32[38] 45[70] 18.5 # Pentium 120 160 77 # # Version 4.1 switches to compact S-box even in key schedule setup. Loading Loading @@ -184,7 +186,8 @@ # Current implementation accesses *all* cache-lines within ~50 cycles # window, which is actually *less* than RDTSC latency on Intel P4! push(@INC,"perlasm","../../perlasm"); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; &asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386"); Loading Loading @@ -474,11 +477,10 @@ sub enctransform() &mov ($acc,$s[$i]); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($r2,$s[$i]); &shr ($tmp,7); &and ($r2,0x7f7f7f7f); &lea ($r2,&DWP(0,$s[$i],$s[$i])); &sub ($acc,$tmp); &lea ($r2,&DWP(0,$r2,$r2)); &and ($r2,0xfefefefe); &and ($acc,0x1b1b1b1b); &mov ($tmp,$s[$i]); &xor ($acc,$r2); # r2 Loading Loading @@ -1273,54 +1275,51 @@ sub dectransform() &mov ($acc,$s[$i]); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($tp2,$s[$i]); &shr ($tmp,7); &and ($tp2,0x7f7f7f7f); &lea ($tp2,&DWP(0,$s[$i],$s[$i])); &sub ($acc,$tmp); &add ($tp2,$tp2); &and ($tp2,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($acc,$tp2); &mov ($tp2,$acc); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($tp4,$tp2); &xor ($tp2,$s[$i]); # tp2^tp1 &shr ($tmp,7); &and ($tp4,0x7f7f7f7f); &lea ($tp4,&DWP(0,$tp2,$tp2)); &sub ($acc,$tmp); &add ($tp4,$tp4); &and ($tp4,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp2,$s[$i]); # tp2^tp1 &xor ($acc,$tp4); &mov ($tp4,$acc); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($tp8,$tp4); &xor ($tp4,$s[$i]); # tp4^tp1 &shr ($tmp,7); &and ($tp8,0x7f7f7f7f); &lea ($tp8,&DWP(0,$tp4,$tp4)); &sub ($acc,$tmp); &add ($tp8,$tp8); &and ($tp8,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp4,$s[$i]); # tp4^tp1 &rotl ($s[$i],8); # = ROTATE(tp1,8) &xor ($tp8,$acc); &xor ($s[$i],$tp2); &xor ($tp2,$tp8); &xor ($s[$i],$tp4); &rotl ($tp2,24); &xor ($s[$i],$tp4); &xor ($tp4,$tp8); &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) &rotl ($tp4,16); &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24) &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) &rotl ($tp8,8); &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24) &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16) &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8) &mov ($s[0],$__s0) if($i==2); #prefetch $s0 &mov ($s[1],$__s1) if($i==3); #prefetch $s1 &mov ($s[2],$__s2) if($i==1); &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8) &mov ($s[3],$__s3) if($i==1); &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2); } Loading Loading @@ -2872,35 +2871,32 @@ sub deckey() &mov ($acc,$tp1); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($tp2,$tp1); &shr ($tmp,7); &and ($tp2,0x7f7f7f7f); &lea ($tp2,&DWP(0,$tp1,$tp1)); &sub ($acc,$tmp); &add ($tp2,$tp2); &and ($tp2,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($acc,$tp2); &mov ($tp2,$acc); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($tp4,$tp2); &xor ($tp2,$tp1); # tp2^tp1 &shr ($tmp,7); &and ($tp4,0x7f7f7f7f); &lea ($tp4,&DWP(0,$tp2,$tp2)); &sub ($acc,$tmp); &add ($tp4,$tp4); &and ($tp4,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp2,$tp1); # tp2^tp1 &xor ($acc,$tp4); &mov ($tp4,$acc); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($tp8,$tp4); &xor ($tp4,$tp1); # tp4^tp1 &shr ($tmp,7); &and ($tp8,0x7f7f7f7f); &lea ($tp8,&DWP(0,$tp4,$tp4)); &xor ($tp4,$tp1); # tp4^tp1 &sub ($acc,$tmp); &add ($tp8,$tp8); &and ($tp8,0xfefefefe); &and ($acc,0x1b1b1b1b); &rotl ($tp1,8); # = ROTATE(tp1,8) &xor ($tp8,$acc); Loading Loading @@ -2992,5 +2988,6 @@ sub deckey() &xor ("eax","eax"); # return success &function_end("AES_set_decrypt_key"); &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); crypto/aes/asm/aes-ppc.pl +3 −3 Original line number Diff line number Diff line Loading @@ -12,9 +12,9 @@ # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with # 128-bit key, which is ~40% better than 64-bit code generated by gcc # 4.0. But these are not the ones currently used! Their "compact" # counterparts are, for security reason. ppc_AES_crypt_compact runs at # 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - at 1/3 # of ppc_AES_decrypt. # counterparts are, for security reason. ppc_AES_encrypt_compact runs # at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - # at 1/3 of ppc_AES_decrypt. $output = shift; Loading crypto/aes/asm/aes-s390x.pl +13 −36 Original line number Diff line number Diff line Loading @@ -738,14 +738,8 @@ AES_set_encrypt_key: tmhl %r0,`0x8000>>2` jz .Lekey_internal l $t1,0($inp) # just copy 128 bits... l $t2,4($inp) l $bits,8($inp) l $inp,12($inp) st $t1,0($key) st $t2,4($key) st $bits,8($key) st $inp,12($key) lmg $t1,$t2,0($inp) # just copy 128 bits... stmg $t1,$t2,0($key) lghi $t1,10 st $t1,236($key) # ... postpone key setup st $t1,240($key) Loading @@ -754,7 +748,7 @@ AES_set_encrypt_key: .align 16 .Lekey_internal: stmg %r6,%r13,48($sp) # all volatile regs, but $ra! stmg %r6,%r13,48($sp) # all non-volatile regs bras $tbl,1f 1: aghi $tbl,AES_Te+2048-. Loading Loading @@ -949,7 +943,7 @@ AES_set_encrypt_key: .align 16 AES_set_decrypt_key: stg $key,32($sp) # I rely on AES_set_encrypt_key to stg $ra,112($sp) # save [other] volatile registers! stg $ra,112($sp) # save non-volatile registers! bras $ra,AES_set_encrypt_key lg $key,32($sp) lg $ra,112($sp) Loading @@ -963,14 +957,8 @@ AES_set_decrypt_key: c $t1,236($key) je .Lgo l $t1,0($key) # just copy 128 bits otherwise l $t2,4($key) l $t3,8($key) l $bits,12($key) st $t1,160($key) st $t2,164($key) st $t3,168($key) st $bits,172($key) lmg $t1,$t2,0($key) # just copy 128 bits otherwise stmg $t1,$t2,160($key) lghi %r2,0 br $ra Loading @@ -983,27 +971,16 @@ AES_set_decrypt_key: lg $ra,40($sp) .Lgo: llgf $rounds,240($key) lghi $i1,0 la $i1,0($key) sllg $i2,$rounds,4 la $i2,0($i2,$key) srl $rounds,1 .align 8 .Linv: l $s0,0($i1,$key) l $s1,4($i1,$key) l $s2,8($i1,$key) l $s3,12($i1,$key) l $t1,0($i2,$key) l $t2,4($i2,$key) l $t3,8($i2,$key) l $i3,12($i2,$key) st $s0,0($i2,$key) st $s1,4($i2,$key) st $s2,8($i2,$key) st $s3,12($i2,$key) st $t1,0($i1,$key) st $t2,4($i1,$key) st $t3,8($i1,$key) st $i3,12($i1,$key) .Linv: lmg $s0,$s1,0($i1) lmg $s2,$s3,0($i2) stmg $s0,$s1,0($i2) stmg $s2,$s3,0($i1) aghi $i1,16 aghi $i2,-16 brct $rounds,.Linv Loading Loading @@ -1070,7 +1047,7 @@ $code.=<<___; la $key,4($key) brct $rounds,.Lmix lmg %r6,%r13,48($sp)# this was saved by AES_set_encrypt_key! lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key! lghi %r2,0 br $ra .size AES_set_decrypt_key,.-AES_set_decrypt_key Loading Loading
crypto/aes/asm/aes-586.pl +31 −34 Original line number Diff line number Diff line Loading @@ -2,8 +2,9 @@ # # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL # project. Rights for redistribution and usage in source and binary # forms are granted according to the OpenSSL license. # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # Version 4.3. Loading Loading @@ -105,6 +106,7 @@ # P4 56[60] 84[100] 23 # AMD K8 48[44] 70[79] 18 # PIII 41[50] 61[91] 24 # Core 2 32[38] 45[70] 18.5 # Pentium 120 160 77 # # Version 4.1 switches to compact S-box even in key schedule setup. Loading Loading @@ -184,7 +186,8 @@ # Current implementation accesses *all* cache-lines within ~50 cycles # window, which is actually *less* than RDTSC latency on Intel P4! push(@INC,"perlasm","../../perlasm"); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; &asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386"); Loading Loading @@ -474,11 +477,10 @@ sub enctransform() &mov ($acc,$s[$i]); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($r2,$s[$i]); &shr ($tmp,7); &and ($r2,0x7f7f7f7f); &lea ($r2,&DWP(0,$s[$i],$s[$i])); &sub ($acc,$tmp); &lea ($r2,&DWP(0,$r2,$r2)); &and ($r2,0xfefefefe); &and ($acc,0x1b1b1b1b); &mov ($tmp,$s[$i]); &xor ($acc,$r2); # r2 Loading Loading @@ -1273,54 +1275,51 @@ sub dectransform() &mov ($acc,$s[$i]); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($tp2,$s[$i]); &shr ($tmp,7); &and ($tp2,0x7f7f7f7f); &lea ($tp2,&DWP(0,$s[$i],$s[$i])); &sub ($acc,$tmp); &add ($tp2,$tp2); &and ($tp2,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($acc,$tp2); &mov ($tp2,$acc); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($tp4,$tp2); &xor ($tp2,$s[$i]); # tp2^tp1 &shr ($tmp,7); &and ($tp4,0x7f7f7f7f); &lea ($tp4,&DWP(0,$tp2,$tp2)); &sub ($acc,$tmp); &add ($tp4,$tp4); &and ($tp4,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp2,$s[$i]); # tp2^tp1 &xor ($acc,$tp4); &mov ($tp4,$acc); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($tp8,$tp4); &xor ($tp4,$s[$i]); # tp4^tp1 &shr ($tmp,7); &and ($tp8,0x7f7f7f7f); &lea ($tp8,&DWP(0,$tp4,$tp4)); &sub ($acc,$tmp); &add ($tp8,$tp8); &and ($tp8,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp4,$s[$i]); # tp4^tp1 &rotl ($s[$i],8); # = ROTATE(tp1,8) &xor ($tp8,$acc); &xor ($s[$i],$tp2); &xor ($tp2,$tp8); &xor ($s[$i],$tp4); &rotl ($tp2,24); &xor ($s[$i],$tp4); &xor ($tp4,$tp8); &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) &rotl ($tp4,16); &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24) &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) &rotl ($tp8,8); &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24) &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16) &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8) &mov ($s[0],$__s0) if($i==2); #prefetch $s0 &mov ($s[1],$__s1) if($i==3); #prefetch $s1 &mov ($s[2],$__s2) if($i==1); &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8) &mov ($s[3],$__s3) if($i==1); &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2); } Loading Loading @@ -2872,35 +2871,32 @@ sub deckey() &mov ($acc,$tp1); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($tp2,$tp1); &shr ($tmp,7); &and ($tp2,0x7f7f7f7f); &lea ($tp2,&DWP(0,$tp1,$tp1)); &sub ($acc,$tmp); &add ($tp2,$tp2); &and ($tp2,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($acc,$tp2); &mov ($tp2,$acc); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($tp4,$tp2); &xor ($tp2,$tp1); # tp2^tp1 &shr ($tmp,7); &and ($tp4,0x7f7f7f7f); &lea ($tp4,&DWP(0,$tp2,$tp2)); &sub ($acc,$tmp); &add ($tp4,$tp4); &and ($tp4,0xfefefefe); &and ($acc,0x1b1b1b1b); &xor ($tp2,$tp1); # tp2^tp1 &xor ($acc,$tp4); &mov ($tp4,$acc); &and ($acc,0x80808080); &mov ($tmp,$acc); &mov ($tp8,$tp4); &xor ($tp4,$tp1); # tp4^tp1 &shr ($tmp,7); &and ($tp8,0x7f7f7f7f); &lea ($tp8,&DWP(0,$tp4,$tp4)); &xor ($tp4,$tp1); # tp4^tp1 &sub ($acc,$tmp); &add ($tp8,$tp8); &and ($tp8,0xfefefefe); &and ($acc,0x1b1b1b1b); &rotl ($tp1,8); # = ROTATE(tp1,8) &xor ($tp8,$acc); Loading Loading @@ -2992,5 +2988,6 @@ sub deckey() &xor ("eax","eax"); # return success &function_end("AES_set_decrypt_key"); &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish();
crypto/aes/asm/aes-ppc.pl +3 −3 Original line number Diff line number Diff line Loading @@ -12,9 +12,9 @@ # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with # 128-bit key, which is ~40% better than 64-bit code generated by gcc # 4.0. But these are not the ones currently used! Their "compact" # counterparts are, for security reason. ppc_AES_crypt_compact runs at # 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - at 1/3 # of ppc_AES_decrypt. # counterparts are, for security reason. ppc_AES_encrypt_compact runs # at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - # at 1/3 of ppc_AES_decrypt. $output = shift; Loading
crypto/aes/asm/aes-s390x.pl +13 −36 Original line number Diff line number Diff line Loading @@ -738,14 +738,8 @@ AES_set_encrypt_key: tmhl %r0,`0x8000>>2` jz .Lekey_internal l $t1,0($inp) # just copy 128 bits... l $t2,4($inp) l $bits,8($inp) l $inp,12($inp) st $t1,0($key) st $t2,4($key) st $bits,8($key) st $inp,12($key) lmg $t1,$t2,0($inp) # just copy 128 bits... stmg $t1,$t2,0($key) lghi $t1,10 st $t1,236($key) # ... postpone key setup st $t1,240($key) Loading @@ -754,7 +748,7 @@ AES_set_encrypt_key: .align 16 .Lekey_internal: stmg %r6,%r13,48($sp) # all volatile regs, but $ra! stmg %r6,%r13,48($sp) # all non-volatile regs bras $tbl,1f 1: aghi $tbl,AES_Te+2048-. Loading Loading @@ -949,7 +943,7 @@ AES_set_encrypt_key: .align 16 AES_set_decrypt_key: stg $key,32($sp) # I rely on AES_set_encrypt_key to stg $ra,112($sp) # save [other] volatile registers! stg $ra,112($sp) # save non-volatile registers! bras $ra,AES_set_encrypt_key lg $key,32($sp) lg $ra,112($sp) Loading @@ -963,14 +957,8 @@ AES_set_decrypt_key: c $t1,236($key) je .Lgo l $t1,0($key) # just copy 128 bits otherwise l $t2,4($key) l $t3,8($key) l $bits,12($key) st $t1,160($key) st $t2,164($key) st $t3,168($key) st $bits,172($key) lmg $t1,$t2,0($key) # just copy 128 bits otherwise stmg $t1,$t2,160($key) lghi %r2,0 br $ra Loading @@ -983,27 +971,16 @@ AES_set_decrypt_key: lg $ra,40($sp) .Lgo: llgf $rounds,240($key) lghi $i1,0 la $i1,0($key) sllg $i2,$rounds,4 la $i2,0($i2,$key) srl $rounds,1 .align 8 .Linv: l $s0,0($i1,$key) l $s1,4($i1,$key) l $s2,8($i1,$key) l $s3,12($i1,$key) l $t1,0($i2,$key) l $t2,4($i2,$key) l $t3,8($i2,$key) l $i3,12($i2,$key) st $s0,0($i2,$key) st $s1,4($i2,$key) st $s2,8($i2,$key) st $s3,12($i2,$key) st $t1,0($i1,$key) st $t2,4($i1,$key) st $t3,8($i1,$key) st $i3,12($i1,$key) .Linv: lmg $s0,$s1,0($i1) lmg $s2,$s3,0($i2) stmg $s0,$s1,0($i2) stmg $s2,$s3,0($i1) aghi $i1,16 aghi $i2,-16 brct $rounds,.Linv Loading Loading @@ -1070,7 +1047,7 @@ $code.=<<___; la $key,4($key) brct $rounds,.Lmix lmg %r6,%r13,48($sp)# this was saved by AES_set_encrypt_key! lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key! lghi %r2,0 br $ra .size AES_set_decrypt_key,.-AES_set_decrypt_key Loading