Loading crypto/modes/asm/ghash-alpha.pl +3 −9 Original line number Diff line number Diff line Loading @@ -31,10 +31,10 @@ $Thi1="t5"; $Tlo1="t6"; $rem="t7"; # $8 ################# $Xi="a0"; # $16 $Xi="a0"; # $16, input argument block $Htbl="a1"; $inp="a2"; $len="a3"; $nlo="a4"; # $20 $nhi="a5"; $Zhi="t8"; Loading Loading @@ -314,12 +314,6 @@ $code.=<<___; .end gcm_gmult_4bit ___ # argument block for gcm_ghash_4bit $inp="a0"; # $16 $len="a1"; $Xi ="a2"; $Htbl="a3"; $inhi="s0"; $inlo="s1"; Loading crypto/modes/asm/ghash-ia64.pl +4 −4 Original line number Diff line number Diff line Loading @@ -142,13 +142,13 @@ gcm_ghash_4bit: .prologue { .mmi; .save ar.pfs,prevfs alloc prevfs=ar.pfs,4,4,0,8 $ADDP inp=15,in0 // &inp[15] $ADDP inp=15,in2 // &inp[15] mov rem_4bitp=ip } { .mmi; $ADDP end=in1,in0 // &inp[len] $ADDP Xi=15,in2 // &Xi[15] { .mmi; $ADDP end=in3,in2 // &inp[len] $ADDP Xi=15,in0 // &Xi[15] .save ar.lc,prevlc mov prevlc=ar.lc };; { .mmi; $ADDP Htbl=8,in3 // &Htbl[0].lo { .mmi; $ADDP Htbl=8,in1 // &Htbl[0].lo mov mask0xf0=0xf0 .save pr,prevpr mov prevpr=pr } Loading crypto/modes/asm/ghash-sparcv9.pl +4 −6 Original line number Diff line number Diff line Loading @@ -54,10 +54,10 @@ $remi="%l5"; $Htblo="%l6"; $cnt="%l7"; $inp="%i0"; # input arguments for gcm_ghash_4bit $len="%i1"; $Xi="%i2"; $Htbl="%i3"; $Xi="%i0"; # input argument block $Htbl="%i1"; $inp="%i2"; $len="%i3"; $code.=<<___; .section ".text",#alloc,#execinstr Loading Loading @@ -208,8 +208,6 @@ gcm_ghash_4bit: .size gcm_ghash_4bit,(.-gcm_ghash_4bit) ___ $Xi="%i0"; # input arguments for gcm_gmult_4bit $Htbl="%i1"; undef $inp; undef $len; Loading crypto/modes/asm/ghash-x86.pl +17 −17 Original line number Diff line number Diff line Loading @@ -23,7 +23,7 @@ # PIII 63 /77 16 24 # P4 96 /122 30 84(***) # Opteron 50 /71 21 30 # Core2 63 /102 19 28 # Core2 54 /68 13 18 # # (*) gcc 3.4.x was observed to generate few percent slower code, # which is one of reasons why 2.95.3 results were chosen, Loading Loading @@ -317,12 +317,12 @@ if ($unroll) { &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); &mov ($inp,&wparam(0)); # load in &mov ($Zlh,&wparam(1)); # load len &mov ($Zhh,&wparam(2)); # load Xi &mov ($Htbl,&wparam(3)); # load Htable &mov ($Zhh,&wparam(0)); # load Xi &mov ($Htbl,&wparam(1)); # load Htable &mov ($inp,&wparam(2)); # load in &mov ($Zlh,&wparam(3)); # load len &add ($Zlh,$inp); &mov (&wparam(1),$Zlh); # len to point at the end of input &mov (&wparam(3),$Zlh); # len to point at the end of input &stack_push(4+1); # +1 for stack alignment &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16] &mov ($Zhl,&DWP(4,$Zhh)); Loading @@ -344,10 +344,10 @@ if ($unroll) { &mmx_loop("esp","eax"); &lea ($inp,&DWP(16,$inp)); &cmp ($inp,&wparam(1)); &cmp ($inp,&wparam(3)); &jb (&label("mmx_outer_loop")); &mov ($inp,&wparam(2)); # load Xi &mov ($inp,&wparam(0)); # load Xi &emms (); &mov (&DWP(12,$inp),$Zll); &mov (&DWP(4,$inp),$Zhl); Loading @@ -359,12 +359,12 @@ if ($unroll) { &set_label("x86",16); } &stack_push(16+4+1); # +1 for 64-bit alignment &mov ($inp,&wparam(0)); # load in &mov ("ecx",&wparam(1)); # load len &mov ($Zll,&wparam(2)); # load Xi &mov ($Htbl,&wparam(3)); # load Htable &mov ($Zll,&wparam(0)); # load Xi &mov ($Htbl,&wparam(1)); # load Htable &mov ($inp,&wparam(2)); # load in &mov ("ecx",&wparam(3)); # load len &add ("ecx",$inp); &mov (&wparam(1),"ecx"); &mov (&wparam(3),"ecx"); &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16] &mov ($Zhl,&DWP(4,$Zll)); Loading @@ -390,14 +390,14 @@ if ($unroll) { &call ("_x86_gmult_4bit_inner"); } else { &x86_loop(0); &mov ($inp,&wparam(0)); &mov ($inp,&wparam(2)); } &lea ($inp,&DWP(16,$inp)); &cmp ($inp,&wparam(1)); &mov (&wparam(0),$inp) if (!$unroll); &cmp ($inp,&wparam(3)); &mov (&wparam(2),$inp) if (!$unroll); &jb (&label("x86_outer_loop")); &mov ($inp,&wparam(2)); # load Xi &mov ($inp,&wparam(0)); # load Xi &mov (&DWP(12,$inp),$Zll); &mov (&DWP(8,$inp),$Zlh); &mov (&DWP(4,$inp),$Zhl); Loading crypto/modes/asm/ghash-x86_64.pl +4 −6 Original line number Diff line number Diff line Loading @@ -18,7 +18,7 @@ # gcc 3.4.x assembler # # Opteron 18.5 10.2 +80% # Core2 26.0 16.4 +58% # Core2 17.5 11.0 +59% $flavour = shift; $output = shift; Loading @@ -41,10 +41,10 @@ $Zhi="%r9"; $tmp="%r10"; $rem_4bit = "%r11"; # per-function register layout $Xi="%rdi"; $Htbl="%rsi"; # per-function register layout $cnt="%rcx"; $rem="%rdx"; Loading Loading @@ -159,10 +159,8 @@ ___ # per-function register layout $inp="%rdi"; $len="%rsi"; $Xi="%rdx"; $Htbl="%rcx"; $inp="%rdx"; $len="%rcx"; $cnt="%rbp"; $rem="%r12"; Loading Loading
crypto/modes/asm/ghash-alpha.pl +3 −9 Original line number Diff line number Diff line Loading @@ -31,10 +31,10 @@ $Thi1="t5"; $Tlo1="t6"; $rem="t7"; # $8 ################# $Xi="a0"; # $16 $Xi="a0"; # $16, input argument block $Htbl="a1"; $inp="a2"; $len="a3"; $nlo="a4"; # $20 $nhi="a5"; $Zhi="t8"; Loading Loading @@ -314,12 +314,6 @@ $code.=<<___; .end gcm_gmult_4bit ___ # argument block for gcm_ghash_4bit $inp="a0"; # $16 $len="a1"; $Xi ="a2"; $Htbl="a3"; $inhi="s0"; $inlo="s1"; Loading
crypto/modes/asm/ghash-ia64.pl +4 −4 Original line number Diff line number Diff line Loading @@ -142,13 +142,13 @@ gcm_ghash_4bit: .prologue { .mmi; .save ar.pfs,prevfs alloc prevfs=ar.pfs,4,4,0,8 $ADDP inp=15,in0 // &inp[15] $ADDP inp=15,in2 // &inp[15] mov rem_4bitp=ip } { .mmi; $ADDP end=in1,in0 // &inp[len] $ADDP Xi=15,in2 // &Xi[15] { .mmi; $ADDP end=in3,in2 // &inp[len] $ADDP Xi=15,in0 // &Xi[15] .save ar.lc,prevlc mov prevlc=ar.lc };; { .mmi; $ADDP Htbl=8,in3 // &Htbl[0].lo { .mmi; $ADDP Htbl=8,in1 // &Htbl[0].lo mov mask0xf0=0xf0 .save pr,prevpr mov prevpr=pr } Loading
crypto/modes/asm/ghash-sparcv9.pl +4 −6 Original line number Diff line number Diff line Loading @@ -54,10 +54,10 @@ $remi="%l5"; $Htblo="%l6"; $cnt="%l7"; $inp="%i0"; # input arguments for gcm_ghash_4bit $len="%i1"; $Xi="%i2"; $Htbl="%i3"; $Xi="%i0"; # input argument block $Htbl="%i1"; $inp="%i2"; $len="%i3"; $code.=<<___; .section ".text",#alloc,#execinstr Loading Loading @@ -208,8 +208,6 @@ gcm_ghash_4bit: .size gcm_ghash_4bit,(.-gcm_ghash_4bit) ___ $Xi="%i0"; # input arguments for gcm_gmult_4bit $Htbl="%i1"; undef $inp; undef $len; Loading
crypto/modes/asm/ghash-x86.pl +17 −17 Original line number Diff line number Diff line Loading @@ -23,7 +23,7 @@ # PIII 63 /77 16 24 # P4 96 /122 30 84(***) # Opteron 50 /71 21 30 # Core2 63 /102 19 28 # Core2 54 /68 13 18 # # (*) gcc 3.4.x was observed to generate few percent slower code, # which is one of reasons why 2.95.3 results were chosen, Loading Loading @@ -317,12 +317,12 @@ if ($unroll) { &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); &mov ($inp,&wparam(0)); # load in &mov ($Zlh,&wparam(1)); # load len &mov ($Zhh,&wparam(2)); # load Xi &mov ($Htbl,&wparam(3)); # load Htable &mov ($Zhh,&wparam(0)); # load Xi &mov ($Htbl,&wparam(1)); # load Htable &mov ($inp,&wparam(2)); # load in &mov ($Zlh,&wparam(3)); # load len &add ($Zlh,$inp); &mov (&wparam(1),$Zlh); # len to point at the end of input &mov (&wparam(3),$Zlh); # len to point at the end of input &stack_push(4+1); # +1 for stack alignment &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16] &mov ($Zhl,&DWP(4,$Zhh)); Loading @@ -344,10 +344,10 @@ if ($unroll) { &mmx_loop("esp","eax"); &lea ($inp,&DWP(16,$inp)); &cmp ($inp,&wparam(1)); &cmp ($inp,&wparam(3)); &jb (&label("mmx_outer_loop")); &mov ($inp,&wparam(2)); # load Xi &mov ($inp,&wparam(0)); # load Xi &emms (); &mov (&DWP(12,$inp),$Zll); &mov (&DWP(4,$inp),$Zhl); Loading @@ -359,12 +359,12 @@ if ($unroll) { &set_label("x86",16); } &stack_push(16+4+1); # +1 for 64-bit alignment &mov ($inp,&wparam(0)); # load in &mov ("ecx",&wparam(1)); # load len &mov ($Zll,&wparam(2)); # load Xi &mov ($Htbl,&wparam(3)); # load Htable &mov ($Zll,&wparam(0)); # load Xi &mov ($Htbl,&wparam(1)); # load Htable &mov ($inp,&wparam(2)); # load in &mov ("ecx",&wparam(3)); # load len &add ("ecx",$inp); &mov (&wparam(1),"ecx"); &mov (&wparam(3),"ecx"); &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16] &mov ($Zhl,&DWP(4,$Zll)); Loading @@ -390,14 +390,14 @@ if ($unroll) { &call ("_x86_gmult_4bit_inner"); } else { &x86_loop(0); &mov ($inp,&wparam(0)); &mov ($inp,&wparam(2)); } &lea ($inp,&DWP(16,$inp)); &cmp ($inp,&wparam(1)); &mov (&wparam(0),$inp) if (!$unroll); &cmp ($inp,&wparam(3)); &mov (&wparam(2),$inp) if (!$unroll); &jb (&label("x86_outer_loop")); &mov ($inp,&wparam(2)); # load Xi &mov ($inp,&wparam(0)); # load Xi &mov (&DWP(12,$inp),$Zll); &mov (&DWP(8,$inp),$Zlh); &mov (&DWP(4,$inp),$Zhl); Loading
crypto/modes/asm/ghash-x86_64.pl +4 −6 Original line number Diff line number Diff line Loading @@ -18,7 +18,7 @@ # gcc 3.4.x assembler # # Opteron 18.5 10.2 +80% # Core2 26.0 16.4 +58% # Core2 17.5 11.0 +59% $flavour = shift; $output = shift; Loading @@ -41,10 +41,10 @@ $Zhi="%r9"; $tmp="%r10"; $rem_4bit = "%r11"; # per-function register layout $Xi="%rdi"; $Htbl="%rsi"; # per-function register layout $cnt="%rcx"; $rem="%rdx"; Loading Loading @@ -159,10 +159,8 @@ ___ # per-function register layout $inp="%rdi"; $len="%rsi"; $Xi="%rdx"; $Htbl="%rcx"; $inp="%rdx"; $len="%rcx"; $cnt="%rbp"; $rem="%r12"; Loading