Loading crypto/bn/asm/sparcv9-gf2m.pl +9 −17 Original line number Diff line number Diff line Loading @@ -18,23 +18,8 @@ # ~100-230% faster than gcc-generated code and ~35-90% faster than # the pure SPARCv9 code path. $bits=32; for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } if ($bits==64) { $bias=2047; $frame=192; } else { $bias=0; $frame=112; } $locals=16*8; $code.=<<___; #include <sparc_arch.h> .section ".text",#alloc,#execinstr ___ $code.=<<___ if ($bits==64); .register %g2,#scratch .register %g3,#scratch ___ $tab="%l0"; @T=("%g2","%g3"); Loading @@ -44,6 +29,13 @@ $tab="%l0"; ($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo; $code.=<<___; #include <sparc_arch.h> #ifdef __arch64__ .register %g2,#scratch .register %g3,#scratch #endif #ifdef __PIC__ SPARC_PIC_THUNK(%g1) #endif Loading Loading @@ -74,7 +66,7 @@ bn_GF2m_mul_2x2: .align 16 .Lsoftware: save %sp,-$frame-$locals,%sp save %sp,-STACK_FRAME-$locals,%sp sllx %i1,32,$a mov -1,$a12 Loading @@ -83,7 +75,7 @@ bn_GF2m_mul_2x2: srlx $a12,1,$a48 ! 0x7fff... or %i4,$b,$b srlx $a12,2,$a12 ! 0x3fff... add %sp,$bias+$frame,$tab add %sp,STACK_BIAS+STACK_FRAME,$tab sllx $a,2,$a4 mov $a,$a1 Loading crypto/md5/asm/md5-sparcv9.pl +9 −13 Original line number Diff line number Diff line Loading @@ -17,11 +17,6 @@ # single-process result on 8-core processor, or ~11GBps per 2.85GHz # socket. $bits=32; for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } if ($bits==64) { $bias=2047; $frame=192; } else { $bias=0; $frame=112; } $output=shift; open STDOUT,">$output"; Loading Loading @@ -198,13 +193,14 @@ $code.=<<___; ___ } $code.=<<___ if ($bits==64); .register %g2,#scratch .register %g3,#scratch ___ $code.=<<___; #include "sparc_arch.h" #ifdef __arch64__ .register %g2,#scratch .register %g3,#scratch #endif .section ".text",#alloc,#execinstr #ifdef __PIC__ Loading Loading @@ -246,7 +242,7 @@ md5_block_asm_data_order: .word 0x81b02800 ! MD5 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhw_loop bne,pt SIZE_T_CC, .Lhw_loop nop .Lhwfinish: Loading Loading @@ -287,7 +283,7 @@ md5_block_asm_data_order: .word 0x81b02800 ! MD5 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop bne,pt SIZE_T_CC, .Lhwunaligned_loop for %f26, %f26, %f10 ! %f10=%f26 ba .Lhwfinish Loading @@ -295,7 +291,7 @@ md5_block_asm_data_order: .align 16 .Lsoftware: save %sp,-$frame,%sp save %sp,-STACK_FRAME,%sp rd %asi,$saved_asi wr %g0,0x88,%asi ! ASI_PRIMARY_LITTLE Loading Loading @@ -355,7 +351,7 @@ $code.=<<___; add $t2,$C,$C add $CD,$D,$D srl $B,0,$B ! clruw $B bne `$bits==64?"%xcc":"%icc"`,.Loop bne SIZE_T_CC,.Loop srl $D,0,$D ! clruw $D st $A,[$ctx+0] ! write out ctx Loading crypto/sha/asm/sha1-sparcv9.pl +9 −13 Original line number Diff line number Diff line Loading @@ -25,11 +25,6 @@ # single-process result on 8-core processor, or ~9GBps per 2.85GHz # socket. $bits=32; for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } if ($bits==64) { $bias=2047; $frame=192; } else { $bias=0; $frame=112; } $output=shift; open STDOUT,">$output"; Loading Loading @@ -185,13 +180,14 @@ $code.=<<___; ___ } $code.=<<___ if ($bits==64); .register %g2,#scratch .register %g3,#scratch ___ $code.=<<___; #include "sparc_arch.h" #ifdef __arch64__ .register %g2,#scratch .register %g3,#scratch #endif .section ".text",#alloc,#execinstr #ifdef __PIC__ Loading Loading @@ -231,7 +227,7 @@ sha1_block_data_order: .word 0x81b02820 ! SHA1 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhw_loop bne,pt SIZE_T_CC, .Lhw_loop nop .Lhwfinish: Loading Loading @@ -271,7 +267,7 @@ sha1_block_data_order: .word 0x81b02820 ! SHA1 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop bne,pt SIZE_T_CC, .Lhwunaligned_loop for %f26, %f26, %f10 ! %f10=%f26 ba .Lhwfinish Loading @@ -279,7 +275,7 @@ sha1_block_data_order: .align 16 .Lsoftware: save %sp,-$frame,%sp save %sp,-STACK_FRAME,%sp sllx $len,6,$len add $inp,$len,$len Loading Loading @@ -359,7 +355,7 @@ $code.=<<___; add $E,@X[4],$E st $E,[$ctx+16] bne `$bits==64?"%xcc":"%icc"`,.Lloop bne SIZE_T_CC,.Lloop andn $inp,7,$tmp0 ret Loading crypto/sha/asm/sha512-sparcv9.pl +30 −35 Original line number Diff line number Diff line Loading @@ -49,12 +49,6 @@ # saturates at 11.5x single-process result on 8-core processor, or # ~11/16GBps per 2.85GHz socket. $bits=32; for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } if ($bits==64) { $bias=2047; $frame=192; } else { $bias=0; $frame=112; } $output=shift; open STDOUT,">$output"; Loading Loading @@ -191,29 +185,29 @@ $code.=<<___ if ($i<15); or @pair[1],$tmp2,$tmp2 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)` add $h,$tmp2,$T1 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`] $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`] ___ $code.=<<___ if ($i==12); bnz,a,pn %icc,.+8 ld [$inp+128],%l0 ___ $code.=<<___ if ($i==15); ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2 sllx @pair[1],$tmp31,$tmp2 ! Xload($i) add $tmp31,32,$tmp0 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3 sllx @pair[0],$tmp0,$tmp1 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4 srlx @pair[2],$tmp32,@pair[1] or $tmp1,$tmp2,$tmp2 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5 or @pair[1],$tmp2,$tmp2 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6 add $h,$tmp2,$T1 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`] ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1 $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`] ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1 ___ } if ($SZ==8); Loading Loading @@ -349,9 +343,9 @@ $code.=<<___; or %l3,$tmp0,$tmp0 srlx $tmp0,@sigma0[0],$T1 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2 sllx $tmp0,`64-@sigma0[2]`,$tmp1 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3 srlx $tmp0,@sigma0[1],$tmp0 xor $tmp1,$T1,$T1 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1 Loading @@ -363,9 +357,9 @@ $code.=<<___; or %l7,$tmp2,$tmp2 srlx $tmp2,@sigma1[0],$tmp1 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6 sllx $tmp2,`64-@sigma1[2]`,$tmp0 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7 srlx $tmp2,@sigma1[1],$tmp2 xor $tmp0,$tmp1,$tmp1 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0 Loading @@ -374,29 +368,30 @@ $code.=<<___; xor $tmp0,$tmp1,$tmp1 sllx %l4,32,$tmp0 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14]) ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4 or %l5,$tmp0,$tmp0 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5 sllx %l0,32,$tmp2 add $tmp1,$T1,$T1 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0 or %l1,$tmp2,$tmp2 add $tmp0,$T1,$T1 ! +=X[$i+9] ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1 add $tmp2,$T1,$T1 ! +=X[$i] $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`] $ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`] ___ &BODY_00_15(@_); } if ($SZ==8); $code.=<<___ if ($bits==64); .register %g2,#scratch .register %g3,#scratch ___ $code.=<<___; #include "sparc_arch.h" #ifdef __arch64__ .register %g2,#scratch .register %g3,#scratch #endif .section ".text",#alloc,#execinstr .align 64 Loading Loading @@ -519,7 +514,7 @@ $code.=<<___ if ($SZ==8); # SHA512 .word 0x81b02860 ! SHA512 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwaligned_loop bne,pt SIZE_T_CC, .Lhwaligned_loop nop .Lhwfinish: Loading Loading @@ -579,7 +574,7 @@ $code.=<<___ if ($SZ==8); # SHA512 .word 0x81b02860 ! SHA512 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop bne,pt SIZE_T_CC, .Lhwunaligned_loop for %f50, %f50, %f18 ! %f18=%f50 ba .Lhwfinish Loading Loading @@ -612,7 +607,7 @@ $code.=<<___ if ($SZ==4); # SHA256 .word 0x81b02840 ! SHA256 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwloop bne,pt SIZE_T_CC, .Lhwloop nop .Lhwfinish: Loading Loading @@ -655,7 +650,7 @@ $code.=<<___ if ($SZ==4); # SHA256 .word 0x81b02840 ! SHA256 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop bne,pt SIZE_T_CC, .Lhwunaligned_loop for %f26, %f26, %f10 ! %f10=%f26 ba .Lhwfinish Loading @@ -664,7 +659,7 @@ ___ $code.=<<___; .align 16 .Lsoftware: save %sp,`-$frame-$locals`,%sp save %sp,-STACK_FRAME-$locals,%sp and $inp,`$align-1`,$tmp31 sllx $len,`log(16*$SZ)/log(2)`,$len andn $inp,`$align-1`,$inp Loading Loading @@ -783,7 +778,7 @@ ___ $code.=<<___; add $inp,`16*$SZ`,$inp ! advance inp cmp $inp,$len bne `$bits==64?"%xcc":"%icc"`,.Lloop bne SIZE_T_CC,.Lloop sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl ret Loading crypto/sparc_arch.h +13 −4 Original line number Diff line number Diff line Loading @@ -32,6 +32,10 @@ # define __PIC__ #endif #if defined(__SUNPRO_C) && defined(__sparcv9) && !defined(__arch64__) # define __arch64__ #endif #define SPARC_PIC_THUNK(reg) \ .align 32; \ .Lpic_thunk: \ Loading @@ -53,18 +57,23 @@ add %o7, reg, reg #endif #if (defined(__GNUC__) && defined(__arch64__)) || \ (defined(__SUNPRO_C) && defined(__sparcv9)) #if defined(__arch64__) # define SPARC_LOAD_ADDRESS(SYM, reg) \ setx SYM, %o7, reg; # define LDPTR ldx # define SIZE_T_CC %xcc # define STACK_FRAME 192 # define STACK_BIAS 2047 #else # define SPARC_LOAD_ADDRESS(SYM, reg) \ set SYM, reg; # define LDPTR ld # define SIZE_T_CC %icc # define STACK_FRAME 112 # define STACK_BIAS 0 # define SPARC_LOAD_ADDRESS_LEAF(SYM,reg,tmp) SPARC_LOAD_ADDRESS(SYM,reg) #endif Loading Loading
crypto/bn/asm/sparcv9-gf2m.pl +9 −17 Original line number Diff line number Diff line Loading @@ -18,23 +18,8 @@ # ~100-230% faster than gcc-generated code and ~35-90% faster than # the pure SPARCv9 code path. $bits=32; for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } if ($bits==64) { $bias=2047; $frame=192; } else { $bias=0; $frame=112; } $locals=16*8; $code.=<<___; #include <sparc_arch.h> .section ".text",#alloc,#execinstr ___ $code.=<<___ if ($bits==64); .register %g2,#scratch .register %g3,#scratch ___ $tab="%l0"; @T=("%g2","%g3"); Loading @@ -44,6 +29,13 @@ $tab="%l0"; ($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo; $code.=<<___; #include <sparc_arch.h> #ifdef __arch64__ .register %g2,#scratch .register %g3,#scratch #endif #ifdef __PIC__ SPARC_PIC_THUNK(%g1) #endif Loading Loading @@ -74,7 +66,7 @@ bn_GF2m_mul_2x2: .align 16 .Lsoftware: save %sp,-$frame-$locals,%sp save %sp,-STACK_FRAME-$locals,%sp sllx %i1,32,$a mov -1,$a12 Loading @@ -83,7 +75,7 @@ bn_GF2m_mul_2x2: srlx $a12,1,$a48 ! 0x7fff... or %i4,$b,$b srlx $a12,2,$a12 ! 0x3fff... add %sp,$bias+$frame,$tab add %sp,STACK_BIAS+STACK_FRAME,$tab sllx $a,2,$a4 mov $a,$a1 Loading
crypto/md5/asm/md5-sparcv9.pl +9 −13 Original line number Diff line number Diff line Loading @@ -17,11 +17,6 @@ # single-process result on 8-core processor, or ~11GBps per 2.85GHz # socket. $bits=32; for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } if ($bits==64) { $bias=2047; $frame=192; } else { $bias=0; $frame=112; } $output=shift; open STDOUT,">$output"; Loading Loading @@ -198,13 +193,14 @@ $code.=<<___; ___ } $code.=<<___ if ($bits==64); .register %g2,#scratch .register %g3,#scratch ___ $code.=<<___; #include "sparc_arch.h" #ifdef __arch64__ .register %g2,#scratch .register %g3,#scratch #endif .section ".text",#alloc,#execinstr #ifdef __PIC__ Loading Loading @@ -246,7 +242,7 @@ md5_block_asm_data_order: .word 0x81b02800 ! MD5 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhw_loop bne,pt SIZE_T_CC, .Lhw_loop nop .Lhwfinish: Loading Loading @@ -287,7 +283,7 @@ md5_block_asm_data_order: .word 0x81b02800 ! MD5 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop bne,pt SIZE_T_CC, .Lhwunaligned_loop for %f26, %f26, %f10 ! %f10=%f26 ba .Lhwfinish Loading @@ -295,7 +291,7 @@ md5_block_asm_data_order: .align 16 .Lsoftware: save %sp,-$frame,%sp save %sp,-STACK_FRAME,%sp rd %asi,$saved_asi wr %g0,0x88,%asi ! ASI_PRIMARY_LITTLE Loading Loading @@ -355,7 +351,7 @@ $code.=<<___; add $t2,$C,$C add $CD,$D,$D srl $B,0,$B ! clruw $B bne `$bits==64?"%xcc":"%icc"`,.Loop bne SIZE_T_CC,.Loop srl $D,0,$D ! clruw $D st $A,[$ctx+0] ! write out ctx Loading
crypto/sha/asm/sha1-sparcv9.pl +9 −13 Original line number Diff line number Diff line Loading @@ -25,11 +25,6 @@ # single-process result on 8-core processor, or ~9GBps per 2.85GHz # socket. $bits=32; for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } if ($bits==64) { $bias=2047; $frame=192; } else { $bias=0; $frame=112; } $output=shift; open STDOUT,">$output"; Loading Loading @@ -185,13 +180,14 @@ $code.=<<___; ___ } $code.=<<___ if ($bits==64); .register %g2,#scratch .register %g3,#scratch ___ $code.=<<___; #include "sparc_arch.h" #ifdef __arch64__ .register %g2,#scratch .register %g3,#scratch #endif .section ".text",#alloc,#execinstr #ifdef __PIC__ Loading Loading @@ -231,7 +227,7 @@ sha1_block_data_order: .word 0x81b02820 ! SHA1 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhw_loop bne,pt SIZE_T_CC, .Lhw_loop nop .Lhwfinish: Loading Loading @@ -271,7 +267,7 @@ sha1_block_data_order: .word 0x81b02820 ! SHA1 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop bne,pt SIZE_T_CC, .Lhwunaligned_loop for %f26, %f26, %f10 ! %f10=%f26 ba .Lhwfinish Loading @@ -279,7 +275,7 @@ sha1_block_data_order: .align 16 .Lsoftware: save %sp,-$frame,%sp save %sp,-STACK_FRAME,%sp sllx $len,6,$len add $inp,$len,$len Loading Loading @@ -359,7 +355,7 @@ $code.=<<___; add $E,@X[4],$E st $E,[$ctx+16] bne `$bits==64?"%xcc":"%icc"`,.Lloop bne SIZE_T_CC,.Lloop andn $inp,7,$tmp0 ret Loading
crypto/sha/asm/sha512-sparcv9.pl +30 −35 Original line number Diff line number Diff line Loading @@ -49,12 +49,6 @@ # saturates at 11.5x single-process result on 8-core processor, or # ~11/16GBps per 2.85GHz socket. $bits=32; for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } if ($bits==64) { $bias=2047; $frame=192; } else { $bias=0; $frame=112; } $output=shift; open STDOUT,">$output"; Loading Loading @@ -191,29 +185,29 @@ $code.=<<___ if ($i<15); or @pair[1],$tmp2,$tmp2 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)` add $h,$tmp2,$T1 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`] $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`] ___ $code.=<<___ if ($i==12); bnz,a,pn %icc,.+8 ld [$inp+128],%l0 ___ $code.=<<___ if ($i==15); ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2 sllx @pair[1],$tmp31,$tmp2 ! Xload($i) add $tmp31,32,$tmp0 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3 sllx @pair[0],$tmp0,$tmp1 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4 srlx @pair[2],$tmp32,@pair[1] or $tmp1,$tmp2,$tmp2 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5 or @pair[1],$tmp2,$tmp2 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6 add $h,$tmp2,$T1 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`] ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1 $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`] ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1 ___ } if ($SZ==8); Loading Loading @@ -349,9 +343,9 @@ $code.=<<___; or %l3,$tmp0,$tmp0 srlx $tmp0,@sigma0[0],$T1 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2 sllx $tmp0,`64-@sigma0[2]`,$tmp1 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3 srlx $tmp0,@sigma0[1],$tmp0 xor $tmp1,$T1,$T1 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1 Loading @@ -363,9 +357,9 @@ $code.=<<___; or %l7,$tmp2,$tmp2 srlx $tmp2,@sigma1[0],$tmp1 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6 sllx $tmp2,`64-@sigma1[2]`,$tmp0 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7 srlx $tmp2,@sigma1[1],$tmp2 xor $tmp0,$tmp1,$tmp1 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0 Loading @@ -374,29 +368,30 @@ $code.=<<___; xor $tmp0,$tmp1,$tmp1 sllx %l4,32,$tmp0 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14]) ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4 or %l5,$tmp0,$tmp0 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5 sllx %l0,32,$tmp2 add $tmp1,$T1,$T1 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0 or %l1,$tmp2,$tmp2 add $tmp0,$T1,$T1 ! +=X[$i+9] ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1 add $tmp2,$T1,$T1 ! +=X[$i] $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`] $ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`] ___ &BODY_00_15(@_); } if ($SZ==8); $code.=<<___ if ($bits==64); .register %g2,#scratch .register %g3,#scratch ___ $code.=<<___; #include "sparc_arch.h" #ifdef __arch64__ .register %g2,#scratch .register %g3,#scratch #endif .section ".text",#alloc,#execinstr .align 64 Loading Loading @@ -519,7 +514,7 @@ $code.=<<___ if ($SZ==8); # SHA512 .word 0x81b02860 ! SHA512 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwaligned_loop bne,pt SIZE_T_CC, .Lhwaligned_loop nop .Lhwfinish: Loading Loading @@ -579,7 +574,7 @@ $code.=<<___ if ($SZ==8); # SHA512 .word 0x81b02860 ! SHA512 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop bne,pt SIZE_T_CC, .Lhwunaligned_loop for %f50, %f50, %f18 ! %f18=%f50 ba .Lhwfinish Loading Loading @@ -612,7 +607,7 @@ $code.=<<___ if ($SZ==4); # SHA256 .word 0x81b02840 ! SHA256 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwloop bne,pt SIZE_T_CC, .Lhwloop nop .Lhwfinish: Loading Loading @@ -655,7 +650,7 @@ $code.=<<___ if ($SZ==4); # SHA256 .word 0x81b02840 ! SHA256 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop bne,pt SIZE_T_CC, .Lhwunaligned_loop for %f26, %f26, %f10 ! %f10=%f26 ba .Lhwfinish Loading @@ -664,7 +659,7 @@ ___ $code.=<<___; .align 16 .Lsoftware: save %sp,`-$frame-$locals`,%sp save %sp,-STACK_FRAME-$locals,%sp and $inp,`$align-1`,$tmp31 sllx $len,`log(16*$SZ)/log(2)`,$len andn $inp,`$align-1`,$inp Loading Loading @@ -783,7 +778,7 @@ ___ $code.=<<___; add $inp,`16*$SZ`,$inp ! advance inp cmp $inp,$len bne `$bits==64?"%xcc":"%icc"`,.Lloop bne SIZE_T_CC,.Lloop sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl ret Loading
crypto/sparc_arch.h +13 −4 Original line number Diff line number Diff line Loading @@ -32,6 +32,10 @@ # define __PIC__ #endif #if defined(__SUNPRO_C) && defined(__sparcv9) && !defined(__arch64__) # define __arch64__ #endif #define SPARC_PIC_THUNK(reg) \ .align 32; \ .Lpic_thunk: \ Loading @@ -53,18 +57,23 @@ add %o7, reg, reg #endif #if (defined(__GNUC__) && defined(__arch64__)) || \ (defined(__SUNPRO_C) && defined(__sparcv9)) #if defined(__arch64__) # define SPARC_LOAD_ADDRESS(SYM, reg) \ setx SYM, %o7, reg; # define LDPTR ldx # define SIZE_T_CC %xcc # define STACK_FRAME 192 # define STACK_BIAS 2047 #else # define SPARC_LOAD_ADDRESS(SYM, reg) \ set SYM, reg; # define LDPTR ld # define SIZE_T_CC %icc # define STACK_FRAME 112 # define STACK_BIAS 0 # define SPARC_LOAD_ADDRESS_LEAF(SYM,reg,tmp) SPARC_LOAD_ADDRESS(SYM,reg) #endif Loading