# NOTE(review): what follows is a flattened diff-viewer capture of
# crypto/sha/asm/sha512-armv8.pl.  Each hunk has been joined onto a few long
# lines and viewer labels (Loading, Original line number, @@ ... @@) are mixed
# in, so it records a change rather than forming a runnable script.
#
# The next line carries three hunks:
#   1. the October 2016 rationale for adding a NEON SHA256 code path;
#   2. the dispatch hunk that, for $SZ==4, tests ARMV7_NEON in w16 right after
#      the ARMV8_SHA256 test and branches to .Lneon_entry;
#   3. the opening of the if ($SZ==4) section that holds the NEON code.
Loading crypto/sha/asm/sha512-armv8.pl +313 −4 Original line number Diff line number Diff line Loading @@ -37,6 +37,20 @@ # indication of some compiler "pathology", most notably code # generated with -mgeneral-regs-only is significantly faster # and the gap is only 40-90%. # # October 2016. # # Originally it was reckoned that it makes no sense to implement NEON # version of SHA256 for 64-bit processors. This is because performance # improvement on most wide-spread Cortex-A5x processors was observed # to be marginal, same on Cortex-A53 and ~10% on A57. But then it was # observed that 32-bit NEON SHA256 performs significantly better than # 64-bit scalar version on *some* of the more recent processors. As # result 64-bit NEON version of SHA256 was added to provide best # all-round performance. For example it executes ~30% faster on X-Gene # and Mongoose. [For reference, NEON version of SHA512 is bound to # deliver much less improvement, likely *negative* on Cortex-A5x. # Which is why NEON support is limited to SHA256.] $output=pop; $flavour=pop; Loading Loading @@ -195,6 +209,8 @@ $code.=<<___ if ($SZ==4); ldr w16,[x16] tst w16,#ARMV8_SHA256 b.ne .Lv8_entry tst w16,#ARMV7_NEON b.ne .Lneon_entry #endif ___ $code.=<<___; Loading Loading @@ -425,6 +441,296 @@ $code.=<<___; ___ } if ($SZ==4) { ######################################### NEON stuff # # You'll surely note a lot of similarities with sha256-armv4 module, # and of course it's not a coincidence. sha256-armv4 was used as # initial template, but was adapted for ARMv8 instruction set and # extensively re-tuned for all-round performance. 
# Register assignment for the NEON path: @V / $A..$H are w3..w10, $t0..$t4 are
# w11..w15, $Ktbl is x16, $Xfer is x17, @X is q0..q3 and $T0..$T7 are
# q4..q7,q16..q19.  $j is incremented once per round by body_00_15.
#
# AUTOLOAD: catches calls to otherwise undefined subs (e.g. &ushr_32) and
#   appends one assembly line to $code.  The opcode is the sub name with the
#   package prefix removed and the first underscore turned into a dot; the
#   last argument gets a leading # when it compares equal to its numeric form.
# Dscalar / Dlo / Dhi: map a qN or vN register name to dN, vN.d[0] or vN.d[1]
#   respectively, or to the empty string when the name does not match.
# Xupdate($body): calls $body four times to collect instruction strings in
#   @insns, then alternates NEON instruction emission with
#   eval(shift(@insns)), so both instruction streams end up interleaved in
#   $code.  The position of every eval fixes the emitted instruction order,
#   so statements here must not be reordered.
my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); my $Ktbl="x16"; my $Xfer="x17"; my @X = map("q$_",(0..3)); my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); my $j=0; sub AUTOLOAD() # thunk [simplified] x86-style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; my $arg = pop; $arg = "#$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; } sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } sub Xupdate() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T2,$T0,$sigma0[0]); eval(shift(@insns)); &ushr_32 ($T1,$T0,$sigma0[2]); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] eval(shift(@insns)); &sli_32 ($T2,$T0,32-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T3,$T0,$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T1,$T1,$T2); eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T3,$T0,32-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T4,$T7,$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T4,$T7,32-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T5,$T7,$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T3,$T7,$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &sli_u32 ($T3,$T7,32-$sigma1[1]); eval(shift(@insns)); 
# (Xupdate continues.)  Near its end it emits an ld1 from [$Ktbl] into $T0
# with a post-increment of 16, adds @X[0] to $T0, drains all but the last
# entry of @insns, stores $T0 through $Xfer, evals the last entry and rotates
# @X by moving its first element to the end.
#
# Xpreload($body): same interleaving scheme, but the NEON side only loads
#   from [$inp] into @X[0], applies rev32, adds the value loaded from [$Ktbl]
#   into $T0, stores $T0 through $Xfer and rotates @X.
# body_00_15: returns a list of strings; each string is Perl source that the
#   two subs above eval one element at a time.
eval(shift(@insns)); &eor_8 ($T5,$T5,$T4); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T6,@X[0],$sigma1[0]); eval(shift(@insns)); &ushr_32 ($T7,@X[0],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T6,@X[0],32-$sigma1[0]); eval(shift(@insns)); &ushr_32 ($T5,@X[0],$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T7,$T7,$T6); eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T5,@X[0],32-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &ld1_32 ("{$T0}","[$Ktbl], #16"); eval(shift(@insns)); &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T5,$T5,$T5); eval(shift(@insns)); eval(shift(@insns)); &mov (&Dhi($T5), &Dlo($T7)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 ($T0,$T0,@X[0]); while($#insns>=1) { eval(shift(@insns)); } &st1_32 ("{$T0}","[$Xfer], #16"); eval(shift(@insns)); push(@X,shift(@X)); # "rotate" X[] } sub Xpreload() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); eval(shift(@insns)); eval(shift(@insns)); &ld1_8 ("{@X[0]}","[$inp],#16"); eval(shift(@insns)); eval(shift(@insns)); &ld1_32 ("{$T0}","[$Ktbl],#16"); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &rev32 (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 ($T0,$T0,@X[0]); foreach (@insns) { eval; } # remaining instructions &st1_32 ("{$T0}","[$Xfer], #16"); push(@X,shift(@X)); # "rotate" X[] } sub body_00_15 () { ( '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. 
# (body_00_15 continues.)  The ldr entry reloads $t1 from
# [sp,#4*(($j+1)&15)] unless ($j&15)==15, and from [$Ktbl] when $j==15.  The
# final entry increments $j, rotates @V by moving its last element to the
# front, and swaps $t2 with $t3.
#
# After the sub, a heredoc appended to $code starts sha256_block_neon (label
# .Lneon_entry) and runs on into the following captured line; no comment is
# inserted at that boundary because it would land inside the heredoc text.
# That assembly reserves 16*4 bytes of stack, expands four Xupdate calls under
# label .L_00_48 and loops on bne after cmp $t1,#0, then expands four Xpreload
# calls, adds the words loaded from [$ctx] to $A..$H and stores them back
# before b.ne .L_00_48 or the function return.
'&add ($h,$h,$t1)', # h+=X[i]+K[i] '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past '&and ($t1,$f,$e)', '&bic ($t4,$g,$e)', '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past '&orr ($t1,$t1,$t4)', # Ch(e,f,g) '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', '&add ($h,$h,$t1)', # h+=Ch(e,f,g) '&ror ($t0,$t0,"#$Sigma1[0]")', '&eor ($t2,$a,$b)', # a^b, b^c in next round '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) '&add ($h,$h,$t0)', # h+=Sigma1(e) '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. '&ldr ($t1,"[$Ktbl]") if ($j==15);'. '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) '&ror ($t4,$t4,"#$Sigma0[0]")', '&add ($d,$d,$h)', # d+=h '&eor ($t3,$t3,$b)', # Maj(a,b,c) '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' ) } $code.=<<___; #ifdef __KERNEL__ .globl sha256_block_neon #endif .type sha256_block_neon,%function .align 4 sha256_block_neon: .Lneon_entry: stp x29, x30, [sp, #-16]! 
mov x29, sp sub sp,sp,#16*4 adr $Ktbl,.LK256 add $num,$inp,$num,lsl#6 // len to point at the end of inp ld1.8 {@X[0]},[$inp], #16 ld1.8 {@X[1]},[$inp], #16 ld1.8 {@X[2]},[$inp], #16 ld1.8 {@X[3]},[$inp], #16 ld1.32 {$T0},[$Ktbl], #16 ld1.32 {$T1},[$Ktbl], #16 ld1.32 {$T2},[$Ktbl], #16 ld1.32 {$T3},[$Ktbl], #16 rev32 @X[0],@X[0] // yes, even on rev32 @X[1],@X[1] // big-endian rev32 @X[2],@X[2] rev32 @X[3],@X[3] mov $Xfer,sp add.32 $T0,$T0,@X[0] add.32 $T1,$T1,@X[1] add.32 $T2,$T2,@X[2] st1.32 {$T0-$T1},[$Xfer], #32 add.32 $T3,$T3,@X[3] st1.32 {$T2-$T3},[$Xfer] sub $Xfer,$Xfer,#32 ldp $A,$B,[$ctx] ldp $C,$D,[$ctx,#8] ldp $E,$F,[$ctx,#16] ldp $G,$H,[$ctx,#24] ldr $t1,[sp,#0] mov $t2,wzr eor $t3,$B,$C mov $t4,wzr b .L_00_48 .align 4 .L_00_48: ___ &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); $code.=<<___; cmp $t1,#0 // check for K256 terminator ldr $t1,[sp,#0] sub $Xfer,$Xfer,#64 bne .L_00_48 sub $Ktbl,$Ktbl,#256 // rewind $Ktbl cmp $inp,$num mov $Xfer, #64 csel $Xfer, $Xfer, xzr, eq sub $inp,$inp,$Xfer // avoid SEGV mov $Xfer,sp ___ &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); $code.=<<___; add $A,$A,$t4 // h+=Sigma0(a) from the past ldp $t0,$t1,[$ctx,#0] add $A,$A,$t2 // h+=Maj(a,b,c) from the past ldp $t2,$t3,[$ctx,#8] add $A,$A,$t0 // accumulate add $B,$B,$t1 ldp $t0,$t1,[$ctx,#16] add $C,$C,$t2 add $D,$D,$t3 ldp $t2,$t3,[$ctx,#24] add $E,$E,$t0 add $F,$F,$t1 ldr $t1,[sp,#0] stp $A,$B,[$ctx,#0] add $G,$G,$t2 mov $t2,wzr stp $C,$D,[$ctx,#8] add $H,$H,$t3 stp $E,$F,[$ctx,#16] eor $t3,$B,$C stp $G,$H,[$ctx,#24] mov $t4,wzr mov $Xfer,sp b.ne .L_00_48 ldr x29,[x29] add sp,sp,#16*4+16 ret .size sha256_block_neon,.-sha256_block_neon ___ } $code.=<<___; #ifndef __KERNEL__ .comm OPENSSL_armcap_P,4,4 Loading Loading @@ -456,12 +762,15 @@ close SELF; foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/geo; s/\`([^\`]*)\`/eval($1)/ge; 
# Post-processing loop over the lines of $code (continued).  The substitutions
# visible here rewrite qN register names to vN.16b, drop a .8/.u8/.i8 suffix
# that is followed by whitespace, drop a .32-style suffix and in that case
# turn .16b into .4s, and turn .4s into .s on ld1/st1 lines that use a [0]
# lane index, before printing the line.
# NOTE(review): the capture lost the diff +/- markers, so statements carrying
# the /o flag and their counterparts without it appear side by side.
# Presumably the /o forms are the removed lines and the others are the
# replacements -- confirm against the real file before relying on this text.
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo; s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers s/\.\w?32\b//o and s/\.16b/\.4s/go; m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go; s/\.[ui]?8(\s)/$1/; s/\.\w?32\b// and s/\.16b/\.4s/g; m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; print $_,"\n"; } Loading Loading
# NOTE(review): what follows is a flattened diff-viewer capture of
# crypto/sha/asm/sha512-armv8.pl.  Each hunk has been joined onto a few long
# lines and viewer labels (Loading, Original line number, @@ ... @@) are mixed
# in, so it records a change rather than forming a runnable script.
#
# The next line carries three hunks:
#   1. the October 2016 rationale for adding a NEON SHA256 code path;
#   2. the dispatch hunk that, for $SZ==4, tests ARMV7_NEON in w16 right after
#      the ARMV8_SHA256 test and branches to .Lneon_entry;
#   3. the opening of the if ($SZ==4) section that holds the NEON code.
crypto/sha/asm/sha512-armv8.pl +313 −4 Original line number Diff line number Diff line Loading @@ -37,6 +37,20 @@ # indication of some compiler "pathology", most notably code # generated with -mgeneral-regs-only is significantly faster # and the gap is only 40-90%. # # October 2016. # # Originally it was reckoned that it makes no sense to implement NEON # version of SHA256 for 64-bit processors. This is because performance # improvement on most wide-spread Cortex-A5x processors was observed # to be marginal, same on Cortex-A53 and ~10% on A57. But then it was # observed that 32-bit NEON SHA256 performs significantly better than # 64-bit scalar version on *some* of the more recent processors. As # result 64-bit NEON version of SHA256 was added to provide best # all-round performance. For example it executes ~30% faster on X-Gene # and Mongoose. [For reference, NEON version of SHA512 is bound to # deliver much less improvement, likely *negative* on Cortex-A5x. # Which is why NEON support is limited to SHA256.] $output=pop; $flavour=pop; Loading Loading @@ -195,6 +209,8 @@ $code.=<<___ if ($SZ==4); ldr w16,[x16] tst w16,#ARMV8_SHA256 b.ne .Lv8_entry tst w16,#ARMV7_NEON b.ne .Lneon_entry #endif ___ $code.=<<___; Loading Loading @@ -425,6 +441,296 @@ $code.=<<___; ___ } if ($SZ==4) { ######################################### NEON stuff # # You'll surely note a lot of similarities with sha256-armv4 module, # and of course it's not a coincidence. sha256-armv4 was used as # initial template, but was adapted for ARMv8 instruction set and # extensively re-tuned for all-round performance. 
# Register assignment for the NEON path: @V / $A..$H are w3..w10, $t0..$t4 are
# w11..w15, $Ktbl is x16, $Xfer is x17, @X is q0..q3 and $T0..$T7 are
# q4..q7,q16..q19.  $j is incremented once per round by body_00_15.
#
# AUTOLOAD: catches calls to otherwise undefined subs (e.g. &ushr_32) and
#   appends one assembly line to $code.  The opcode is the sub name with the
#   package prefix removed and the first underscore turned into a dot; the
#   last argument gets a leading # when it compares equal to its numeric form.
# Dscalar / Dlo / Dhi: map a qN or vN register name to dN, vN.d[0] or vN.d[1]
#   respectively, or to the empty string when the name does not match.
# Xupdate($body): calls $body four times to collect instruction strings in
#   @insns, then alternates NEON instruction emission with
#   eval(shift(@insns)), so both instruction streams end up interleaved in
#   $code.  The position of every eval fixes the emitted instruction order,
#   so statements here must not be reordered.
my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); my $Ktbl="x16"; my $Xfer="x17"; my @X = map("q$_",(0..3)); my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); my $j=0; sub AUTOLOAD() # thunk [simplified] x86-style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; my $arg = pop; $arg = "#$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; } sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } sub Xupdate() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T2,$T0,$sigma0[0]); eval(shift(@insns)); &ushr_32 ($T1,$T0,$sigma0[2]); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] eval(shift(@insns)); &sli_32 ($T2,$T0,32-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T3,$T0,$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T1,$T1,$T2); eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T3,$T0,32-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T4,$T7,$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T4,$T7,32-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T5,$T7,$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T3,$T7,$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &sli_u32 ($T3,$T7,32-$sigma1[1]); eval(shift(@insns)); 
# (Xupdate continues.)  Near its end it emits an ld1 from [$Ktbl] into $T0
# with a post-increment of 16, adds @X[0] to $T0, drains all but the last
# entry of @insns, stores $T0 through $Xfer, evals the last entry and rotates
# @X by moving its first element to the end.
#
# Xpreload($body): same interleaving scheme, but the NEON side only loads
#   from [$inp] into @X[0], applies rev32, adds the value loaded from [$Ktbl]
#   into $T0, stores $T0 through $Xfer and rotates @X.
# body_00_15: returns a list of strings; each string is Perl source that the
#   two subs above eval one element at a time.
eval(shift(@insns)); &eor_8 ($T5,$T5,$T4); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T6,@X[0],$sigma1[0]); eval(shift(@insns)); &ushr_32 ($T7,@X[0],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T6,@X[0],32-$sigma1[0]); eval(shift(@insns)); &ushr_32 ($T5,@X[0],$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T7,$T7,$T6); eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T5,@X[0],32-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &ld1_32 ("{$T0}","[$Ktbl], #16"); eval(shift(@insns)); &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T5,$T5,$T5); eval(shift(@insns)); eval(shift(@insns)); &mov (&Dhi($T5), &Dlo($T7)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 ($T0,$T0,@X[0]); while($#insns>=1) { eval(shift(@insns)); } &st1_32 ("{$T0}","[$Xfer], #16"); eval(shift(@insns)); push(@X,shift(@X)); # "rotate" X[] } sub Xpreload() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); eval(shift(@insns)); eval(shift(@insns)); &ld1_8 ("{@X[0]}","[$inp],#16"); eval(shift(@insns)); eval(shift(@insns)); &ld1_32 ("{$T0}","[$Ktbl],#16"); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &rev32 (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 ($T0,$T0,@X[0]); foreach (@insns) { eval; } # remaining instructions &st1_32 ("{$T0}","[$Xfer], #16"); push(@X,shift(@X)); # "rotate" X[] } sub body_00_15 () { ( '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. 
# (body_00_15 continues.)  The ldr entry reloads $t1 from
# [sp,#4*(($j+1)&15)] unless ($j&15)==15, and from [$Ktbl] when $j==15.  The
# final entry increments $j, rotates @V by moving its last element to the
# front, and swaps $t2 with $t3.
#
# After the sub, a heredoc appended to $code starts sha256_block_neon (label
# .Lneon_entry) and runs on into the following captured line; no comment is
# inserted at that boundary because it would land inside the heredoc text.
# That assembly reserves 16*4 bytes of stack, expands four Xupdate calls under
# label .L_00_48 and loops on bne after cmp $t1,#0, then expands four Xpreload
# calls, adds the words loaded from [$ctx] to $A..$H and stores them back
# before b.ne .L_00_48 or the function return.
'&add ($h,$h,$t1)', # h+=X[i]+K[i] '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past '&and ($t1,$f,$e)', '&bic ($t4,$g,$e)', '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past '&orr ($t1,$t1,$t4)', # Ch(e,f,g) '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', '&add ($h,$h,$t1)', # h+=Ch(e,f,g) '&ror ($t0,$t0,"#$Sigma1[0]")', '&eor ($t2,$a,$b)', # a^b, b^c in next round '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) '&add ($h,$h,$t0)', # h+=Sigma1(e) '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. '&ldr ($t1,"[$Ktbl]") if ($j==15);'. '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) '&ror ($t4,$t4,"#$Sigma0[0]")', '&add ($d,$d,$h)', # d+=h '&eor ($t3,$t3,$b)', # Maj(a,b,c) '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' ) } $code.=<<___; #ifdef __KERNEL__ .globl sha256_block_neon #endif .type sha256_block_neon,%function .align 4 sha256_block_neon: .Lneon_entry: stp x29, x30, [sp, #-16]! 
mov x29, sp sub sp,sp,#16*4 adr $Ktbl,.LK256 add $num,$inp,$num,lsl#6 // len to point at the end of inp ld1.8 {@X[0]},[$inp], #16 ld1.8 {@X[1]},[$inp], #16 ld1.8 {@X[2]},[$inp], #16 ld1.8 {@X[3]},[$inp], #16 ld1.32 {$T0},[$Ktbl], #16 ld1.32 {$T1},[$Ktbl], #16 ld1.32 {$T2},[$Ktbl], #16 ld1.32 {$T3},[$Ktbl], #16 rev32 @X[0],@X[0] // yes, even on rev32 @X[1],@X[1] // big-endian rev32 @X[2],@X[2] rev32 @X[3],@X[3] mov $Xfer,sp add.32 $T0,$T0,@X[0] add.32 $T1,$T1,@X[1] add.32 $T2,$T2,@X[2] st1.32 {$T0-$T1},[$Xfer], #32 add.32 $T3,$T3,@X[3] st1.32 {$T2-$T3},[$Xfer] sub $Xfer,$Xfer,#32 ldp $A,$B,[$ctx] ldp $C,$D,[$ctx,#8] ldp $E,$F,[$ctx,#16] ldp $G,$H,[$ctx,#24] ldr $t1,[sp,#0] mov $t2,wzr eor $t3,$B,$C mov $t4,wzr b .L_00_48 .align 4 .L_00_48: ___ &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); $code.=<<___; cmp $t1,#0 // check for K256 terminator ldr $t1,[sp,#0] sub $Xfer,$Xfer,#64 bne .L_00_48 sub $Ktbl,$Ktbl,#256 // rewind $Ktbl cmp $inp,$num mov $Xfer, #64 csel $Xfer, $Xfer, xzr, eq sub $inp,$inp,$Xfer // avoid SEGV mov $Xfer,sp ___ &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); $code.=<<___; add $A,$A,$t4 // h+=Sigma0(a) from the past ldp $t0,$t1,[$ctx,#0] add $A,$A,$t2 // h+=Maj(a,b,c) from the past ldp $t2,$t3,[$ctx,#8] add $A,$A,$t0 // accumulate add $B,$B,$t1 ldp $t0,$t1,[$ctx,#16] add $C,$C,$t2 add $D,$D,$t3 ldp $t2,$t3,[$ctx,#24] add $E,$E,$t0 add $F,$F,$t1 ldr $t1,[sp,#0] stp $A,$B,[$ctx,#0] add $G,$G,$t2 mov $t2,wzr stp $C,$D,[$ctx,#8] add $H,$H,$t3 stp $E,$F,[$ctx,#16] eor $t3,$B,$C stp $G,$H,[$ctx,#24] mov $t4,wzr mov $Xfer,sp b.ne .L_00_48 ldr x29,[x29] add sp,sp,#16*4+16 ret .size sha256_block_neon,.-sha256_block_neon ___ } $code.=<<___; #ifndef __KERNEL__ .comm OPENSSL_armcap_P,4,4 Loading Loading @@ -456,12 +762,15 @@ close SELF; foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/geo; s/\`([^\`]*)\`/eval($1)/ge; 
# Post-processing loop over the lines of $code (continued).  The substitutions
# visible here rewrite qN register names to vN.16b, drop a .8/.u8/.i8 suffix
# that is followed by whitespace, drop a .32-style suffix and in that case
# turn .16b into .4s, and turn .4s into .s on ld1/st1 lines that use a [0]
# lane index, before printing the line.
# NOTE(review): the capture lost the diff +/- markers, so statements carrying
# the /o flag and their counterparts without it appear side by side.
# Presumably the /o forms are the removed lines and the others are the
# replacements -- confirm against the real file before relying on this text.
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo; s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers s/\.\w?32\b//o and s/\.16b/\.4s/go; m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go; s/\.[ui]?8(\s)/$1/; s/\.\w?32\b// and s/\.16b/\.4s/g; m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; print $_,"\n"; } Loading