Loading crypto/sha/asm/sha256-armv4.pl +318 −17 Original line number Diff line number Diff line #!/usr/bin/env perl # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. Loading @@ -21,7 +21,15 @@ # February 2011. # # Profiler-assisted and platform-specific optimization resulted in 16% # improvement on Cortex A8 core and ~16.4 cycles per processed byte. # improvement on Cortex A8 core and ~15.4 cycles per processed byte. # September 2013. # # Add NEON implementation. On Cortex A8 it was measured to process one # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only # code (meaning that latter performs sub-optimally, nothing was done # about it). while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; Loading Loading @@ -56,10 +64,10 @@ $code.=<<___ if ($i<16); # if $i==15 str $inp,[sp,#17*4] @ make room for $t4 # endif mov $t0,$e,ror#$Sigma1[0] eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` add $a,$a,$t2 @ h+=Maj(a,b,c) from the past eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) rev $t1,$t1 eor $t0,$t0,$e,ror#$Sigma1[1] #else @ ldrb $t1,[$inp,#3] @ $i add $a,$a,$t2 @ h+=Maj(a,b,c) from the past Loading @@ -71,9 +79,9 @@ $code.=<<___ if ($i<16); # if $i==15 str $inp,[sp,#17*4] @ make room for $t4 # endif mov $t0,$e,ror#$Sigma1[0] eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` orr $t1,$t1,$t2,lsl#24 eor $t0,$t0,$e,ror#$Sigma1[1] eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) #endif ___ $code.=<<___; Loading @@ -81,12 +89,11 @@ $code.=<<___; add $h,$h,$t1 @ h+=X[i] str $t1,[sp,#`$i%16`*4] eor $t1,$f,$g eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) and $t1,$t1,$e add $h,$h,$t0 @ h+=Sigma1(e) eor $t1,$t1,$g @ Ch(e,f,g) add $h,$h,$t2 @ h+=K256[i] mov $t0,$a,ror#$Sigma0[0] eor $t1,$t1,$g @ Ch(e,f,g) eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` add $h,$h,$t1 @ h+=Ch(e,f,g) #if $i==31 and $t2,$t2,#0xff Loading @@ -104,12 +111,11 @@ $code.=<<___; eor $t2,$a,$b @ a^b, b^c in next round ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx #endif eor $t0,$t0,$a,ror#$Sigma0[1] eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) and $t3,$t3,$t2 @ (b^c)&=(a^b) add $d,$d,$h @ d+=h eor $t0,$t0,$a,ror#$Sigma0[2] @ Sigma0(a) eor $t3,$t3,$b @ Maj(a,b,c) add $h,$h,$t0 @ h+=Sigma0(a) add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) @ add $h,$h,$t3 @ h+=Maj(a,b,c) ___ ($t2,$t3)=($t3,$t2); Loading @@ -132,9 +138,9 @@ $code.=<<___; ldr $t4,[sp,#`($i+9)%16`*4] add $t2,$t2,$t0 mov $t0,$e,ror#$Sigma1[0] @ from BODY_00_15 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 add $t1,$t1,$t2 eor $t0,$t0,$e,ror#$Sigma1[1] @ from BODY_00_15 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) add $t1,$t1,$t4 @ X[i] ___ &BODY_00_15(@_); Loading Loading @@ -166,15 +172,25 @@ K256: .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .size K256,.-K256 .word 0 @ terminator .LOPENSSL_armcap: .word OPENSSL_armcap_P-sha256_block_data_order .align 5 .global sha256_block_data_order .type sha256_block_data_order,%function sha256_block_data_order: sub r3,pc,#8 @ sha256_block_data_order add $len,$inp,$len,lsl#6 @ len to point at the end of inp #if __ARM_ARCH__>=7 ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P tst r12,#1 bne .LNEON #endif stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} sub $Ktbl,r3,#256 @ K256 sub $Ktbl,r3,#256+32 @ K256 sub sp,sp,#16*4 @ alloca(X[16]) .Loop: # if __ARM_ARCH__>=7 Loading Loading @@ -225,9 +241,294 @@ $code.=<<___; moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif ___ ###################################################################### # NEON stuff # {{{ my @X=map("q$_",(0..3)); my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); my $Xfer=$t4; my $j=0; sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } sub AUTOLOAD() # thunk [simplified] x86-style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; my $arg = pop; $arg = "#$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; } sub Xupdate() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T2,$T0,$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T1,$T0,$sigma0[2]); eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T2,$T0,32-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T3,$T0,$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &veor ($T1,$T1,$T2); eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T3,$T0,32-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &veor ($T1,$T1,$T3); # sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &veor ($T5,$T5,$T4); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &veor ($T5,$T5,$T4); # sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &veor ($T5,$T5,$T4); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vld1_32 ("{$T0}","[$Ktbl,:128]!"); eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &veor ($T5,$T5,$T4); # sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 ($T0,$T0,@X[0]); while($#insns>=2) { eval(shift(@insns)); } &vst1_32 ("{$T0}","[$Xfer,:128]!"); eval(shift(@insns)); eval(shift(@insns)); push(@X,shift(@X)); # "rotate" X[] } sub Xpreload() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vld1_32 ("{$T0}","[$Ktbl,:128]!"); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vrev32_8 (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 ($T0,$T0,@X[0]); foreach (@insns) { eval; } # remaining instructions &vst1_32 ("{$T0}","[$Xfer,:128]!"); push(@X,shift(@X)); # "rotate" X[] } sub body_00_15 () { ( '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. '&add ($h,$h,$t1)', # h+=X[i]+K[i] '&eor ($t1,$f,$g)', '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past '&and ($t1,$t1,$e)', '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', '&eor ($t1,$t1,$g)', # Ch(e,f,g) '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) '&eor ($t2,$a,$b)', # a^b, b^c in next round '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) '&add ($h,$h,$t1)', # h+=Ch(e,f,g) '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. '&ldr ($t1,"[$Ktbl]") if ($j==15);'. '&ldr ($t1,"[sp,#64]") if ($j==31)', '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) '&add ($d,$d,$h)', # d+=h '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a) '&eor ($t3,$t3,$b)', # Maj(a,b,c) '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' ) } $code.=<<___; #if __ARM_ARCH__>=7 .fpu neon .align 4 .LNEON: stmdb sp!,{r4-r12,lr} mov $t2,sp sub sp,sp,#16*4+16 @ alloca sub $Ktbl,r3,#256+32 @ K256 bic sp,sp,#15 @ align for 128-bit stores vld1.8 {@X[0]},[$inp]! vld1.8 {@X[1]},[$inp]! vld1.8 {@X[2]},[$inp]! vld1.8 {@X[3]},[$inp]! vld1.32 {$T0},[$Ktbl,:128]! vld1.32 {$T1},[$Ktbl,:128]! vld1.32 {$T2},[$Ktbl,:128]! vld1.32 {$T3},[$Ktbl,:128]! vrev32.8 @X[0],@X[0] @ yes, even on str $ctx,[sp,#64] vrev32.8 @X[1],@X[1] @ big-endian str $inp,[sp,#68] mov $Xfer,sp vrev32.8 @X[2],@X[2] str $len,[sp,#72] vrev32.8 @X[3],@X[3] str $t2,[sp,#76] @ save original sp vadd.i32 $T0,$T0,@X[0] vadd.i32 $T1,$T1,@X[1] vst1.32 {$T0},[$Xfer,:128]! vadd.i32 $T2,$T2,@X[2] vst1.32 {$T1},[$Xfer,:128]! vadd.i32 $T3,$T3,@X[3] vst1.32 {$T2},[$Xfer,:128]! vst1.32 {$T3},[$Xfer,:128]! ldmia $ctx,{$A-$H} sub $Xfer,$Xfer,#64 ldr $t1,[sp,#0] eor $t2,$t2,$t2 eor $t3,$B,$C b .L_00_48 .align 4 .L_00_48: ___ &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); $code.=<<___; teq $t1,#0 @ check for K256 terminator ldr $t1,[sp,#0] sub $Xfer,$Xfer,#64 bne .L_00_48 ldr $inp,[sp,#68] ldr $t0,[sp,#72] sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl teq $inp,$t0 subeq $inp,$inp,#64 @ avoid SEGV vld1.8 {@X[0]},[$inp]! @ load next input block vld1.8 {@X[1]},[$inp]! vld1.8 {@X[2]},[$inp]! vld1.8 {@X[3]},[$inp]! strne $inp,[sp,#68] mov $Xfer,sp ___ &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); $code.=<<___; ldr $t0,[$t1,#0] add $A,$A,$t2 @ h+=Maj(a,b,c) from the past ldr $t2,[$t1,#4] ldr $t3,[$t1,#8] ldr $t4,[$t1,#12] add $A,$A,$t0 @ accumulate ldr $t0,[$t1,#16] add $B,$B,$t2 ldr $t2,[$t1,#20] add $C,$C,$t3 ldr $t3,[$t1,#24] add $D,$D,$t4 ldr $t4,[$t1,#28] add $E,$E,$t0 str $A,[$t1],#4 add $F,$F,$t2 str $B,[$t1],#4 add $G,$G,$t3 str $C,[$t1],#4 add $H,$H,$t4 str $D,[$t1],#4 stmia $t1,{$E-$H} movne $Xfer,sp ldrne $t1,[sp,#0] eorne $t2,$t2,$t2 ldreq sp,[sp,#76] @ restore original sp eorne $t3,$B,$C bne .L_00_48 ldmia sp!,{r4-r12,pc} #endif ___ }}} $code.=<<___; .size sha256_block_data_order,.-sha256_block_data_order .asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" .asciz "SHA256 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" .align 2 .comm OPENSSL_armcap_P,4,4 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; Loading Loading
crypto/sha/asm/sha256-armv4.pl +318 −17 Original line number Diff line number Diff line #!/usr/bin/env perl # ==================================================================== # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. Loading @@ -21,7 +21,15 @@ # February 2011. # # Profiler-assisted and platform-specific optimization resulted in 16% # improvement on Cortex A8 core and ~16.4 cycles per processed byte. # improvement on Cortex A8 core and ~15.4 cycles per processed byte. # September 2013. # # Add NEON implementation. On Cortex A8 it was measured to process one # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only # code (meaning that latter performs sub-optimally, nothing was done # about it). while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; Loading Loading @@ -56,10 +64,10 @@ $code.=<<___ if ($i<16); # if $i==15 str $inp,[sp,#17*4] @ make room for $t4 # endif mov $t0,$e,ror#$Sigma1[0] eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` add $a,$a,$t2 @ h+=Maj(a,b,c) from the past eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) rev $t1,$t1 eor $t0,$t0,$e,ror#$Sigma1[1] #else @ ldrb $t1,[$inp,#3] @ $i add $a,$a,$t2 @ h+=Maj(a,b,c) from the past Loading @@ -71,9 +79,9 @@ $code.=<<___ if ($i<16); # if $i==15 str $inp,[sp,#17*4] @ make room for $t4 # endif mov $t0,$e,ror#$Sigma1[0] eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` orr $t1,$t1,$t2,lsl#24 eor $t0,$t0,$e,ror#$Sigma1[1] eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) #endif ___ $code.=<<___; Loading @@ -81,12 +89,11 @@ $code.=<<___; add $h,$h,$t1 @ h+=X[i] str $t1,[sp,#`$i%16`*4] eor $t1,$f,$g eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) and $t1,$t1,$e add $h,$h,$t0 @ h+=Sigma1(e) eor $t1,$t1,$g @ Ch(e,f,g) add $h,$h,$t2 @ h+=K256[i] mov $t0,$a,ror#$Sigma0[0] eor $t1,$t1,$g @ Ch(e,f,g) eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` add $h,$h,$t1 @ h+=Ch(e,f,g) #if $i==31 and $t2,$t2,#0xff Loading @@ -104,12 +111,11 @@ $code.=<<___; eor $t2,$a,$b @ a^b, b^c in next round ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx #endif eor $t0,$t0,$a,ror#$Sigma0[1] eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) and $t3,$t3,$t2 @ (b^c)&=(a^b) add $d,$d,$h @ d+=h eor $t0,$t0,$a,ror#$Sigma0[2] @ Sigma0(a) eor $t3,$t3,$b @ Maj(a,b,c) add $h,$h,$t0 @ h+=Sigma0(a) add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) @ add $h,$h,$t3 @ h+=Maj(a,b,c) ___ ($t2,$t3)=($t3,$t2); Loading @@ -132,9 +138,9 @@ $code.=<<___; ldr $t4,[sp,#`($i+9)%16`*4] add $t2,$t2,$t0 mov $t0,$e,ror#$Sigma1[0] @ from BODY_00_15 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 add $t1,$t1,$t2 eor $t0,$t0,$e,ror#$Sigma1[1] @ from BODY_00_15 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) add $t1,$t1,$t4 @ X[i] ___ &BODY_00_15(@_); Loading Loading @@ -166,15 +172,25 @@ K256: .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .size K256,.-K256 .word 0 @ terminator .LOPENSSL_armcap: .word OPENSSL_armcap_P-sha256_block_data_order .align 5 .global sha256_block_data_order .type sha256_block_data_order,%function sha256_block_data_order: sub r3,pc,#8 @ sha256_block_data_order add $len,$inp,$len,lsl#6 @ len to point at the end of inp #if __ARM_ARCH__>=7 ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P tst r12,#1 bne .LNEON #endif stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} sub $Ktbl,r3,#256 @ K256 sub $Ktbl,r3,#256+32 @ K256 sub sp,sp,#16*4 @ alloca(X[16]) .Loop: # if __ARM_ARCH__>=7 Loading Loading @@ -225,9 +241,294 @@ $code.=<<___; moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif ___ ###################################################################### # NEON stuff # {{{ my @X=map("q$_",(0..3)); my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); my $Xfer=$t4; my $j=0; sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } sub AUTOLOAD() # thunk [simplified] x86-style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; my $arg = pop; $arg = "#$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; } sub Xupdate() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T2,$T0,$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T1,$T0,$sigma0[2]); eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T2,$T0,32-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T3,$T0,$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &veor ($T1,$T1,$T2); eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T3,$T0,32-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &veor ($T1,$T1,$T3); # sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &veor ($T5,$T5,$T4); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &veor ($T5,$T5,$T4); # sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &veor ($T5,$T5,$T4); eval(shift(@insns)); eval(shift(@insns)); &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vld1_32 ("{$T0}","[$Ktbl,:128]!"); eval(shift(@insns)); eval(shift(@insns)); &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &veor ($T5,$T5,$T4); # sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 ($T0,$T0,@X[0]); while($#insns>=2) { eval(shift(@insns)); } &vst1_32 ("{$T0}","[$Xfer,:128]!"); eval(shift(@insns)); eval(shift(@insns)); push(@X,shift(@X)); # "rotate" X[] } sub Xpreload() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vld1_32 ("{$T0}","[$Ktbl,:128]!"); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vrev32_8 (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vadd_i32 ($T0,$T0,@X[0]); foreach (@insns) { eval; } # remaining instructions &vst1_32 ("{$T0}","[$Xfer,:128]!"); push(@X,shift(@X)); # "rotate" X[] } sub body_00_15 () { ( '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. '&add ($h,$h,$t1)', # h+=X[i]+K[i] '&eor ($t1,$f,$g)', '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past '&and ($t1,$t1,$e)', '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', '&eor ($t1,$t1,$g)', # Ch(e,f,g) '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) '&eor ($t2,$a,$b)', # a^b, b^c in next round '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) '&add ($h,$h,$t1)', # h+=Ch(e,f,g) '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. '&ldr ($t1,"[$Ktbl]") if ($j==15);'. '&ldr ($t1,"[sp,#64]") if ($j==31)', '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) '&add ($d,$d,$h)', # d+=h '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a) '&eor ($t3,$t3,$b)', # Maj(a,b,c) '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' ) } $code.=<<___; #if __ARM_ARCH__>=7 .fpu neon .align 4 .LNEON: stmdb sp!,{r4-r12,lr} mov $t2,sp sub sp,sp,#16*4+16 @ alloca sub $Ktbl,r3,#256+32 @ K256 bic sp,sp,#15 @ align for 128-bit stores vld1.8 {@X[0]},[$inp]! vld1.8 {@X[1]},[$inp]! vld1.8 {@X[2]},[$inp]! vld1.8 {@X[3]},[$inp]! vld1.32 {$T0},[$Ktbl,:128]! vld1.32 {$T1},[$Ktbl,:128]! vld1.32 {$T2},[$Ktbl,:128]! vld1.32 {$T3},[$Ktbl,:128]! vrev32.8 @X[0],@X[0] @ yes, even on str $ctx,[sp,#64] vrev32.8 @X[1],@X[1] @ big-endian str $inp,[sp,#68] mov $Xfer,sp vrev32.8 @X[2],@X[2] str $len,[sp,#72] vrev32.8 @X[3],@X[3] str $t2,[sp,#76] @ save original sp vadd.i32 $T0,$T0,@X[0] vadd.i32 $T1,$T1,@X[1] vst1.32 {$T0},[$Xfer,:128]! vadd.i32 $T2,$T2,@X[2] vst1.32 {$T1},[$Xfer,:128]! vadd.i32 $T3,$T3,@X[3] vst1.32 {$T2},[$Xfer,:128]! vst1.32 {$T3},[$Xfer,:128]! ldmia $ctx,{$A-$H} sub $Xfer,$Xfer,#64 ldr $t1,[sp,#0] eor $t2,$t2,$t2 eor $t3,$B,$C b .L_00_48 .align 4 .L_00_48: ___ &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); $code.=<<___; teq $t1,#0 @ check for K256 terminator ldr $t1,[sp,#0] sub $Xfer,$Xfer,#64 bne .L_00_48 ldr $inp,[sp,#68] ldr $t0,[sp,#72] sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl teq $inp,$t0 subeq $inp,$inp,#64 @ avoid SEGV vld1.8 {@X[0]},[$inp]! @ load next input block vld1.8 {@X[1]},[$inp]! vld1.8 {@X[2]},[$inp]! vld1.8 {@X[3]},[$inp]! strne $inp,[sp,#68] mov $Xfer,sp ___ &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); $code.=<<___; ldr $t0,[$t1,#0] add $A,$A,$t2 @ h+=Maj(a,b,c) from the past ldr $t2,[$t1,#4] ldr $t3,[$t1,#8] ldr $t4,[$t1,#12] add $A,$A,$t0 @ accumulate ldr $t0,[$t1,#16] add $B,$B,$t2 ldr $t2,[$t1,#20] add $C,$C,$t3 ldr $t3,[$t1,#24] add $D,$D,$t4 ldr $t4,[$t1,#28] add $E,$E,$t0 str $A,[$t1],#4 add $F,$F,$t2 str $B,[$t1],#4 add $G,$G,$t3 str $C,[$t1],#4 add $H,$H,$t4 str $D,[$t1],#4 stmia $t1,{$E-$H} movne $Xfer,sp ldrne $t1,[sp,#0] eorne $t2,$t2,$t2 ldreq sp,[sp,#76] @ restore original sp eorne $t3,$B,$C bne .L_00_48 ldmia sp!,{r4-r12,pc} #endif ___ }}} $code.=<<___; .size sha256_block_data_order,.-sha256_block_data_order .asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" .asciz "SHA256 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" .align 2 .comm OPENSSL_armcap_P,4,4 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; Loading