Loading crypto/sha/asm/sha1-armv4-large.pl +30 −48 Original line number Diff line number Diff line Loading @@ -41,6 +41,13 @@ # issue Cortex A8 core was measured to process input block in # ~990 cycles. # August 2010. # # Rescheduling for dual-issue pipeline resulted in 13% improvement on # Cortex A8 core and in absolute terms ~870 cycles per input block # [or 13.6 cycles per byte]. while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; Loading @@ -60,43 +67,22 @@ $t3="r12"; $Xi="r14"; @V=($a,$b,$c,$d,$e); # One can optimize this for aligned access on big-endian architecture, # but code's endian neutrality makes it too pretty:-) sub Xload { my ($a,$b,$c,$d,$e)=@_; $code.=<<___; ldrb $t0,[$inp],#4 ldrb $t1,[$inp,#-3] ldrb $t2,[$inp,#-2] ldrb $t3,[$inp,#-1] add $e,$K,$e,ror#2 @ E+=K_00_19 orr $t0,$t1,$t0,lsl#8 add $e,$e,$a,ror#27 @ E+=ROR(A,27) orr $t0,$t2,$t0,lsl#8 eor $t1,$c,$d @ F_xx_xx orr $t0,$t3,$t0,lsl#8 add $e,$e,$t0 @ E+=X[i] str $t0,[$Xi,#-4]! ___ } sub Xupdate { my ($a,$b,$c,$d,$e,$flag)=@_; my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_; $code.=<<___; ldr $t0,[$Xi,#15*4] ldr $t1,[$Xi,#13*4] ldr $t2,[$Xi,#7*4] ldr $t3,[$Xi,#2*4] add $e,$K,$e,ror#2 @ E+=K_xx_xx ldr $t3,[$Xi,#2*4] eor $t0,$t0,$t1 eor $t2,$t2,$t3 eor $t0,$t0,$t2 add $e,$e,$a,ror#27 @ E+=ROR(A,27) ___ $code.=<<___ if (!defined($flag)); eor $t1,$c,$d @ F_xx_xx, but not in 40_59 ___ $code.=<<___; eor $t1,$c,$d @ F_xx_xx mov $t0,$t0,ror#31 add $e,$e,$a,ror#27 @ E+=ROR(A,27) eor $t0,$t0,$t2,ror#31 $opt1 @ F_xx_xx $opt2 @ F_xx_xx add $e,$e,$t0 @ E+=X[i] str $t0,[$Xi,#-4]! ___ Loading @@ -104,19 +90,29 @@ ___ sub BODY_00_15 { my ($a,$b,$c,$d,$e)=@_; &Xload(@_); $code.=<<___; ldrb $t0,[$inp],#4 ldrb $t1,[$inp,#-1] ldrb $t2,[$inp,#-2] add $e,$K,$e,ror#2 @ E+=K_00_19 ldrb $t3,[$inp,#-3] add $e,$e,$a,ror#27 @ E+=ROR(A,27) orr $t0,$t1,$t0,lsl#24 eor $t1,$c,$d @ F_xx_xx orr $t0,$t0,$t2,lsl#8 orr $t0,$t0,$t3,lsl#16 and $t1,$b,$t1,ror#2 add $e,$e,$t0 @ E+=X[i] eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) str $t0,[$Xi,#-4]! add $e,$e,$t1 @ E+=F_00_19(B,C,D) ___ } sub BODY_16_19 { my ($a,$b,$c,$d,$e)=@_; &Xupdate(@_); &Xupdate(@_,"and $t1,$b,$t1,ror#2"); $code.=<<___; and $t1,$b,$t1,ror#2 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) add $e,$e,$t1 @ E+=F_00_19(B,C,D) ___ Loading @@ -124,34 +120,20 @@ ___ sub BODY_20_39 { my ($a,$b,$c,$d,$e)=@_; &Xupdate(@_); &Xupdate(@_,"eor $t1,$b,$t1,ror#2"); $code.=<<___; eor $t1,$b,$t1,ror#2 @ F_20_39(B,C,D) add $e,$e,$t1 @ E+=F_20_39(B,C,D) ___ } sub BODY_40_59 { my ($a,$b,$c,$d,$e)=@_; if (1) { &Xupdate(@_); &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d"); $code.=<<___; and $t2,$c,$d and $t1,$b,$t1,ror#2 add $e,$e,$t2,ror#2 add $e,$e,$t1 @ E+=F_40_59(B,C,D) ___ } else { &Xupdate(@_,1); $code.=<<___; and $t1,$b,$c,ror#2 orr $t2,$b,$c,ror#2 and $t2,$t2,$d,ror#2 orr $t1,$t1,$t2 @ F_40_59(B,C,D) add $e,$e,$t1 @ E+=F_40_59(B,C,D) add $e,$e,$t2,ror#2 ___ } } $code=<<___; .text Loading Loading
crypto/sha/asm/sha1-armv4-large.pl +30 −48 Original line number Diff line number Diff line Loading @@ -41,6 +41,13 @@ # issue Cortex A8 core was measured to process input block in # ~990 cycles. # August 2010. # # Rescheduling for dual-issue pipeline resulted in 13% improvement on # Cortex A8 core and in absolute terms ~870 cycles per input block # [or 13.6 cycles per byte]. while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; Loading @@ -60,43 +67,22 @@ $t3="r12"; $Xi="r14"; @V=($a,$b,$c,$d,$e); # One can optimize this for aligned access on big-endian architecture, # but code's endian neutrality makes it too pretty:-) sub Xload { my ($a,$b,$c,$d,$e)=@_; $code.=<<___; ldrb $t0,[$inp],#4 ldrb $t1,[$inp,#-3] ldrb $t2,[$inp,#-2] ldrb $t3,[$inp,#-1] add $e,$K,$e,ror#2 @ E+=K_00_19 orr $t0,$t1,$t0,lsl#8 add $e,$e,$a,ror#27 @ E+=ROR(A,27) orr $t0,$t2,$t0,lsl#8 eor $t1,$c,$d @ F_xx_xx orr $t0,$t3,$t0,lsl#8 add $e,$e,$t0 @ E+=X[i] str $t0,[$Xi,#-4]! ___ } sub Xupdate { my ($a,$b,$c,$d,$e,$flag)=@_; my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_; $code.=<<___; ldr $t0,[$Xi,#15*4] ldr $t1,[$Xi,#13*4] ldr $t2,[$Xi,#7*4] ldr $t3,[$Xi,#2*4] add $e,$K,$e,ror#2 @ E+=K_xx_xx ldr $t3,[$Xi,#2*4] eor $t0,$t0,$t1 eor $t2,$t2,$t3 eor $t0,$t0,$t2 add $e,$e,$a,ror#27 @ E+=ROR(A,27) ___ $code.=<<___ if (!defined($flag)); eor $t1,$c,$d @ F_xx_xx, but not in 40_59 ___ $code.=<<___; eor $t1,$c,$d @ F_xx_xx mov $t0,$t0,ror#31 add $e,$e,$a,ror#27 @ E+=ROR(A,27) eor $t0,$t0,$t2,ror#31 $opt1 @ F_xx_xx $opt2 @ F_xx_xx add $e,$e,$t0 @ E+=X[i] str $t0,[$Xi,#-4]! ___ Loading @@ -104,19 +90,29 @@ ___ sub BODY_00_15 { my ($a,$b,$c,$d,$e)=@_; &Xload(@_); $code.=<<___; ldrb $t0,[$inp],#4 ldrb $t1,[$inp,#-1] ldrb $t2,[$inp,#-2] add $e,$K,$e,ror#2 @ E+=K_00_19 ldrb $t3,[$inp,#-3] add $e,$e,$a,ror#27 @ E+=ROR(A,27) orr $t0,$t1,$t0,lsl#24 eor $t1,$c,$d @ F_xx_xx orr $t0,$t0,$t2,lsl#8 orr $t0,$t0,$t3,lsl#16 and $t1,$b,$t1,ror#2 add $e,$e,$t0 @ E+=X[i] eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) str $t0,[$Xi,#-4]! add $e,$e,$t1 @ E+=F_00_19(B,C,D) ___ } sub BODY_16_19 { my ($a,$b,$c,$d,$e)=@_; &Xupdate(@_); &Xupdate(@_,"and $t1,$b,$t1,ror#2"); $code.=<<___; and $t1,$b,$t1,ror#2 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) add $e,$e,$t1 @ E+=F_00_19(B,C,D) ___ Loading @@ -124,34 +120,20 @@ ___ sub BODY_20_39 { my ($a,$b,$c,$d,$e)=@_; &Xupdate(@_); &Xupdate(@_,"eor $t1,$b,$t1,ror#2"); $code.=<<___; eor $t1,$b,$t1,ror#2 @ F_20_39(B,C,D) add $e,$e,$t1 @ E+=F_20_39(B,C,D) ___ } sub BODY_40_59 { my ($a,$b,$c,$d,$e)=@_; if (1) { &Xupdate(@_); &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d"); $code.=<<___; and $t2,$c,$d and $t1,$b,$t1,ror#2 add $e,$e,$t2,ror#2 add $e,$e,$t1 @ E+=F_40_59(B,C,D) ___ } else { &Xupdate(@_,1); $code.=<<___; and $t1,$b,$c,ror#2 orr $t2,$b,$c,ror#2 and $t2,$t2,$d,ror#2 orr $t1,$t1,$t2 @ F_40_59(B,C,D) add $e,$e,$t1 @ E+=F_40_59(B,C,D) add $e,$e,$t2,ror#2 ___ } } $code=<<___; .text Loading