Configure  +1 −1

@@ -139,8 +139,8 @@ my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes
 my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::void";
 my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32";
 my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64";
-my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::";
 my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::::";
+my $ppc32_asm=$ppc64_asm;
 my $no_asm=":::::::::::::::void";

 # As for $BSDthreads. Idea is to maintain "collective" set of flags,
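The net effect of the Configure hunk: the 32-bit PPC target now aliases the 64-bit object list, so PPC32 builds pick up sha512-ppc.o (previously absent from $ppc32_asm). A minimal sketch of why the alias is enough, assuming the colon-separated fields map to per-subsystem object lists (the split convention and the field index are illustrative assumptions about Configure's internals, not taken from this commit):

    # Hypothetical illustration only: decompose a Configure-style asm string.
    my $ppc64_asm = "ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::"
                  . "aes_core.o aes_cbc.o aes-ppc.o:::"
                  . "sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::::";
    my $ppc32_asm = $ppc64_asm;    # the change: alias instead of a shorter copy

    my @fields = split(/:/, $ppc32_asm, -1);   # -1 keeps trailing empty fields
    print $fields[6], "\n";  # assumed SHA field: "sha1-ppc.o sha256-ppc.o sha512-ppc.o"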
crypto/sha/asm/sha512-ppc.pl  +290 −2

@@ -1,7 +1,7 @@
 #!/usr/bin/env perl

 # ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
@@ -91,6 +91,10 @@ if ($output =~ /512/) {
 $FRAME=32*$SIZE_T+16*$SZ;
 $LOCALS=6*$SIZE_T;
+if ($SZ==8 && $SIZE_T==4) {
+	$FRAME+=16*$SZ;
+	$XOFF=$LOCALS+16*$SZ;
+}

 $sp ="r1";
 $toc="r2";
@@ -118,7 +122,7 @@ $H ="r15";
 @X=("r16","r17","r18","r19","r20","r21","r22","r23",
     "r24","r25","r26","r27","r28","r29","r30","r31");

-$inp="r31";	# reassigned $inp! aliases with @X[15]
+$inp="r31" if($SZ==4 || $SIZE_T==8);	# reassigned $inp! aliases with @X[15]

 sub ROUND_00_15 {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
@@ -212,7 +216,10 @@ $func:
 	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
 	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
 	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
+___
+if ($SZ==4 || $SIZE_T==8) {
+$code.=<<___;
 	$LD	$A,`0*$SZ`($ctx)
 	mr	$inp,r4		; incarnate $inp
 	$LD	$B,`1*$SZ`($ctx)
@@ -222,7 +229,16 @@ $func:
 	$LD	$F,`5*$SZ`($ctx)
 	$LD	$G,`6*$SZ`($ctx)
 	$LD	$H,`7*$SZ`($ctx)
+___
+} else {
+for ($i=16;$i<32;$i++) {
+$code.=<<___;
+	lwz	r$i,`4*($i-16)`($ctx)
+___
+}
+}
+$code.=<<___;
 	bl	LPICmeup
 LPICedup:
 	andi.	r0,$inp,3
@@ -258,6 +274,9 @@ Lunaligned:
 Lcross_page:
 	li	$t1,`16*$SZ/4`
 	mtctr	$t1
+___
+if ($SZ==4 || $SIZE_T==8) {
+$code.=<<___;
 	addi	r20,$sp,$LOCALS		; aligned spot below the frame
 Lmemcpy:
 	lbz	r16,0($inp)
@@ -271,7 +290,26 @@ Lmemcpy:
 	stb	r19,3(r20)
 	addi	r20,r20,4
 	bdnz	Lmemcpy
+___
+} else {
+$code.=<<___;
+	addi	r12,$sp,$LOCALS		; aligned spot below the frame
+Lmemcpy:
+	lbz	r8,0($inp)
+	lbz	r9,1($inp)
+	lbz	r10,2($inp)
+	lbz	r11,3($inp)
+	addi	$inp,$inp,4
+	stb	r8,0(r12)
+	stb	r9,1(r12)
+	stb	r10,2(r12)
+	stb	r11,3(r12)
+	addi	r12,r12,4
+	bdnz	Lmemcpy
+___
+}

+$code.=<<___;
 	$PUSH	$inp,`$FRAME-$SIZE_T*26`($sp)	; save real inp
 	addi	$t1,$sp,`$LOCALS+16*$SZ`	; fictitious end pointer
 	addi	$inp,$sp,$LOCALS	; fictitious inp pointer
@@ -310,7 +348,10 @@ Ldone:
 	.long	0
 	.byte	0,12,4,1,0x80,18,3,0
 	.long	0
+___
+if ($SZ==4 || $SIZE_T==8) {
+$code.=<<___;
 .align	4
 Lsha2_block_private:
 	$LD	$t1,0($Tbl)
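Before the big code hunk below, it is worth plugging the 32-bit parameters into the frame arithmetic from the first hunk. The values here are computed from the assignments shown above ($FRAME=32*$SIZE_T+16*$SZ grown by 16*$SZ, $LOCALS=6*$SIZE_T, $XOFF=$LOCALS+16*$SZ); the layout interpretation is a hedged reading of the "aligned spot below the frame" and "save x[i]" comments, not something stated by the commit:

    # Concrete frame geometry for the PPC32 SHA-512 path (SZ=8, SIZE_T=4):
    my ($SIZE_T, $SZ) = (4, 8);
    my $LOCALS = 6*$SIZE_T;            # 24:  linkage/local scratch
    my $XOFF   = $LOCALS + 16*$SZ;     # 152: spill area for X[0..15]
    my $FRAME  = 32*$SIZE_T + 16*$SZ   # 256: original frame size...
               + 16*$SZ;               # 384: ...grown to hold the X spill
    # sp+24..151  : 128-byte copy buffer used by Lmemcpy for unaligned input
    # sp+152..279 : off-loaded X vector (the stw/lwz at $XOFF offsets)
    # sp+280..383 : saved inp/num/ctx and r14-r31 (the $FRAME-$SIZE_T*k slots)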
@@ -380,6 +421,253 @@ $code.=<<___;
 	.long	0
 	.byte	0,12,0x14,0,0,0,0,0
 ___
+} else {
+########################################################################
+# SHA512 for PPC32, X vector is off-loaded to stack...
+#
+#			| sha512
+#			| -m32
+# ----------------------+-----------------------
+# PPC74x0,gcc-4.0.1	| +48%
+# POWER6,gcc-4.4.6	| +124%(*)
+# POWER7,gcc-4.4.6	| +79%(*)
+# e300,gcc-4.1.0	| +167%
+#
+# (*)	~1/3 of -m64 result [and ~20% better than -m32 code generated
+#	by xlc-12.1]
+
+my @V=map("r$_",(16..31));	# A..H
+
+my ($s0,$s1,$t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("r$_",(0,5,6,8..12,14,15));
+my ($x0,$x1)=("r3","r4");	# zaps $ctx and $inp
+
+sub ROUND_00_15_ppc32 {
+my ($i,	$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
+	$ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_;
+
+$code.=<<___;
+	lwz	$t2,`$SZ*($i%16)+4`($Tbl)
+	xor	$a0,$flo,$glo
+	lwz	$t3,`$SZ*($i%16)+0`($Tbl)
+	xor	$a1,$fhi,$ghi
+	addc	$hlo,$hlo,$t0			; h+=x[i]
+	stw	$t0,`$XOFF+0+$SZ*($i%16)`($sp)	; save x[i]
+	srwi	$s0,$elo,$Sigma1[0]
+	srwi	$s1,$ehi,$Sigma1[0]
+	and	$a0,$a0,$elo
+	adde	$hhi,$hhi,$t1
+	and	$a1,$a1,$ehi
+	stw	$t1,`$XOFF+4+$SZ*($i%16)`($sp)
+	srwi	$t0,$elo,$Sigma1[1]
+	srwi	$t1,$ehi,$Sigma1[1]
+	addc	$hlo,$hlo,$t2			; h+=K512[i]
+	insrwi	$s0,$ehi,$Sigma1[0],0
+	insrwi	$s1,$elo,$Sigma1[0],0
+	xor	$a0,$a0,$glo			; Ch(e,f,g)
+	adde	$hhi,$hhi,$t3
+	xor	$a1,$a1,$ghi
+	insrwi	$t0,$ehi,$Sigma1[1],0
+	insrwi	$t1,$elo,$Sigma1[1],0
+	addc	$hlo,$hlo,$a0			; h+=Ch(e,f,g)
+	srwi	$t2,$ehi,$Sigma1[2]-32
+	srwi	$t3,$elo,$Sigma1[2]-32
+	xor	$s0,$s0,$t0
+	xor	$s1,$s1,$t1
+	insrwi	$t2,$elo,$Sigma1[2]-32,0
+	insrwi	$t3,$ehi,$Sigma1[2]-32,0
+	xor	$a0,$alo,$blo			; a^b, b^c in next round
+	adde	$hhi,$hhi,$a1
+	xor	$a1,$ahi,$bhi
+	xor	$s0,$s0,$t2			; Sigma1(e)
+	xor	$s1,$s1,$t3
+	srwi	$t0,$alo,$Sigma0[0]
+	and	$a2,$a2,$a0
+	addc	$hlo,$hlo,$s0			; h+=Sigma1(e)
+	and	$a3,$a3,$a1
+	srwi	$t1,$ahi,$Sigma0[0]
+	srwi	$s0,$ahi,$Sigma0[1]-32
+	adde	$hhi,$hhi,$s1
+	srwi	$s1,$alo,$Sigma0[1]-32
+	insrwi	$t0,$ahi,$Sigma0[0],0
+	insrwi	$t1,$alo,$Sigma0[0],0
+	xor	$a2,$a2,$blo			; Maj(a,b,c)
+	addc	$dlo,$dlo,$hlo			; d+=h
+	xor	$a3,$a3,$bhi
+	insrwi	$s0,$alo,$Sigma0[1]-32,0
+	insrwi	$s1,$ahi,$Sigma0[1]-32,0
+	adde	$dhi,$dhi,$hhi
+	srwi	$t2,$ahi,$Sigma0[2]-32
+	srwi	$t3,$alo,$Sigma0[2]-32
+	xor	$s0,$s0,$t0
+	addc	$hlo,$hlo,$a2			; h+=Maj(a,b,c)
+	xor	$s1,$s1,$t1
+	insrwi	$t2,$alo,$Sigma0[2]-32,0
+	insrwi	$t3,$ahi,$Sigma0[2]-32,0
+	adde	$hhi,$hhi,$a3
+___
+$code.=<<___ if ($i>=15);
+	lwz	$t0,`$XOFF+0+$SZ*(($i+2)%16)`($sp)
+	lwz	$t1,`$XOFF+4+$SZ*(($i+2)%16)`($sp)
+___
+$code.=<<___ if ($i<15);
+	lwz	$t1,`$SZ*($i+1)+0`($inp)
+	lwz	$t0,`$SZ*($i+1)+4`($inp)
+___
+$code.=<<___;
+	xor	$s0,$s0,$t2			; Sigma0(a)
+	xor	$s1,$s1,$t3
+	addc	$hlo,$hlo,$s0			; h+=Sigma0(a)
+	adde	$hhi,$hhi,$s1
+___
+$code.=<<___ if ($i==15);
+	lwz	$x0,`$XOFF+0+$SZ*(($i+1)%16)`($sp)
+	lwz	$x1,`$XOFF+4+$SZ*(($i+1)%16)`($sp)
+___
+}
+
+sub ROUND_16_xx_ppc32 {
+my ($i,	$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
+	$ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_;
+
+$code.=<<___;
+	srwi	$s0,$t0,$sigma0[0]
+	srwi	$s1,$t1,$sigma0[0]
+	srwi	$t2,$t0,$sigma0[1]
+	srwi	$t3,$t1,$sigma0[1]
+	insrwi	$s0,$t1,$sigma0[0],0
+	insrwi	$s1,$t0,$sigma0[0],0
+	srwi	$a0,$t0,$sigma0[2]
+	insrwi	$t2,$t1,$sigma0[1],0
+	insrwi	$t3,$t0,$sigma0[1],0
+	insrwi	$a0,$t1,$sigma0[2],0
+	xor	$s0,$s0,$t2
+	lwz	$t2,`$XOFF+0+$SZ*(($i+14)%16)`($sp)
+	srwi	$a1,$t1,$sigma0[2]
+	xor	$s1,$s1,$t3
+	lwz	$t3,`$XOFF+4+$SZ*(($i+14)%16)`($sp)
+	xor	$a0,$a0,$s0
+	srwi	$s0,$t2,$sigma1[0]
+	xor	$a1,$a1,$s1
+	srwi	$s1,$t3,$sigma1[0]
+	addc	$x0,$x0,$a0			; x[i]+=sigma0(x[i+1])
+	srwi	$a0,$t3,$sigma1[1]-32
+	insrwi	$s0,$t3,$sigma1[0],0
+	insrwi	$s1,$t2,$sigma1[0],0
+	adde	$x1,$x1,$a1
+	srwi	$a1,$t2,$sigma1[1]-32
+	insrwi	$a0,$t2,$sigma1[1]-32,0
+	srwi	$t2,$t2,$sigma1[2]
+	insrwi	$a1,$t3,$sigma1[1]-32,0
+	insrwi	$t2,$t3,$sigma1[2],0
+	xor	$s0,$s0,$a0
+	lwz	$a0,`$XOFF+0+$SZ*(($i+9)%16)`($sp)
+	srwi	$t3,$t3,$sigma1[2]
+	xor	$s1,$s1,$a1
+	lwz	$a1,`$XOFF+4+$SZ*(($i+9)%16)`($sp)
+	xor	$s0,$s0,$t2
+	addc	$x0,$x0,$a0			; x[i]+=x[i+9]
+	xor	$s1,$s1,$t3
+	adde	$x1,$x1,$a1
+	addc	$x0,$x0,$s0			; x[i]+=sigma1(x[i+14])
+	adde	$x1,$x1,$s1
+___
+($t0,$t1,$x0,$x1) = ($x0,$x1,$t0,$t1);
+&ROUND_00_15_ppc32(@_);
+}
+
+$code.=<<___;
+.align	4
+Lsha2_block_private:
+	lwz	$t1,0($inp)
+	xor	$a2,@V[3],@V[5]		; B^C, magic seed
+	lwz	$t0,4($inp)
+	xor	$a3,@V[2],@V[4]
+___
+for($i=0;$i<16;$i++) {
+	&ROUND_00_15_ppc32($i,@V);
+	unshift(@V,pop(@V));	unshift(@V,pop(@V));
+	($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1);
+}
+$code.=<<___;
+	li	$a0,`$rounds/16-1`
+	mtctr	$a0
+.align	4
+Lrounds:
+	addi	$Tbl,$Tbl,`16*$SZ`
+___
+for(;$i<32;$i++) {
+	&ROUND_16_xx_ppc32($i,@V);
+	unshift(@V,pop(@V));	unshift(@V,pop(@V));
+	($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1);
+}
+$code.=<<___;
+	bdnz-	Lrounds
+
+	$POP	$ctx,`$FRAME-$SIZE_T*22`($sp)
+	$POP	$inp,`$FRAME-$SIZE_T*23`($sp)	; inp pointer
+	$POP	$num,`$FRAME-$SIZE_T*24`($sp)	; end pointer
+	subi	$Tbl,$Tbl,`($rounds-16)*$SZ`	; rewind Tbl
+
+	lwz	$t0,0($ctx)
+	lwz	$t1,4($ctx)
+	lwz	$t2,8($ctx)
+	lwz	$t3,12($ctx)
+	lwz	$a0,16($ctx)
+	lwz	$a1,20($ctx)
+	lwz	$a2,24($ctx)
+	addc	@V[1],@V[1],$t1
+	lwz	$a3,28($ctx)
+	adde	@V[0],@V[0],$t0
+	lwz	$t0,32($ctx)
+	addc	@V[3],@V[3],$t3
+	lwz	$t1,36($ctx)
+	adde	@V[2],@V[2],$t2
+	lwz	$t2,40($ctx)
+	addc	@V[5],@V[5],$a1
+	lwz	$t3,44($ctx)
+	adde	@V[4],@V[4],$a0
+	lwz	$a0,48($ctx)
+	addc	@V[7],@V[7],$a3
+	lwz	$a1,52($ctx)
+	adde	@V[6],@V[6],$a2
+	lwz	$a2,56($ctx)
+	addc	@V[9],@V[9],$t1
+	lwz	$a3,60($ctx)
+	adde	@V[8],@V[8],$t0
+	stw	@V[0],0($ctx)
+	stw	@V[1],4($ctx)
+	addc	@V[11],@V[11],$t3
+	stw	@V[2],8($ctx)
+	stw	@V[3],12($ctx)
+	adde	@V[10],@V[10],$t2
+	stw	@V[4],16($ctx)
+	stw	@V[5],20($ctx)
+	addc	@V[13],@V[13],$a1
+	stw	@V[6],24($ctx)
+	stw	@V[7],28($ctx)
+	adde	@V[12],@V[12],$a0
+	stw	@V[8],32($ctx)
+	stw	@V[9],36($ctx)
+	addc	@V[15],@V[15],$a3
+	stw	@V[10],40($ctx)
+	stw	@V[11],44($ctx)
+	adde	@V[14],@V[14],$a2
+	stw	@V[12],48($ctx)
+	stw	@V[13],52($ctx)
+	stw	@V[14],56($ctx)
+	stw	@V[15],60($ctx)
+
+	addi	$inp,$inp,`16*$SZ`		; advance inp
+	$PUSH	$inp,`$FRAME-$SIZE_T*23`($sp)
+	$UCMP	$inp,$num
+	bne	Lsha2_block_private
+	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+___
+}

 # Ugly hack here, because PPC assembler syntax seem to vary too
 # much from platforms to platform...
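The round subs above lean on two idioms worth spelling out: every 64-bit quantity lives in a hi/lo pair of 32-bit registers, a 64-bit rotation becomes paired srwi (shift right) plus insrwi (insert the other half's low bits at the top), and a 64-bit addition becomes an addc/adde carry chain. A standalone Perl model of that arithmetic (helper names are mine, not the module's; assumes a Perl built with 64-bit integers):

    # rotr64: rotate a (hi,lo) pair right by 0 < $r < 32. Each srwi/insrwi
    # pair in ROUND_00_15_ppc32 computes one of these two output halves.
    sub rotr64 {
        my ($hi, $lo, $r) = @_;
        ( (($hi >> $r) | ($lo << (32-$r))) & 0xffffffff,
          (($lo >> $r) | ($hi << (32-$r))) & 0xffffffff );
    }

    # add64: the addc/adde chain -- add low halves, carry into high halves.
    sub add64 {
        my ($ahi, $alo, $bhi, $blo) = @_;
        my $lo = $alo + $blo;
        ( ($ahi + $bhi + ($lo > 0xffffffff ? 1 : 0)) & 0xffffffff,
          $lo & 0xffffffff );
    }

    # Sigma1(e) = ROTR^14 ^ ROTR^18 ^ ROTR^41. A rotation by 41 >= 32 is a
    # rotation by 41-32 with the halves pre-swapped, which is why the code
    # above uses $Sigma1[2]-32 shift counts with crossed hi/lo operands.
    sub Sigma1_64 {
        my ($hi, $lo) = @_;
        my ($h14, $l14) = rotr64($hi, $lo, 14);
        my ($h18, $l18) = rotr64($hi, $lo, 18);
        my ($h41, $l41) = rotr64($lo, $hi, 41-32);   # swapped halves
        ( $h14 ^ $h18 ^ $h41, $l14 ^ $l18 ^ $l41 );
    }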
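One more generation-time detail from the loop drivers above: the SHA state is never rotated by moving data between registers; the Perl driver permutes the register names instead. Two pop/unshift steps rotate @V by one hi/lo pair per round, so the pair that held h is renamed to a for the next round (matching the SHA-2 schedule, where the new a lands in h's slot). A tiny demonstration using the same idiom:

    my @V = map("r$_", (16..31));   # (Ahi,Alo,Bhi,Blo,...,Hhi,Hlo)
    unshift(@V, pop(@V));           # move Hlo to the front...
    unshift(@V, pop(@V));           # ...then Hhi, keeping the pair intact
    print "@V[0,1]\n";              # "r30 r31": old h pair is next round's a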