Loading crypto/aes/asm/aes-ppc.pl +31 −11 Original line number Diff line number Diff line Loading @@ -18,7 +18,7 @@ # February 2010 # # Rescheduling instructions to favour Power6 pipeline gives 10% # Rescheduling instructions to favour Power6 pipeline gave 10% # performance improvement on the platfrom in question (and marginal # improvement even on others). It should be noted that Power6 fails # to process byte in 18 cycles, only in 23, because it fails to issue Loading @@ -33,11 +33,13 @@ $flavour = shift; if ($flavour =~ /64/) { $SIZE_T =8; $LRSAVE =2*$SIZE_T; $STU ="stdu"; $POP ="ld"; $PUSH ="std"; } elsif ($flavour =~ /32/) { $SIZE_T =4; $LRSAVE =$SIZE_T; $STU ="stwu"; $POP ="lwz"; $PUSH ="stw"; Loading Loading @@ -116,15 +118,19 @@ LAES_Te: addi $Tbl0,$Tbl0,`128-8` mtlr r0 blr .space `32-24` .long 0 .byte 0,12,0x14,0,0,0,0,0 .space `64-9*4` LAES_Td: mflr r0 bcl 20,31,\$+4 mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry addi $Tbl0,$Tbl0,`128-8-32+2048+256` addi $Tbl0,$Tbl0,`128-64-8+2048+256` mtlr r0 blr .space `128-32-24` .long 0 .byte 0,12,0x14,0,0,0,0,0 .space `128-64-9*4` ___ &_data_word( 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, Loading Loading @@ -328,10 +334,9 @@ $code.=<<___; .globl .AES_encrypt .align 7 .AES_encrypt: mflr r0 $STU $sp,-$FRAME($sp) mflr r0 $PUSH r0,`$FRAME-$SIZE_T*21`($sp) $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) $PUSH r13,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) Loading @@ -352,6 +357,7 @@ $code.=<<___; $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) $PUSH r0,`$FRAME+$LRSAVE`($sp) lwz $s0,0($inp) lwz $s1,4($inp) Loading @@ -364,7 +370,7 @@ $code.=<<___; stw $s2,8($out) stw $s3,12($out) $POP r0,`$FRAME-$SIZE_T*21`($sp) $POP r0,`$FRAME+$LRSAVE`($sp) $POP $toc,`$FRAME-$SIZE_T*20`($sp) $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) Loading @@ -388,6 +394,9 @@ $code.=<<___; mtlr r0 addi $sp,$sp,$FRAME blr .long 0 .byte 0,12,4,1,0x80,18,3,0 .long 0 .align 5 Lppc_AES_encrypt: Loading Loading @@ -530,6 +539,8 @@ Lenc_loop: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .align 4 Lppc_AES_encrypt_compact: Loading Loading @@ -673,14 +684,15 @@ Lenc_compact_done: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .globl .AES_decrypt .align 7 .AES_decrypt: mflr r0 $STU $sp,-$FRAME($sp) mflr r0 $PUSH r0,`$FRAME-$SIZE_T*21`($sp) $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) $PUSH r13,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) Loading @@ -701,6 +713,7 @@ Lenc_compact_done: $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) $PUSH r0,`$FRAME+$LRSAVE`($sp) lwz $s0,0($inp) lwz $s1,4($inp) Loading @@ -713,7 +726,7 @@ Lenc_compact_done: stw $s2,8($out) stw $s3,12($out) $POP r0,`$FRAME-$SIZE_T*21`($sp) $POP r0,`$FRAME+$LRSAVE`($sp) $POP $toc,`$FRAME-$SIZE_T*20`($sp) $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) Loading @@ -737,6 +750,9 @@ Lenc_compact_done: mtlr r0 addi $sp,$sp,$FRAME blr .long 0 .byte 0,12,4,1,0x80,18,3,0 .long 0 .align 5 Lppc_AES_decrypt: Loading Loading @@ -879,6 +895,8 @@ Ldec_loop: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .align 4 Lppc_AES_decrypt_compact: Loading Loading @@ -1180,6 +1198,8 @@ Ldec_compact_done: xor $s3,$s3,$t3 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>" .align 7 ___ Loading crypto/bn/asm/ppc-mont.pl +51 −45 Original line number Diff line number Diff line Loading @@ -31,7 +31,6 @@ if ($flavour =~ /32/) { $BNSZ= $BITS/8; $SIZE_T=4; $RZONE= 224; $FRAME= $SIZE_T*16; $LD= "lwz"; # load $LDU= "lwzu"; # load and update Loading @@ -51,7 +50,6 @@ if ($flavour =~ /32/) { $BNSZ= $BITS/8; $SIZE_T=8; $RZONE= 288; $FRAME= $SIZE_T*16; # same as above, but 64-bit mnemonics... $LD= "ld"; # load Loading @@ -69,6 +67,9 @@ if ($flavour =~ /32/) { $POP= $LD; } else { die "nonsense $flavour"; } $FRAME=8*$SIZE_T+$RZONE; $LOCALS=8*$SIZE_T; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or Loading @@ -89,18 +90,18 @@ $aj="r10"; $nj="r11"; $tj="r12"; # non-volatile registers $i="r14"; $j="r15"; $tp="r16"; $m0="r17"; $m1="r18"; $lo0="r19"; $hi0="r20"; $lo1="r21"; $hi1="r22"; $alo="r23"; $ahi="r24"; $nlo="r25"; $i="r20"; $j="r21"; $tp="r22"; $m0="r23"; $m1="r24"; $lo0="r25"; $hi0="r26"; $lo1="r27"; $hi1="r28"; $alo="r29"; $ahi="r30"; $nlo="r31"; # $nhi="r0"; Loading @@ -123,32 +124,33 @@ ___ $code.=<<___; slwi $num,$num,`log($BNSZ)/log(2)` li $tj,-4096 addi $ovf,$num,`$FRAME+$RZONE` addi $ovf,$num,$FRAME subf $ovf,$ovf,$sp ; $sp-$ovf and $ovf,$ovf,$tj ; minimize TLB usage subf $ovf,$sp,$ovf ; $ovf-$sp mr $tj,$sp srwi $num,$num,`log($BNSZ)/log(2)` $STUX $sp,$sp,$ovf $PUSH r14,`4*$SIZE_T`($sp) $PUSH r15,`5*$SIZE_T`($sp) $PUSH r16,`6*$SIZE_T`($sp) $PUSH r17,`7*$SIZE_T`($sp) $PUSH r18,`8*$SIZE_T`($sp) $PUSH r19,`9*$SIZE_T`($sp) $PUSH r20,`10*$SIZE_T`($sp) $PUSH r21,`11*$SIZE_T`($sp) $PUSH r22,`12*$SIZE_T`($sp) $PUSH r23,`13*$SIZE_T`($sp) $PUSH r24,`14*$SIZE_T`($sp) $PUSH r25,`15*$SIZE_T`($sp) $PUSH r20,`-12*$SIZE_T`($tj) $PUSH r21,`-11*$SIZE_T`($tj) $PUSH r22,`-10*$SIZE_T`($tj) $PUSH r23,`-9*$SIZE_T`($tj) $PUSH r24,`-8*$SIZE_T`($tj) $PUSH r25,`-7*$SIZE_T`($tj) $PUSH r26,`-6*$SIZE_T`($tj) $PUSH r27,`-5*$SIZE_T`($tj) $PUSH r28,`-4*$SIZE_T`($tj) $PUSH r29,`-3*$SIZE_T`($tj) $PUSH r30,`-2*$SIZE_T`($tj) $PUSH r31,`-1*$SIZE_T`($tj) $LD $n0,0($n0) ; pull n0[0] value addi $num,$num,-2 ; adjust $num for counter register $LD $m0,0($bp) ; m0=bp[0] $LD $aj,0($ap) ; ap[0] addi $tp,$sp,$FRAME addi $tp,$sp,$LOCALS $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] $UMULH $hi0,$aj,$m0 Loading Loading @@ -210,8 +212,8 @@ L1st: Louter: $LDX $m0,$bp,$i ; m0=bp[i] $LD $aj,0($ap) ; ap[0] addi $tp,$sp,$FRAME $LD $tj,$FRAME($sp) ; tp[0] addi $tp,$sp,$LOCALS $LD $tj,$LOCALS($sp); tp[0] $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] $UMULH $hi0,$aj,$m0 $LD $aj,$BNSZ($ap) ; ap[1] Loading Loading @@ -278,7 +280,7 @@ Linner: addi $num,$num,2 ; restore $num subfc $j,$j,$j ; j=0 and "clear" XER[CA] addi $tp,$sp,$FRAME addi $tp,$sp,$LOCALS mtctr $num .align 4 Loading @@ -304,23 +306,27 @@ Lcopy: ; copy or in-place refresh addi $j,$j,$BNSZ bdnz- Lcopy $POP r14,`4*$SIZE_T`($sp) $POP r15,`5*$SIZE_T`($sp) $POP r16,`6*$SIZE_T`($sp) $POP r17,`7*$SIZE_T`($sp) $POP r18,`8*$SIZE_T`($sp) $POP r19,`9*$SIZE_T`($sp) $POP r20,`10*$SIZE_T`($sp) $POP r21,`11*$SIZE_T`($sp) $POP r22,`12*$SIZE_T`($sp) $POP r23,`13*$SIZE_T`($sp) $POP r24,`14*$SIZE_T`($sp) $POP r25,`15*$SIZE_T`($sp) $POP $sp,0($sp) $POP $tj,0($sp) li r3,1 $POP r20,`-12*$SIZE_T`($tj) $POP r21,`-11*$SIZE_T`($tj) $POP r22,`-10*$SIZE_T`($tj) $POP r23,`-9*$SIZE_T`($tj) $POP r24,`-8*$SIZE_T`($tj) $POP r25,`-7*$SIZE_T`($tj) $POP r26,`-6*$SIZE_T`($tj) $POP r27,`-5*$SIZE_T`($tj) $POP r28,`-4*$SIZE_T`($tj) $POP r29,`-3*$SIZE_T`($tj) $POP r30,`-2*$SIZE_T`($tj) $POP r31,`-1*$SIZE_T`($tj) mr $sp,$tj blr .long 0 .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" .byte 0,12,4,0,0x80,12,6,0 .long 0 .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>" ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; Loading crypto/bn/asm/ppc.pl +30 −13 Original line number Diff line number Diff line Loading @@ -389,7 +389,9 @@ $data=<<EOF; $ST r9,`6*$BNSZ`(r3) #r[6]=c1 $ST r10,`7*$BNSZ`(r3) #r[7]=c2 blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,2,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -814,8 +816,9 @@ $data=<<EOF; blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,2,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -966,7 +969,9 @@ $data=<<EOF; $ST r10,`6*$BNSZ`(r3) #r[6]=c1 $ST r11,`7*$BNSZ`(r3) #r[7]=c2 blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,3,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -1502,7 +1507,9 @@ $data=<<EOF; $ST r12,`14*$BNSZ`(r3) #r[14]=c3; $ST r10,`15*$BNSZ`(r3) #r[15]=c1; blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,3,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -1550,8 +1557,9 @@ Lppcasm_sub_adios: subfze r3,r0 # if carry bit is set then r3 = 0 else -1 andi. r3,r3,1 # keep only last bit. blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,4,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -1594,7 +1602,9 @@ Lppcasm_add_mainloop: Lppcasm_add_adios: addze r3,r0 #return carry bit. blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,4,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -1707,7 +1717,9 @@ Lppcasm_div8: Lppcasm_div9: or r3,r8,r0 blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,3,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -1746,8 +1758,9 @@ Lppcasm_sqr_mainloop: bdnz- Lppcasm_sqr_mainloop Lppcasm_sqr_adios: blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,3,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -1850,7 +1863,9 @@ Lppcasm_mw_REM: Lppcasm_mw_OVER: addi r3,r12,0 blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,4,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -1973,7 +1988,9 @@ Lppcasm_maw_leftover: Lppcasm_maw_adios: addi r3,r12,0 blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,4,0 .long 0 .align 4 EOF $data =~ s/\`([^\`]*)\`/eval $1/gem; Loading crypto/bn/asm/ppc64-mont.pl +78 −76 Original line number Diff line number Diff line Loading @@ -70,7 +70,6 @@ $flavour = shift; if ($flavour =~ /32/) { $SIZE_T=4; $RZONE= 224; $FRAME= $SIZE_T*12+8*12; $fname= "bn_mul_mont_fpu64"; $STUX= "stwux"; # store indexed and update Loading @@ -79,7 +78,6 @@ if ($flavour =~ /32/) { } elsif ($flavour =~ /64/) { $SIZE_T=8; $RZONE= 288; $FRAME= $SIZE_T*12+8*12; $fname= "bn_mul_mont_fpu64"; # same as above, but 64-bit mnemonics... Loading @@ -95,7 +93,7 @@ die "can't locate ppc-xlate.pl"; open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; $FRAME=($FRAME+63)&~63; $FRAME=64; # padded frame header $TRANSFER=16*8; $carry="r0"; Loading @@ -112,16 +110,16 @@ $tp="r10"; $j="r11"; $i="r12"; # non-volatile registers $nap_d="r14"; # interleaved ap and np in double format $a0="r15"; # ap[0] $t0="r16"; # temporary registers $t1="r17"; $t2="r18"; $t3="r19"; $t4="r20"; $t5="r21"; $t6="r22"; $t7="r23"; $nap_d="r22"; # interleaved ap and np in double format $a0="r23"; # ap[0] $t0="r24"; # temporary registers $t1="r25"; $t2="r26"; $t3="r27"; $t4="r28"; $t5="r29"; $t6="r30"; $t7="r31"; # PPC offers enough register bank capacity to unroll inner loops twice # Loading Loading @@ -151,28 +149,17 @@ $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; $dota="f8"; $dotb="f9"; $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; $N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; $T0a="f18"; $T0b="f19"; $T1a="f20"; $T1b="f21"; $T2a="f22"; $T2b="f23"; $T3a="f24"; $T3b="f25"; $N0="f20"; $N1="f21"; $N2="f22"; $N3="f23"; $T0a="f24"; $T0b="f25"; $T1a="f26"; $T1b="f27"; $T2a="f28"; $T2b="f29"; $T3a="f30"; $T3b="f31"; # sp----------->+-------------------------------+ # | saved sp | # +-------------------------------+ # | | # +-------------------------------+ # | 10 saved gpr, r14-r23 | # . . # . . # +12*size_t +-------------------------------+ # | 12 saved fpr, f14-f25 | # . . # . . # +12*8 +-------------------------------+ # | padding to 64 byte boundary | # . . # +X +-------------------------------+ # +64 +-------------------------------+ # | 16 gpr<->fpr transfer zone | # . . # . . Loading @@ -192,6 +179,16 @@ $T3a="f24"; $T3b="f25"; # . . # . . # +-------------------------------+ # . . # -12*size_t +-------------------------------+ # | 10 saved gpr, r22-r31 | # . . # . . # -12*8 +-------------------------------+ # | 12 saved fpr, f20-f31 | # . . # . . # +-------------------------------+ $code=<<___; .machine "any" Loading @@ -215,30 +212,31 @@ $code=<<___; subf $tp,$tp,$sp ; $sp-$tp and $tp,$tp,$i ; minimize TLB usage subf $tp,$sp,$tp ; $tp-$sp mr $i,$sp $STUX $sp,$sp,$tp ; alloca $PUSH r14,`2*$SIZE_T`($sp) $PUSH r15,`3*$SIZE_T`($sp) $PUSH r16,`4*$SIZE_T`($sp) $PUSH r17,`5*$SIZE_T`($sp) $PUSH r18,`6*$SIZE_T`($sp) $PUSH r19,`7*$SIZE_T`($sp) $PUSH r20,`8*$SIZE_T`($sp) $PUSH r21,`9*$SIZE_T`($sp) $PUSH r22,`10*$SIZE_T`($sp) $PUSH r23,`11*$SIZE_T`($sp) stfd f14,`12*$SIZE_T+0`($sp) stfd f15,`12*$SIZE_T+8`($sp) stfd f16,`12*$SIZE_T+16`($sp) stfd f17,`12*$SIZE_T+24`($sp) stfd f18,`12*$SIZE_T+32`($sp) stfd f19,`12*$SIZE_T+40`($sp) stfd f20,`12*$SIZE_T+48`($sp) stfd f21,`12*$SIZE_T+56`($sp) stfd f22,`12*$SIZE_T+64`($sp) stfd f23,`12*$SIZE_T+72`($sp) stfd f24,`12*$SIZE_T+80`($sp) stfd f25,`12*$SIZE_T+88`($sp) $PUSH r22,`-12*8-10*$SIZE_T`($i) $PUSH r23,`-12*8-9*$SIZE_T`($i) $PUSH r24,`-12*8-8*$SIZE_T`($i) $PUSH r25,`-12*8-7*$SIZE_T`($i) $PUSH r26,`-12*8-6*$SIZE_T`($i) $PUSH r27,`-12*8-5*$SIZE_T`($i) $PUSH r28,`-12*8-4*$SIZE_T`($i) $PUSH r29,`-12*8-3*$SIZE_T`($i) $PUSH r30,`-12*8-2*$SIZE_T`($i) $PUSH r31,`-12*8-1*$SIZE_T`($i) stfd f20,`-12*8`($i) stfd f21,`-11*8`($i) stfd f22,`-10*8`($i) stfd f23,`-9*8`($i) stfd f24,`-8*8`($i) stfd f25,`-7*8`($i) stfd f26,`-6*8`($i) stfd f27,`-5*8`($i) stfd f28,`-4*8`($i) stfd f29,`-3*8`($i) stfd f30,`-2*8`($i) stfd f31,`-1*8`($i) ___ $code.=<<___ if ($SIZE_T==8); ld $a0,0($ap) ; pull ap[0] value Loading Loading @@ -1052,33 +1050,37 @@ Lcopy: ; copy or in-place refresh ___ $code.=<<___; $POP r14,`2*$SIZE_T`($sp) $POP r15,`3*$SIZE_T`($sp) $POP r16,`4*$SIZE_T`($sp) $POP r17,`5*$SIZE_T`($sp) $POP r18,`6*$SIZE_T`($sp) $POP r19,`7*$SIZE_T`($sp) $POP r20,`8*$SIZE_T`($sp) $POP r21,`9*$SIZE_T`($sp) $POP r22,`10*$SIZE_T`($sp) $POP r23,`11*$SIZE_T`($sp) lfd f14,`12*$SIZE_T+0`($sp) lfd f15,`12*$SIZE_T+8`($sp) lfd f16,`12*$SIZE_T+16`($sp) lfd f17,`12*$SIZE_T+24`($sp) lfd f18,`12*$SIZE_T+32`($sp) lfd f19,`12*$SIZE_T+40`($sp) lfd f20,`12*$SIZE_T+48`($sp) lfd f21,`12*$SIZE_T+56`($sp) lfd f22,`12*$SIZE_T+64`($sp) lfd f23,`12*$SIZE_T+72`($sp) lfd f24,`12*$SIZE_T+80`($sp) lfd f25,`12*$SIZE_T+88`($sp) $POP $sp,0($sp) $POP $i,0($sp) li r3,1 ; signal "handled" $POP r22,`-12*8-10*$SIZE_T`($i) $POP r23,`-12*8-9*$SIZE_T`($i) $POP r24,`-12*8-8*$SIZE_T`($i) $POP r25,`-12*8-7*$SIZE_T`($i) $POP r26,`-12*8-6*$SIZE_T`($i) $POP r27,`-12*8-5*$SIZE_T`($i) $POP r28,`-12*8-4*$SIZE_T`($i) $POP r29,`-12*8-3*$SIZE_T`($i) $POP r30,`-12*8-2*$SIZE_T`($i) $POP r31,`-12*8-1*$SIZE_T`($i) lfd f20,`-12*8`($i) lfd f21,`-11*8`($i) lfd f22,`-10*8`($i) lfd f23,`-9*8`($i) lfd f24,`-8*8`($i) lfd f25,`-7*8`($i) lfd f26,`-6*8`($i) lfd f27,`-5*8`($i) lfd f28,`-4*8`($i) lfd f29,`-3*8`($i) lfd f30,`-2*8`($i) lfd f31,`-1*8`($i) mr $sp,$i blr .long 0 .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" .byte 0,12,4,0,0x8c,10,6,0 .long 0 .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>" ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; Loading crypto/ppccpuid.pl +20 −0 Original line number Diff line number Diff line Loading @@ -29,12 +29,16 @@ $code=<<___; fcfid f1,f1 extrdi r0,r0,32,0 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_altivec_probe .align 4 .OPENSSL_altivec_probe: .long 0x10000484 # vor v0,v0,v0 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_wipe_cpu .align 4 Loading Loading @@ -65,6 +69,8 @@ $code=<<___; fmr f12,f31 fmr f13,f31 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_atomic_add .align 4 Loading @@ -75,6 +81,9 @@ Ladd: lwarx r5,0,r3 bne- Ladd $SIGNX r3,r0 blr .long 0 .byte 0,12,0x14,0,0,0,2,0 .long 0 .globl .OPENSSL_rdtsc .align 4 Loading @@ -82,6 +91,8 @@ Ladd: lwarx r5,0,r3 mftb r3 mftbu r4 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_cleanse .align 4 Loading Loading @@ -111,6 +122,9 @@ Laligned: andi. r4,r4,3 bne Little blr .long 0 .byte 0,12,0x14,0,0,0,2,0 .long 0 ___ { my ($out,$cnt,$max)=("r3","r4","r5"); Loading Loading @@ -145,6 +159,9 @@ Loop: mftb $tick mr r3,$cnt blr .long 0 .byte 0,12,0x14,0,0,0,2,0 .long 0 .globl .OPENSSL_instrument_bus2 .align 4 Loading Loading @@ -193,6 +210,9 @@ Ldone2: srwi $cnt,$cnt,2 sub r3,r0,$cnt blr .long 0 .byte 0,12,0x14,0,0,0,3,0 .long 0 ___ } Loading Loading
crypto/aes/asm/aes-ppc.pl +31 −11 Original line number Diff line number Diff line Loading @@ -18,7 +18,7 @@ # February 2010 # # Rescheduling instructions to favour Power6 pipeline gives 10% # Rescheduling instructions to favour Power6 pipeline gave 10% # performance improvement on the platfrom in question (and marginal # improvement even on others). It should be noted that Power6 fails # to process byte in 18 cycles, only in 23, because it fails to issue Loading @@ -33,11 +33,13 @@ $flavour = shift; if ($flavour =~ /64/) { $SIZE_T =8; $LRSAVE =2*$SIZE_T; $STU ="stdu"; $POP ="ld"; $PUSH ="std"; } elsif ($flavour =~ /32/) { $SIZE_T =4; $LRSAVE =$SIZE_T; $STU ="stwu"; $POP ="lwz"; $PUSH ="stw"; Loading Loading @@ -116,15 +118,19 @@ LAES_Te: addi $Tbl0,$Tbl0,`128-8` mtlr r0 blr .space `32-24` .long 0 .byte 0,12,0x14,0,0,0,0,0 .space `64-9*4` LAES_Td: mflr r0 bcl 20,31,\$+4 mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry addi $Tbl0,$Tbl0,`128-8-32+2048+256` addi $Tbl0,$Tbl0,`128-64-8+2048+256` mtlr r0 blr .space `128-32-24` .long 0 .byte 0,12,0x14,0,0,0,0,0 .space `128-64-9*4` ___ &_data_word( 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, Loading Loading @@ -328,10 +334,9 @@ $code.=<<___; .globl .AES_encrypt .align 7 .AES_encrypt: mflr r0 $STU $sp,-$FRAME($sp) mflr r0 $PUSH r0,`$FRAME-$SIZE_T*21`($sp) $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) $PUSH r13,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) Loading @@ -352,6 +357,7 @@ $code.=<<___; $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) $PUSH r0,`$FRAME+$LRSAVE`($sp) lwz $s0,0($inp) lwz $s1,4($inp) Loading @@ -364,7 +370,7 @@ $code.=<<___; stw $s2,8($out) stw $s3,12($out) $POP r0,`$FRAME-$SIZE_T*21`($sp) $POP r0,`$FRAME+$LRSAVE`($sp) $POP $toc,`$FRAME-$SIZE_T*20`($sp) $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) Loading @@ -388,6 +394,9 @@ $code.=<<___; mtlr r0 addi $sp,$sp,$FRAME blr .long 0 .byte 0,12,4,1,0x80,18,3,0 .long 0 .align 5 Lppc_AES_encrypt: Loading Loading @@ -530,6 +539,8 @@ Lenc_loop: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .align 4 Lppc_AES_encrypt_compact: Loading Loading @@ -673,14 +684,15 @@ Lenc_compact_done: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .globl .AES_decrypt .align 7 .AES_decrypt: mflr r0 $STU $sp,-$FRAME($sp) mflr r0 $PUSH r0,`$FRAME-$SIZE_T*21`($sp) $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) $PUSH r13,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) Loading @@ -701,6 +713,7 @@ Lenc_compact_done: $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) $PUSH r0,`$FRAME+$LRSAVE`($sp) lwz $s0,0($inp) lwz $s1,4($inp) Loading @@ -713,7 +726,7 @@ Lenc_compact_done: stw $s2,8($out) stw $s3,12($out) $POP r0,`$FRAME-$SIZE_T*21`($sp) $POP r0,`$FRAME+$LRSAVE`($sp) $POP $toc,`$FRAME-$SIZE_T*20`($sp) $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) Loading @@ -737,6 +750,9 @@ Lenc_compact_done: mtlr r0 addi $sp,$sp,$FRAME blr .long 0 .byte 0,12,4,1,0x80,18,3,0 .long 0 .align 5 Lppc_AES_decrypt: Loading Loading @@ -879,6 +895,8 @@ Ldec_loop: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .align 4 Lppc_AES_decrypt_compact: Loading Loading @@ -1180,6 +1198,8 @@ Ldec_compact_done: xor $s3,$s3,$t3 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>" .align 7 ___ Loading
crypto/bn/asm/ppc-mont.pl +51 −45 Original line number Diff line number Diff line Loading @@ -31,7 +31,6 @@ if ($flavour =~ /32/) { $BNSZ= $BITS/8; $SIZE_T=4; $RZONE= 224; $FRAME= $SIZE_T*16; $LD= "lwz"; # load $LDU= "lwzu"; # load and update Loading @@ -51,7 +50,6 @@ if ($flavour =~ /32/) { $BNSZ= $BITS/8; $SIZE_T=8; $RZONE= 288; $FRAME= $SIZE_T*16; # same as above, but 64-bit mnemonics... $LD= "ld"; # load Loading @@ -69,6 +67,9 @@ if ($flavour =~ /32/) { $POP= $LD; } else { die "nonsense $flavour"; } $FRAME=8*$SIZE_T+$RZONE; $LOCALS=8*$SIZE_T; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or Loading @@ -89,18 +90,18 @@ $aj="r10"; $nj="r11"; $tj="r12"; # non-volatile registers $i="r14"; $j="r15"; $tp="r16"; $m0="r17"; $m1="r18"; $lo0="r19"; $hi0="r20"; $lo1="r21"; $hi1="r22"; $alo="r23"; $ahi="r24"; $nlo="r25"; $i="r20"; $j="r21"; $tp="r22"; $m0="r23"; $m1="r24"; $lo0="r25"; $hi0="r26"; $lo1="r27"; $hi1="r28"; $alo="r29"; $ahi="r30"; $nlo="r31"; # $nhi="r0"; Loading @@ -123,32 +124,33 @@ ___ $code.=<<___; slwi $num,$num,`log($BNSZ)/log(2)` li $tj,-4096 addi $ovf,$num,`$FRAME+$RZONE` addi $ovf,$num,$FRAME subf $ovf,$ovf,$sp ; $sp-$ovf and $ovf,$ovf,$tj ; minimize TLB usage subf $ovf,$sp,$ovf ; $ovf-$sp mr $tj,$sp srwi $num,$num,`log($BNSZ)/log(2)` $STUX $sp,$sp,$ovf $PUSH r14,`4*$SIZE_T`($sp) $PUSH r15,`5*$SIZE_T`($sp) $PUSH r16,`6*$SIZE_T`($sp) $PUSH r17,`7*$SIZE_T`($sp) $PUSH r18,`8*$SIZE_T`($sp) $PUSH r19,`9*$SIZE_T`($sp) $PUSH r20,`10*$SIZE_T`($sp) $PUSH r21,`11*$SIZE_T`($sp) $PUSH r22,`12*$SIZE_T`($sp) $PUSH r23,`13*$SIZE_T`($sp) $PUSH r24,`14*$SIZE_T`($sp) $PUSH r25,`15*$SIZE_T`($sp) $PUSH r20,`-12*$SIZE_T`($tj) $PUSH r21,`-11*$SIZE_T`($tj) $PUSH r22,`-10*$SIZE_T`($tj) $PUSH r23,`-9*$SIZE_T`($tj) $PUSH r24,`-8*$SIZE_T`($tj) $PUSH r25,`-7*$SIZE_T`($tj) $PUSH r26,`-6*$SIZE_T`($tj) $PUSH r27,`-5*$SIZE_T`($tj) $PUSH r28,`-4*$SIZE_T`($tj) $PUSH r29,`-3*$SIZE_T`($tj) $PUSH r30,`-2*$SIZE_T`($tj) $PUSH r31,`-1*$SIZE_T`($tj) $LD $n0,0($n0) ; pull n0[0] value addi $num,$num,-2 ; adjust $num for counter register $LD $m0,0($bp) ; m0=bp[0] $LD $aj,0($ap) ; ap[0] addi $tp,$sp,$FRAME addi $tp,$sp,$LOCALS $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] $UMULH $hi0,$aj,$m0 Loading Loading @@ -210,8 +212,8 @@ L1st: Louter: $LDX $m0,$bp,$i ; m0=bp[i] $LD $aj,0($ap) ; ap[0] addi $tp,$sp,$FRAME $LD $tj,$FRAME($sp) ; tp[0] addi $tp,$sp,$LOCALS $LD $tj,$LOCALS($sp); tp[0] $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] $UMULH $hi0,$aj,$m0 $LD $aj,$BNSZ($ap) ; ap[1] Loading Loading @@ -278,7 +280,7 @@ Linner: addi $num,$num,2 ; restore $num subfc $j,$j,$j ; j=0 and "clear" XER[CA] addi $tp,$sp,$FRAME addi $tp,$sp,$LOCALS mtctr $num .align 4 Loading @@ -304,23 +306,27 @@ Lcopy: ; copy or in-place refresh addi $j,$j,$BNSZ bdnz- Lcopy $POP r14,`4*$SIZE_T`($sp) $POP r15,`5*$SIZE_T`($sp) $POP r16,`6*$SIZE_T`($sp) $POP r17,`7*$SIZE_T`($sp) $POP r18,`8*$SIZE_T`($sp) $POP r19,`9*$SIZE_T`($sp) $POP r20,`10*$SIZE_T`($sp) $POP r21,`11*$SIZE_T`($sp) $POP r22,`12*$SIZE_T`($sp) $POP r23,`13*$SIZE_T`($sp) $POP r24,`14*$SIZE_T`($sp) $POP r25,`15*$SIZE_T`($sp) $POP $sp,0($sp) $POP $tj,0($sp) li r3,1 $POP r20,`-12*$SIZE_T`($tj) $POP r21,`-11*$SIZE_T`($tj) $POP r22,`-10*$SIZE_T`($tj) $POP r23,`-9*$SIZE_T`($tj) $POP r24,`-8*$SIZE_T`($tj) $POP r25,`-7*$SIZE_T`($tj) $POP r26,`-6*$SIZE_T`($tj) $POP r27,`-5*$SIZE_T`($tj) $POP r28,`-4*$SIZE_T`($tj) $POP r29,`-3*$SIZE_T`($tj) $POP r30,`-2*$SIZE_T`($tj) $POP r31,`-1*$SIZE_T`($tj) mr $sp,$tj blr .long 0 .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" .byte 0,12,4,0,0x80,12,6,0 .long 0 .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>" ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; Loading
crypto/bn/asm/ppc.pl +30 −13 Original line number Diff line number Diff line Loading @@ -389,7 +389,9 @@ $data=<<EOF; $ST r9,`6*$BNSZ`(r3) #r[6]=c1 $ST r10,`7*$BNSZ`(r3) #r[7]=c2 blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,2,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -814,8 +816,9 @@ $data=<<EOF; blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,2,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -966,7 +969,9 @@ $data=<<EOF; $ST r10,`6*$BNSZ`(r3) #r[6]=c1 $ST r11,`7*$BNSZ`(r3) #r[7]=c2 blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,3,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -1502,7 +1507,9 @@ $data=<<EOF; $ST r12,`14*$BNSZ`(r3) #r[14]=c3; $ST r10,`15*$BNSZ`(r3) #r[15]=c1; blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,3,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -1550,8 +1557,9 @@ Lppcasm_sub_adios: subfze r3,r0 # if carry bit is set then r3 = 0 else -1 andi. r3,r3,1 # keep only last bit. blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,4,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -1594,7 +1602,9 @@ Lppcasm_add_mainloop: Lppcasm_add_adios: addze r3,r0 #return carry bit. blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,4,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -1707,7 +1717,9 @@ Lppcasm_div8: Lppcasm_div9: or r3,r8,r0 blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,3,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -1746,8 +1758,9 @@ Lppcasm_sqr_mainloop: bdnz- Lppcasm_sqr_mainloop Lppcasm_sqr_adios: blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,3,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -1850,7 +1863,9 @@ Lppcasm_mw_REM: Lppcasm_mw_OVER: addi r3,r12,0 blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,4,0 .long 0 # # NOTE: The following label name should be changed to Loading Loading @@ -1973,7 +1988,9 @@ Lppcasm_maw_leftover: Lppcasm_maw_adios: addi r3,r12,0 blr .long 0x00000000 .long 0 .byte 0,12,0x14,0,0,0,4,0 .long 0 .align 4 EOF $data =~ s/\`([^\`]*)\`/eval $1/gem; Loading
crypto/bn/asm/ppc64-mont.pl +78 −76 Original line number Diff line number Diff line Loading @@ -70,7 +70,6 @@ $flavour = shift; if ($flavour =~ /32/) { $SIZE_T=4; $RZONE= 224; $FRAME= $SIZE_T*12+8*12; $fname= "bn_mul_mont_fpu64"; $STUX= "stwux"; # store indexed and update Loading @@ -79,7 +78,6 @@ if ($flavour =~ /32/) { } elsif ($flavour =~ /64/) { $SIZE_T=8; $RZONE= 288; $FRAME= $SIZE_T*12+8*12; $fname= "bn_mul_mont_fpu64"; # same as above, but 64-bit mnemonics... Loading @@ -95,7 +93,7 @@ die "can't locate ppc-xlate.pl"; open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; $FRAME=($FRAME+63)&~63; $FRAME=64; # padded frame header $TRANSFER=16*8; $carry="r0"; Loading @@ -112,16 +110,16 @@ $tp="r10"; $j="r11"; $i="r12"; # non-volatile registers $nap_d="r14"; # interleaved ap and np in double format $a0="r15"; # ap[0] $t0="r16"; # temporary registers $t1="r17"; $t2="r18"; $t3="r19"; $t4="r20"; $t5="r21"; $t6="r22"; $t7="r23"; $nap_d="r22"; # interleaved ap and np in double format $a0="r23"; # ap[0] $t0="r24"; # temporary registers $t1="r25"; $t2="r26"; $t3="r27"; $t4="r28"; $t5="r29"; $t6="r30"; $t7="r31"; # PPC offers enough register bank capacity to unroll inner loops twice # Loading Loading @@ -151,28 +149,17 @@ $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; $na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; $dota="f8"; $dotb="f9"; $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; $N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; $T0a="f18"; $T0b="f19"; $T1a="f20"; $T1b="f21"; $T2a="f22"; $T2b="f23"; $T3a="f24"; $T3b="f25"; $N0="f20"; $N1="f21"; $N2="f22"; $N3="f23"; $T0a="f24"; $T0b="f25"; $T1a="f26"; $T1b="f27"; $T2a="f28"; $T2b="f29"; $T3a="f30"; $T3b="f31"; # sp----------->+-------------------------------+ # | saved sp | # +-------------------------------+ # | | # +-------------------------------+ # | 10 saved gpr, r14-r23 | # . . # . . # +12*size_t +-------------------------------+ # | 12 saved fpr, f14-f25 | # . . # . . # +12*8 +-------------------------------+ # | padding to 64 byte boundary | # . . # +X +-------------------------------+ # +64 +-------------------------------+ # | 16 gpr<->fpr transfer zone | # . . # . . Loading @@ -192,6 +179,16 @@ $T3a="f24"; $T3b="f25"; # . . # . . # +-------------------------------+ # . . # -12*size_t +-------------------------------+ # | 10 saved gpr, r22-r31 | # . . # . . # -12*8 +-------------------------------+ # | 12 saved fpr, f20-f31 | # . . # . . # +-------------------------------+ $code=<<___; .machine "any" Loading @@ -215,30 +212,31 @@ $code=<<___; subf $tp,$tp,$sp ; $sp-$tp and $tp,$tp,$i ; minimize TLB usage subf $tp,$sp,$tp ; $tp-$sp mr $i,$sp $STUX $sp,$sp,$tp ; alloca $PUSH r14,`2*$SIZE_T`($sp) $PUSH r15,`3*$SIZE_T`($sp) $PUSH r16,`4*$SIZE_T`($sp) $PUSH r17,`5*$SIZE_T`($sp) $PUSH r18,`6*$SIZE_T`($sp) $PUSH r19,`7*$SIZE_T`($sp) $PUSH r20,`8*$SIZE_T`($sp) $PUSH r21,`9*$SIZE_T`($sp) $PUSH r22,`10*$SIZE_T`($sp) $PUSH r23,`11*$SIZE_T`($sp) stfd f14,`12*$SIZE_T+0`($sp) stfd f15,`12*$SIZE_T+8`($sp) stfd f16,`12*$SIZE_T+16`($sp) stfd f17,`12*$SIZE_T+24`($sp) stfd f18,`12*$SIZE_T+32`($sp) stfd f19,`12*$SIZE_T+40`($sp) stfd f20,`12*$SIZE_T+48`($sp) stfd f21,`12*$SIZE_T+56`($sp) stfd f22,`12*$SIZE_T+64`($sp) stfd f23,`12*$SIZE_T+72`($sp) stfd f24,`12*$SIZE_T+80`($sp) stfd f25,`12*$SIZE_T+88`($sp) $PUSH r22,`-12*8-10*$SIZE_T`($i) $PUSH r23,`-12*8-9*$SIZE_T`($i) $PUSH r24,`-12*8-8*$SIZE_T`($i) $PUSH r25,`-12*8-7*$SIZE_T`($i) $PUSH r26,`-12*8-6*$SIZE_T`($i) $PUSH r27,`-12*8-5*$SIZE_T`($i) $PUSH r28,`-12*8-4*$SIZE_T`($i) $PUSH r29,`-12*8-3*$SIZE_T`($i) $PUSH r30,`-12*8-2*$SIZE_T`($i) $PUSH r31,`-12*8-1*$SIZE_T`($i) stfd f20,`-12*8`($i) stfd f21,`-11*8`($i) stfd f22,`-10*8`($i) stfd f23,`-9*8`($i) stfd f24,`-8*8`($i) stfd f25,`-7*8`($i) stfd f26,`-6*8`($i) stfd f27,`-5*8`($i) stfd f28,`-4*8`($i) stfd f29,`-3*8`($i) stfd f30,`-2*8`($i) stfd f31,`-1*8`($i) ___ $code.=<<___ if ($SIZE_T==8); ld $a0,0($ap) ; pull ap[0] value Loading Loading @@ -1052,33 +1050,37 @@ Lcopy: ; copy or in-place refresh ___ $code.=<<___; $POP r14,`2*$SIZE_T`($sp) $POP r15,`3*$SIZE_T`($sp) $POP r16,`4*$SIZE_T`($sp) $POP r17,`5*$SIZE_T`($sp) $POP r18,`6*$SIZE_T`($sp) $POP r19,`7*$SIZE_T`($sp) $POP r20,`8*$SIZE_T`($sp) $POP r21,`9*$SIZE_T`($sp) $POP r22,`10*$SIZE_T`($sp) $POP r23,`11*$SIZE_T`($sp) lfd f14,`12*$SIZE_T+0`($sp) lfd f15,`12*$SIZE_T+8`($sp) lfd f16,`12*$SIZE_T+16`($sp) lfd f17,`12*$SIZE_T+24`($sp) lfd f18,`12*$SIZE_T+32`($sp) lfd f19,`12*$SIZE_T+40`($sp) lfd f20,`12*$SIZE_T+48`($sp) lfd f21,`12*$SIZE_T+56`($sp) lfd f22,`12*$SIZE_T+64`($sp) lfd f23,`12*$SIZE_T+72`($sp) lfd f24,`12*$SIZE_T+80`($sp) lfd f25,`12*$SIZE_T+88`($sp) $POP $sp,0($sp) $POP $i,0($sp) li r3,1 ; signal "handled" $POP r22,`-12*8-10*$SIZE_T`($i) $POP r23,`-12*8-9*$SIZE_T`($i) $POP r24,`-12*8-8*$SIZE_T`($i) $POP r25,`-12*8-7*$SIZE_T`($i) $POP r26,`-12*8-6*$SIZE_T`($i) $POP r27,`-12*8-5*$SIZE_T`($i) $POP r28,`-12*8-4*$SIZE_T`($i) $POP r29,`-12*8-3*$SIZE_T`($i) $POP r30,`-12*8-2*$SIZE_T`($i) $POP r31,`-12*8-1*$SIZE_T`($i) lfd f20,`-12*8`($i) lfd f21,`-11*8`($i) lfd f22,`-10*8`($i) lfd f23,`-9*8`($i) lfd f24,`-8*8`($i) lfd f25,`-7*8`($i) lfd f26,`-6*8`($i) lfd f27,`-5*8`($i) lfd f28,`-4*8`($i) lfd f29,`-3*8`($i) lfd f30,`-2*8`($i) lfd f31,`-1*8`($i) mr $sp,$i blr .long 0 .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" .byte 0,12,4,0,0x8c,10,6,0 .long 0 .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>" ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; Loading
crypto/ppccpuid.pl +20 −0 Original line number Diff line number Diff line Loading @@ -29,12 +29,16 @@ $code=<<___; fcfid f1,f1 extrdi r0,r0,32,0 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_altivec_probe .align 4 .OPENSSL_altivec_probe: .long 0x10000484 # vor v0,v0,v0 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_wipe_cpu .align 4 Loading Loading @@ -65,6 +69,8 @@ $code=<<___; fmr f12,f31 fmr f13,f31 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_atomic_add .align 4 Loading @@ -75,6 +81,9 @@ Ladd: lwarx r5,0,r3 bne- Ladd $SIGNX r3,r0 blr .long 0 .byte 0,12,0x14,0,0,0,2,0 .long 0 .globl .OPENSSL_rdtsc .align 4 Loading @@ -82,6 +91,8 @@ Ladd: lwarx r5,0,r3 mftb r3 mftbu r4 blr .long 0 .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_cleanse .align 4 Loading Loading @@ -111,6 +122,9 @@ Laligned: andi. r4,r4,3 bne Little blr .long 0 .byte 0,12,0x14,0,0,0,2,0 .long 0 ___ { my ($out,$cnt,$max)=("r3","r4","r5"); Loading Loading @@ -145,6 +159,9 @@ Loop: mftb $tick mr r3,$cnt blr .long 0 .byte 0,12,0x14,0,0,0,2,0 .long 0 .globl .OPENSSL_instrument_bus2 .align 4 Loading Loading @@ -193,6 +210,9 @@ Ldone2: srwi $cnt,$cnt,2 sub r3,r0,$cnt blr .long 0 .byte 0,12,0x14,0,0,0,3,0 .long 0 ___ } Loading