Loading crypto/aes/asm/vpaes-ppc.pl +136 −62 Original line number Diff line number Diff line Loading @@ -337,24 +337,27 @@ Lenc_entry: addi $inp, $inp, 15 # 15 is not a typo ?lvsr $outperm, 0, $out ?lvsl $keyperm, 0, $key # prepare for unaligned access vnor $outmask, v7, v7 # 0xff..ff lvx $inptail, 0, $inp # redundant in aligned case ?vperm $outmask, v7, $outmask, $outperm lvx $outhead, 0, $out ?vperm v0, v0, $inptail, $inpperm bl _vpaes_encrypt_core andi. r8, $out, 15 li r9, 16 beq Lenc_out_aligned vperm v0, v0, v0, $outperm # rotate right/left vsel v1, $outhead, v0, $outmask vmr $outhead, v0 stvx v1, 0, $out addi $out, $out, 15 # 15 is not a typo ######## mtctr r9 Lenc_out_unaligned: stvebx v0, 0, $out addi $out, $out, 1 bdnz Lenc_out_unaligned b Lenc_done lvx v1, 0, $out # redundant in aligned case vsel v1, $outhead, v1, $outmask stvx v1, 0, $out .align 4 Lenc_out_aligned: stvx v0, 0, $out Lenc_done: li r10,`15+6*$SIZE_T` li r11,`31+6*$SIZE_T` Loading Loading @@ -566,24 +569,27 @@ Ldec_entry: addi $inp, $inp, 15 # 15 is not a typo ?lvsr $outperm, 0, $out ?lvsl $keyperm, 0, $key vnor $outmask, v7, v7 # 0xff..ff lvx $inptail, 0, $inp # redundant in aligned case ?vperm $outmask, v7, $outmask, $outperm lvx $outhead, 0, $out ?vperm v0, v0, $inptail, $inpperm bl _vpaes_decrypt_core andi. r8, $out, 15 li r9, 16 beq Ldec_out_aligned vperm v0, v0, v0, $outperm # rotate right/left vsel v1, $outhead, v0, $outmask vmr $outhead, v0 stvx v1, 0, $out addi $out, $out, 15 # 15 is not a typo ######## mtctr r9 Ldec_out_unaligned: stvebx v0, 0, $out addi $out, $out, 1 bdnz Ldec_out_unaligned b Ldec_done lvx v1, 0, $out # redundant in aligned case vsel v1, $outhead, v1, $outmask stvx v1, 0, $out .align 4 Ldec_out_aligned: stvx v0, 0, $out Ldec_done: li r10,`15+6*$SIZE_T` li r11,`31+6*$SIZE_T` Loading Loading @@ -658,11 +664,11 @@ Ldec_entry: $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp) and r30, r5, r9 # copy length&-16 andi. r9, $out, 15 # is $out aligned? mr r5, r6 # copy pointer to key mr r31, r7 # copy pointer to iv blt Lcbc_abort cmpwi r8, 0 # test direction li r6, -1 mcrf cr1, cr0 # put aside $out alignment flag mr r7, r12 # copy vrsave mtspr 256, r6 # preserve all AltiVec registers Loading @@ -672,6 +678,7 @@ Ldec_entry: lvx v25, r9, r31 ?vperm v24, v24, v25, $inpperm cmpwi r8, 0 # test direction neg r8, $inp # prepare for unaligned access vxor v7, v7, v7 ?lvsl $keyperm, 0, $key Loading @@ -681,13 +688,37 @@ Ldec_entry: lvx $inptail, 0, $inp ?vperm $outmask, v7, $outmask, $outperm addi $inp, $inp, 15 # 15 is not a typo lvx $outhead, 0, $out beq Lcbc_decrypt bl _vpaes_encrypt_preheat li r0, 16 beq cr1, Lcbc_enc_loop # $out is aligned vmr v0, $inptail lvx $inptail, 0, $inp addi $inp, $inp, 16 ?vperm v0, v0, $inptail, $inpperm vxor v0, v0, v24 # ^= iv bl _vpaes_encrypt_core andi. r8, $out, 15 vmr v24, v0 # put aside iv sub r9, $out, r8 vperm $outhead, v0, v0, $outperm # rotate right/left Lcbc_enc_head: stvebx $outhead, r8, r9 cmpwi r8, 15 addi r8, r8, 1 bne Lcbc_enc_head sub. r30, r30, r0 # len -= 16 addi $out, $out, 16 beq Lcbc_unaligned_done Lcbc_enc_loop: vmr v0, $inptail lvx $inptail, 0, $inp Loading @@ -713,6 +744,32 @@ Lcbc_decrypt: bl _vpaes_decrypt_preheat li r0, 16 beq cr1, Lcbc_dec_loop # $out is aligned vmr v0, $inptail lvx $inptail, 0, $inp addi $inp, $inp, 16 ?vperm v0, v0, $inptail, $inpperm vmr v25, v0 # put aside input bl _vpaes_decrypt_core andi. r8, $out, 15 vxor v0, v0, v24 # ^= iv vmr v24, v25 sub r9, $out, r8 vperm $outhead, v0, v0, $outperm # rotate right/left Lcbc_dec_head: stvebx $outhead, r8, r9 cmpwi r8, 15 addi r8, r8, 1 bne Lcbc_dec_head sub. r30, r30, r0 # len -= 16 addi $out, $out, 16 beq Lcbc_unaligned_done Lcbc_dec_loop: vmr v0, $inptail lvx $inptail, 0, $inp Loading @@ -733,23 +790,29 @@ Lcbc_dec_loop: bne Lcbc_dec_loop Lcbc_done: addi $out, $out, -1 lvx v1, 0, $out # redundant in aligned case vsel v1, $outhead, v1, $outmask stvx v1, 0, $out beq cr1, Lcbc_write_iv # $out is aligned Lcbc_unaligned_done: andi. r8, $out, 15 sub $out, $out, r8 li r9, 0 Lcbc_tail: stvebx $outhead, r9, $out addi r9, r9, 1 cmpw r9, r8 bne Lcbc_tail Lcbc_write_iv: neg r8, r31 # write [potentially unaligned] iv li r10, 4 ?lvsl $outperm, 0, r8 li r6, 15 vnor $outmask, v7, v7 # 0xff..ff ?vperm $outmask, v7, $outmask, $outperm lvx $outhead, 0, r31 li r11, 8 li r12, 12 vperm v24, v24, v24, $outperm # rotate right/left vsel v0, $outhead, v24, $outmask lvx v1, r6, r31 stvx v0, 0, r31 vsel v1, v24, v1, $outmask stvx v1, r6, r31 stvewx v24, 0, r31 # ivp is at least 32-bit aligned stvewx v24, r10, r31 stvewx v24, r11, r31 stvewx v24, r12, r31 mtspr 256, r7 # restore vrsave li r10,`15+6*$SIZE_T` Loading Loading @@ -872,18 +935,21 @@ _vpaes_schedule_core: # encrypting, output zeroth round key after transform li r8, 0x30 # mov \$0x30,%r8d addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 li r9, 4 li r10, 8 li r11, 12 ?lvsr $outperm, 0, $out # prepare for unaligned access vnor $outmask, v9, v9 # 0xff..ff lvx $outhead, 0, $out ?vperm $outmask, v9, $outmask, $outperm #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx) vperm v1, v0, v0, $outperm # rotate right/left vsel v2, $outhead, v1, $outmask vmr $outhead, v1 stvx v2, 0, $out vperm $outhead, v0, v0, $outperm # rotate right/left stvewx $outhead, 0, $out # some are superfluous stvewx $outhead, r9, $out stvewx $outhead, r10, $out addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 stvewx $outhead, r11, $out b Lschedule_go Lschedule_am_decrypting: Loading @@ -893,20 +959,24 @@ Lschedule_am_decrypting: addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 # decrypting, output zeroth round key after shiftrows lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 li r9, 4 li r10, 8 li r11, 12 vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3 neg r0, $out # prepare for unaligned access ?lvsl $outperm, 0, r0 addi $out, $out, 15 # 15 is not typo vnor $outmask, v9, v9 # 0xff..ff lvx $outhead, 0, $out ?vperm $outmask, $outmask, v9, $outperm #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx) vperm v4, v4, v4, $outperm # rotate right/left vsel v2, $outhead, v4, $outmask vmr $outhead, v4 stvx v2, 0, $out vperm $outhead, v4, v4, $outperm # rotate right/left stvewx $outhead, 0, $out # some are superfluous stvewx $outhead, r9, $out stvewx $outhead, r10, $out addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 stvewx $outhead, r11, $out addi $out, $out, 15 # 15 is not typo xori r8, r8, 0x30 # xor \$0x30, %r8 Lschedule_go: Loading Loading @@ -1038,14 +1108,15 @@ Lschedule_mangle_last: #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key vperm v0, v0, v0, $outperm # rotate right/left li r10, 4 vsel v2, $outhead, v0, $outmask vmr $outhead, v0 li r11, 8 stvx v2, 0, $out addi $out, $out, 15 # 15 is not typo lvx v1, 0, $out # redundant in aligned case vsel v1, $outhead, v1, $outmask stvx v1, 0, $out li r12, 12 stvewx v0, 0, $out # some (or all) are redundant stvewx v0, r10, $out stvewx v0, r11, $out stvewx v0, r12, $out b Lschedule_mangle_done .align 4 Loading @@ -1057,15 +1128,18 @@ Lschedule_mangle_last_dec: bl _vpaes_schedule_transform # output transform #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key addi r9, $out, -15 # -15 is not typo vperm v0, v0, v0, $outperm # rotate right/left li r10, 4 vsel v2, $outhead, v0, $outmask vmr $outhead, v0 li r11, 8 stvx v2, 0, $out li r12, 12 stvewx v0, 0, r9 # some (or all) are redundant stvewx v0, r10, r9 stvewx v0, r11, r9 stvewx v0, r12, r9 addi $out, $out, -15 # -15 is not typo lvx v1, 0, $out # redundant in aligned case vsel v1, $outhead, v1, $outmask stvx v1, 0, $out Lschedule_mangle_done: mtlr r7 Loading Loading
crypto/aes/asm/vpaes-ppc.pl +136 −62 Original line number Diff line number Diff line Loading @@ -337,24 +337,27 @@ Lenc_entry: addi $inp, $inp, 15 # 15 is not a typo ?lvsr $outperm, 0, $out ?lvsl $keyperm, 0, $key # prepare for unaligned access vnor $outmask, v7, v7 # 0xff..ff lvx $inptail, 0, $inp # redundant in aligned case ?vperm $outmask, v7, $outmask, $outperm lvx $outhead, 0, $out ?vperm v0, v0, $inptail, $inpperm bl _vpaes_encrypt_core andi. r8, $out, 15 li r9, 16 beq Lenc_out_aligned vperm v0, v0, v0, $outperm # rotate right/left vsel v1, $outhead, v0, $outmask vmr $outhead, v0 stvx v1, 0, $out addi $out, $out, 15 # 15 is not a typo ######## mtctr r9 Lenc_out_unaligned: stvebx v0, 0, $out addi $out, $out, 1 bdnz Lenc_out_unaligned b Lenc_done lvx v1, 0, $out # redundant in aligned case vsel v1, $outhead, v1, $outmask stvx v1, 0, $out .align 4 Lenc_out_aligned: stvx v0, 0, $out Lenc_done: li r10,`15+6*$SIZE_T` li r11,`31+6*$SIZE_T` Loading Loading @@ -566,24 +569,27 @@ Ldec_entry: addi $inp, $inp, 15 # 15 is not a typo ?lvsr $outperm, 0, $out ?lvsl $keyperm, 0, $key vnor $outmask, v7, v7 # 0xff..ff lvx $inptail, 0, $inp # redundant in aligned case ?vperm $outmask, v7, $outmask, $outperm lvx $outhead, 0, $out ?vperm v0, v0, $inptail, $inpperm bl _vpaes_decrypt_core andi. r8, $out, 15 li r9, 16 beq Ldec_out_aligned vperm v0, v0, v0, $outperm # rotate right/left vsel v1, $outhead, v0, $outmask vmr $outhead, v0 stvx v1, 0, $out addi $out, $out, 15 # 15 is not a typo ######## mtctr r9 Ldec_out_unaligned: stvebx v0, 0, $out addi $out, $out, 1 bdnz Ldec_out_unaligned b Ldec_done lvx v1, 0, $out # redundant in aligned case vsel v1, $outhead, v1, $outmask stvx v1, 0, $out .align 4 Ldec_out_aligned: stvx v0, 0, $out Ldec_done: li r10,`15+6*$SIZE_T` li r11,`31+6*$SIZE_T` Loading Loading @@ -658,11 +664,11 @@ Ldec_entry: $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp) and r30, r5, r9 # copy length&-16 andi. r9, $out, 15 # is $out aligned? mr r5, r6 # copy pointer to key mr r31, r7 # copy pointer to iv blt Lcbc_abort cmpwi r8, 0 # test direction li r6, -1 mcrf cr1, cr0 # put aside $out alignment flag mr r7, r12 # copy vrsave mtspr 256, r6 # preserve all AltiVec registers Loading @@ -672,6 +678,7 @@ Ldec_entry: lvx v25, r9, r31 ?vperm v24, v24, v25, $inpperm cmpwi r8, 0 # test direction neg r8, $inp # prepare for unaligned access vxor v7, v7, v7 ?lvsl $keyperm, 0, $key Loading @@ -681,13 +688,37 @@ Ldec_entry: lvx $inptail, 0, $inp ?vperm $outmask, v7, $outmask, $outperm addi $inp, $inp, 15 # 15 is not a typo lvx $outhead, 0, $out beq Lcbc_decrypt bl _vpaes_encrypt_preheat li r0, 16 beq cr1, Lcbc_enc_loop # $out is aligned vmr v0, $inptail lvx $inptail, 0, $inp addi $inp, $inp, 16 ?vperm v0, v0, $inptail, $inpperm vxor v0, v0, v24 # ^= iv bl _vpaes_encrypt_core andi. r8, $out, 15 vmr v24, v0 # put aside iv sub r9, $out, r8 vperm $outhead, v0, v0, $outperm # rotate right/left Lcbc_enc_head: stvebx $outhead, r8, r9 cmpwi r8, 15 addi r8, r8, 1 bne Lcbc_enc_head sub. r30, r30, r0 # len -= 16 addi $out, $out, 16 beq Lcbc_unaligned_done Lcbc_enc_loop: vmr v0, $inptail lvx $inptail, 0, $inp Loading @@ -713,6 +744,32 @@ Lcbc_decrypt: bl _vpaes_decrypt_preheat li r0, 16 beq cr1, Lcbc_dec_loop # $out is aligned vmr v0, $inptail lvx $inptail, 0, $inp addi $inp, $inp, 16 ?vperm v0, v0, $inptail, $inpperm vmr v25, v0 # put aside input bl _vpaes_decrypt_core andi. r8, $out, 15 vxor v0, v0, v24 # ^= iv vmr v24, v25 sub r9, $out, r8 vperm $outhead, v0, v0, $outperm # rotate right/left Lcbc_dec_head: stvebx $outhead, r8, r9 cmpwi r8, 15 addi r8, r8, 1 bne Lcbc_dec_head sub. r30, r30, r0 # len -= 16 addi $out, $out, 16 beq Lcbc_unaligned_done Lcbc_dec_loop: vmr v0, $inptail lvx $inptail, 0, $inp Loading @@ -733,23 +790,29 @@ Lcbc_dec_loop: bne Lcbc_dec_loop Lcbc_done: addi $out, $out, -1 lvx v1, 0, $out # redundant in aligned case vsel v1, $outhead, v1, $outmask stvx v1, 0, $out beq cr1, Lcbc_write_iv # $out is aligned Lcbc_unaligned_done: andi. r8, $out, 15 sub $out, $out, r8 li r9, 0 Lcbc_tail: stvebx $outhead, r9, $out addi r9, r9, 1 cmpw r9, r8 bne Lcbc_tail Lcbc_write_iv: neg r8, r31 # write [potentially unaligned] iv li r10, 4 ?lvsl $outperm, 0, r8 li r6, 15 vnor $outmask, v7, v7 # 0xff..ff ?vperm $outmask, v7, $outmask, $outperm lvx $outhead, 0, r31 li r11, 8 li r12, 12 vperm v24, v24, v24, $outperm # rotate right/left vsel v0, $outhead, v24, $outmask lvx v1, r6, r31 stvx v0, 0, r31 vsel v1, v24, v1, $outmask stvx v1, r6, r31 stvewx v24, 0, r31 # ivp is at least 32-bit aligned stvewx v24, r10, r31 stvewx v24, r11, r31 stvewx v24, r12, r31 mtspr 256, r7 # restore vrsave li r10,`15+6*$SIZE_T` Loading Loading @@ -872,18 +935,21 @@ _vpaes_schedule_core: # encrypting, output zeroth round key after transform li r8, 0x30 # mov \$0x30,%r8d addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 li r9, 4 li r10, 8 li r11, 12 ?lvsr $outperm, 0, $out # prepare for unaligned access vnor $outmask, v9, v9 # 0xff..ff lvx $outhead, 0, $out ?vperm $outmask, v9, $outmask, $outperm #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx) vperm v1, v0, v0, $outperm # rotate right/left vsel v2, $outhead, v1, $outmask vmr $outhead, v1 stvx v2, 0, $out vperm $outhead, v0, v0, $outperm # rotate right/left stvewx $outhead, 0, $out # some are superfluous stvewx $outhead, r9, $out stvewx $outhead, r10, $out addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 stvewx $outhead, r11, $out b Lschedule_go Lschedule_am_decrypting: Loading @@ -893,20 +959,24 @@ Lschedule_am_decrypting: addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 # decrypting, output zeroth round key after shiftrows lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 li r9, 4 li r10, 8 li r11, 12 vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3 neg r0, $out # prepare for unaligned access ?lvsl $outperm, 0, r0 addi $out, $out, 15 # 15 is not typo vnor $outmask, v9, v9 # 0xff..ff lvx $outhead, 0, $out ?vperm $outmask, $outmask, v9, $outperm #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx) vperm v4, v4, v4, $outperm # rotate right/left vsel v2, $outhead, v4, $outmask vmr $outhead, v4 stvx v2, 0, $out vperm $outhead, v4, v4, $outperm # rotate right/left stvewx $outhead, 0, $out # some are superfluous stvewx $outhead, r9, $out stvewx $outhead, r10, $out addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 stvewx $outhead, r11, $out addi $out, $out, 15 # 15 is not typo xori r8, r8, 0x30 # xor \$0x30, %r8 Lschedule_go: Loading Loading @@ -1038,14 +1108,15 @@ Lschedule_mangle_last: #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key vperm v0, v0, v0, $outperm # rotate right/left li r10, 4 vsel v2, $outhead, v0, $outmask vmr $outhead, v0 li r11, 8 stvx v2, 0, $out addi $out, $out, 15 # 15 is not typo lvx v1, 0, $out # redundant in aligned case vsel v1, $outhead, v1, $outmask stvx v1, 0, $out li r12, 12 stvewx v0, 0, $out # some (or all) are redundant stvewx v0, r10, $out stvewx v0, r11, $out stvewx v0, r12, $out b Lschedule_mangle_done .align 4 Loading @@ -1057,15 +1128,18 @@ Lschedule_mangle_last_dec: bl _vpaes_schedule_transform # output transform #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key addi r9, $out, -15 # -15 is not typo vperm v0, v0, v0, $outperm # rotate right/left li r10, 4 vsel v2, $outhead, v0, $outmask vmr $outhead, v0 li r11, 8 stvx v2, 0, $out li r12, 12 stvewx v0, 0, r9 # some (or all) are redundant stvewx v0, r10, r9 stvewx v0, r11, r9 stvewx v0, r12, r9 addi $out, $out, -15 # -15 is not typo lvx v1, 0, $out # redundant in aligned case vsel v1, $outhead, v1, $outmask stvx v1, 0, $out Lschedule_mangle_done: mtlr r7 Loading