Loading crypto/modes/asm/ghash-x86.pl +9 −7 Original line number Diff line number Diff line Loading @@ -1021,13 +1021,14 @@ my ($Xhi,$Xi) = @_; &pshufd ($T1,$Xn,0b01001110); # H*Ii+1 &movdqa ($Xhn,$Xn); &pxor ($T1,$Xn); # &lea ($inp,&DWP(32,$inp)); # i+=2 &pclmulqdq ($Xn,$Hkey,0x00); ####### &pclmulqdq ($Xhn,$Hkey,0x11); ####### &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 &pclmulqdq ($T1,$T3,0x00); ####### &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 &nop (); &lea ($inp,&DWP(32,$inp)); # i+=2 &sub ($len,0x20); &jbe (&label("even_tail")); &jmp (&label("mod_loop")); Loading @@ -1036,22 +1037,23 @@ my ($Xhi,$Xi) = @_; &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) &movdqa ($Xhi,$Xi); &pxor ($T2,$Xi); # &nop (); &pclmulqdq ($Xi,$Hkey,0x00); ####### &pclmulqdq ($Xhi,$Hkey,0x11); ####### &movups ($Hkey,&QWP(0,$Htbl)); # load H &pclmulqdq ($T2,$T3,0x10); ####### &movdqa ($T3,&QWP(0,$const)); &movups ($Hkey,&QWP(0,$Htbl)); # load H &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) &movdqa ($T3,&QWP(0,$const)); &xorps ($Xhi,$Xhn); &movdqu ($Xhn,&QWP(0,$inp)); # Ii &pxor ($T1,$Xi); # aggregated Karatsuba post-processing &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 &pxor ($T1,$Xhi); # &pxor ($T2,$T1); # &pshufb ($Xhn,$T3); &pxor ($T2,$T1); # &movdqa ($T1,$T2); # &psrldq ($T2,8); Loading @@ -1068,8 +1070,8 @@ my ($Xhi,$Xi) = @_; &pxor ($T1,$Xi); # &psllq ($Xi,1); &pxor ($Xi,$T1); # &movups ($T3,&QWP(32,$Htbl)); &pclmulqdq ($Xn,$Hkey,0x00); ####### &movups ($T3,&QWP(32,$Htbl)); &psllq ($Xi,57); # &movdqa ($T1,$Xi); # &pslldq ($Xi,8); Loading @@ -1080,9 +1082,9 @@ my ($Xhi,$Xi) = @_; &movdqa ($T2,$Xi); # 2nd phase &psrlq ($Xi,1); &pxor ($T1,$Xhn); &pxor ($Xhi,$T2); # &pclmulqdq ($Xhn,$Hkey,0x11); ####### &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 &pxor ($Xhi,$T2); # &pxor ($T2,$Xi); &psrlq ($Xi,5); &pxor ($Xi,$T2); # Loading crypto/modes/asm/ghash-x86_64.pl +37 −25 Original line number Diff line number Diff line Loading @@ -214,6 +214,7 @@ ___ $code=<<___; .text .extern OPENSSL_ia32cap_P .globl gcm_gmult_4bit .type gcm_gmult_4bit,\@function,2 Loading Loading @@ -597,7 +598,8 @@ ___ } { my ($Xip,$Htbl,$inp,$len)=@_4args; my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10)); my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7)); my ($T1,$T2,$T3)=map("%xmm$_",(8..10)); $code.=<<___; .globl gcm_ghash_clmul Loading @@ -624,7 +626,6 @@ $code.=<<___ if ($win64); ___ $code.=<<___; movdqa .Lbswap_mask(%rip),$T3 mov \$0xA040608020C0E000,%rax # ((7..0)0xE0)&0xff movdqu ($Xip),$Xi movdqu ($Htbl),$Hkey Loading @@ -640,10 +641,16 @@ if ($do4xaggr) { my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); $code.=<<___; mov OPENSSL_ia32cap_P+4(%rip),%eax cmp \$0x30,$len jb .Lskip4x and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE cmp \$`1<<22`,%eax # check for MOVBE without XSAVE je .Lskip4x sub \$0x30,$len mov \$0xA040608020C0E000,%rax # ((7..0)0xE0)&0xff movdqu 0x30($Htbl),$Hkey3 movdqu 0x40($Htbl),$Hkey4 Loading Loading @@ -819,51 +826,54 @@ $code.=<<___; pxor $T1,$Xi # Ii+Xi movdqa $Xln,$Xhn pshufd \$0b01001110,$Xln,$T1 pxor $Xln,$T1 pshufd \$0b01001110,$Xln,$Xmn pxor $Xln,$Xmn pclmulqdq \$0x00,$Hkey,$Xln pclmulqdq \$0x11,$Hkey,$Xhn pclmulqdq \$0x00,$HK,$T1 pclmulqdq \$0x00,$HK,$Xmn lea 32($inp),$inp # i+=2 nop sub \$0x20,$len jbe .Leven_tail nop jmp .Lmod_loop .align 32 .Lmod_loop: movdqa $Xi,$Xhi pshufd \$0b01001110,$Xi,$T2 # pxor $Xi,$T2 # movdqa $Xmn,$T1 pshufd \$0b01001110,$Xi,$Xmn # pxor $Xi,$Xmn # pclmulqdq \$0x00,$Hkey2,$Xi pclmulqdq \$0x11,$Hkey2,$Xhi pclmulqdq \$0x10,$HK,$T2 pclmulqdq \$0x10,$HK,$Xmn pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi movdqu ($inp),$Xhn # Ii pxor $Xi,$T1 # aggregated Karatsuba post-processing pshufb $T3,$Xhn movdqu 16($inp),$Xln # Ii+1 pxor $Xi,$T1 # aggregated Karatsuba post-processing pxor $Xhi,$T1 pxor $Xhn,$Xhi # "Ii+Xi", consume early pxor $T1,$T2 pxor $T1,$Xmn pshufb $T3,$Xln movdqa $T2,$T1 # movdqa $Xmn,$T1 # psrldq \$8,$T1 pslldq \$8,$T2 # pslldq \$8,$Xmn # pxor $T1,$Xhi pxor $T2,$Xi # pxor $Xmn,$Xi # movdqa $Xln,$Xhn # movdqa $Xi,$T2 # 1st phase movdqa $Xi,$T1 psllq \$5,$Xi pclmulqdq \$0x00,$Hkey,$Xln ####### pxor $Xi,$T1 # pclmulqdq \$0x00,$Hkey,$Xln ####### psllq \$1,$Xi pxor $T1,$Xi # psllq \$57,$Xi # Loading @@ -871,9 +881,9 @@ $code.=<<___; pslldq \$8,$Xi psrldq \$8,$T1 # pxor $T2,$Xi pshufd \$0b01001110,$Xhn,$Xmn pxor $T1,$Xhi # pshufd \$0b01001110,$Xhn,$T1 pxor $Xhn,$T1 # pxor $Xhn,$Xmn # pclmulqdq \$0x11,$Hkey,$Xhn ####### movdqa $Xi,$T2 # 2nd phase Loading @@ -882,33 +892,35 @@ $code.=<<___; pxor $Xi,$T2 psrlq \$5,$Xi pxor $T2,$Xi # lea 32($inp),$inp psrlq \$1,$Xi # pclmulqdq \$0x00,$HK,$T1 ####### pclmulqdq \$0x00,$HK,$Xmn ####### pxor $Xhi,$Xi # .byte 0x66,0x90 lea 32($inp),$inp sub \$0x20,$len ja .Lmod_loop .Leven_tail: movdqa $Xi,$Xhi pshufd \$0b01001110,$Xi,$T2 # pxor $Xi,$T2 # movdqa $Xmn,$T1 pshufd \$0b01001110,$Xi,$Xmn # pxor $Xi,$Xmn # pclmulqdq \$0x00,$Hkey2,$Xi pclmulqdq \$0x11,$Hkey2,$Xhi pclmulqdq \$0x10,$HK,$T2 pclmulqdq \$0x10,$HK,$Xmn pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi pxor $Xi,$T1 pxor $Xhi,$T1 pxor $T1,$T2 movdqa $T2,$T1 # pxor $T1,$Xmn movdqa $Xmn,$T1 # psrldq \$8,$T1 pslldq \$8,$T2 # pslldq \$8,$Xmn # pxor $T1,$Xhi pxor $T2,$Xi # pxor $Xmn,$Xi # ___ &reduction_alg9 ($Xhi,$Xi); $code.=<<___; Loading Loading
crypto/modes/asm/ghash-x86.pl +9 −7 Original line number Diff line number Diff line Loading @@ -1021,13 +1021,14 @@ my ($Xhi,$Xi) = @_; &pshufd ($T1,$Xn,0b01001110); # H*Ii+1 &movdqa ($Xhn,$Xn); &pxor ($T1,$Xn); # &lea ($inp,&DWP(32,$inp)); # i+=2 &pclmulqdq ($Xn,$Hkey,0x00); ####### &pclmulqdq ($Xhn,$Hkey,0x11); ####### &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 &pclmulqdq ($T1,$T3,0x00); ####### &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 &nop (); &lea ($inp,&DWP(32,$inp)); # i+=2 &sub ($len,0x20); &jbe (&label("even_tail")); &jmp (&label("mod_loop")); Loading @@ -1036,22 +1037,23 @@ my ($Xhi,$Xi) = @_; &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) &movdqa ($Xhi,$Xi); &pxor ($T2,$Xi); # &nop (); &pclmulqdq ($Xi,$Hkey,0x00); ####### &pclmulqdq ($Xhi,$Hkey,0x11); ####### &movups ($Hkey,&QWP(0,$Htbl)); # load H &pclmulqdq ($T2,$T3,0x10); ####### &movdqa ($T3,&QWP(0,$const)); &movups ($Hkey,&QWP(0,$Htbl)); # load H &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) &movdqa ($T3,&QWP(0,$const)); &xorps ($Xhi,$Xhn); &movdqu ($Xhn,&QWP(0,$inp)); # Ii &pxor ($T1,$Xi); # aggregated Karatsuba post-processing &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 &pxor ($T1,$Xhi); # &pxor ($T2,$T1); # &pshufb ($Xhn,$T3); &pxor ($T2,$T1); # &movdqa ($T1,$T2); # &psrldq ($T2,8); Loading @@ -1068,8 +1070,8 @@ my ($Xhi,$Xi) = @_; &pxor ($T1,$Xi); # &psllq ($Xi,1); &pxor ($Xi,$T1); # &movups ($T3,&QWP(32,$Htbl)); &pclmulqdq ($Xn,$Hkey,0x00); ####### &movups ($T3,&QWP(32,$Htbl)); &psllq ($Xi,57); # &movdqa ($T1,$Xi); # &pslldq ($Xi,8); Loading @@ -1080,9 +1082,9 @@ my ($Xhi,$Xi) = @_; &movdqa ($T2,$Xi); # 2nd phase &psrlq ($Xi,1); &pxor ($T1,$Xhn); &pxor ($Xhi,$T2); # &pclmulqdq ($Xhn,$Hkey,0x11); ####### &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 &pxor ($Xhi,$T2); # &pxor ($T2,$Xi); &psrlq ($Xi,5); &pxor ($Xi,$T2); # Loading
crypto/modes/asm/ghash-x86_64.pl +37 −25 Original line number Diff line number Diff line Loading @@ -214,6 +214,7 @@ ___ $code=<<___; .text .extern OPENSSL_ia32cap_P .globl gcm_gmult_4bit .type gcm_gmult_4bit,\@function,2 Loading Loading @@ -597,7 +598,8 @@ ___ } { my ($Xip,$Htbl,$inp,$len)=@_4args; my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10)); my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7)); my ($T1,$T2,$T3)=map("%xmm$_",(8..10)); $code.=<<___; .globl gcm_ghash_clmul Loading @@ -624,7 +626,6 @@ $code.=<<___ if ($win64); ___ $code.=<<___; movdqa .Lbswap_mask(%rip),$T3 mov \$0xA040608020C0E000,%rax # ((7..0)0xE0)&0xff movdqu ($Xip),$Xi movdqu ($Htbl),$Hkey Loading @@ -640,10 +641,16 @@ if ($do4xaggr) { my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); $code.=<<___; mov OPENSSL_ia32cap_P+4(%rip),%eax cmp \$0x30,$len jb .Lskip4x and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE cmp \$`1<<22`,%eax # check for MOVBE without XSAVE je .Lskip4x sub \$0x30,$len mov \$0xA040608020C0E000,%rax # ((7..0)0xE0)&0xff movdqu 0x30($Htbl),$Hkey3 movdqu 0x40($Htbl),$Hkey4 Loading Loading @@ -819,51 +826,54 @@ $code.=<<___; pxor $T1,$Xi # Ii+Xi movdqa $Xln,$Xhn pshufd \$0b01001110,$Xln,$T1 pxor $Xln,$T1 pshufd \$0b01001110,$Xln,$Xmn pxor $Xln,$Xmn pclmulqdq \$0x00,$Hkey,$Xln pclmulqdq \$0x11,$Hkey,$Xhn pclmulqdq \$0x00,$HK,$T1 pclmulqdq \$0x00,$HK,$Xmn lea 32($inp),$inp # i+=2 nop sub \$0x20,$len jbe .Leven_tail nop jmp .Lmod_loop .align 32 .Lmod_loop: movdqa $Xi,$Xhi pshufd \$0b01001110,$Xi,$T2 # pxor $Xi,$T2 # movdqa $Xmn,$T1 pshufd \$0b01001110,$Xi,$Xmn # pxor $Xi,$Xmn # pclmulqdq \$0x00,$Hkey2,$Xi pclmulqdq \$0x11,$Hkey2,$Xhi pclmulqdq \$0x10,$HK,$T2 pclmulqdq \$0x10,$HK,$Xmn pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi movdqu ($inp),$Xhn # Ii pxor $Xi,$T1 # aggregated Karatsuba post-processing pshufb $T3,$Xhn movdqu 16($inp),$Xln # Ii+1 pxor $Xi,$T1 # aggregated Karatsuba post-processing pxor $Xhi,$T1 pxor $Xhn,$Xhi # "Ii+Xi", consume early pxor $T1,$T2 pxor $T1,$Xmn pshufb $T3,$Xln movdqa $T2,$T1 # movdqa $Xmn,$T1 # psrldq \$8,$T1 pslldq \$8,$T2 # pslldq \$8,$Xmn # pxor $T1,$Xhi pxor $T2,$Xi # pxor $Xmn,$Xi # movdqa $Xln,$Xhn # movdqa $Xi,$T2 # 1st phase movdqa $Xi,$T1 psllq \$5,$Xi pclmulqdq \$0x00,$Hkey,$Xln ####### pxor $Xi,$T1 # pclmulqdq \$0x00,$Hkey,$Xln ####### psllq \$1,$Xi pxor $T1,$Xi # psllq \$57,$Xi # Loading @@ -871,9 +881,9 @@ $code.=<<___; pslldq \$8,$Xi psrldq \$8,$T1 # pxor $T2,$Xi pshufd \$0b01001110,$Xhn,$Xmn pxor $T1,$Xhi # pshufd \$0b01001110,$Xhn,$T1 pxor $Xhn,$T1 # pxor $Xhn,$Xmn # pclmulqdq \$0x11,$Hkey,$Xhn ####### movdqa $Xi,$T2 # 2nd phase Loading @@ -882,33 +892,35 @@ $code.=<<___; pxor $Xi,$T2 psrlq \$5,$Xi pxor $T2,$Xi # lea 32($inp),$inp psrlq \$1,$Xi # pclmulqdq \$0x00,$HK,$T1 ####### pclmulqdq \$0x00,$HK,$Xmn ####### pxor $Xhi,$Xi # .byte 0x66,0x90 lea 32($inp),$inp sub \$0x20,$len ja .Lmod_loop .Leven_tail: movdqa $Xi,$Xhi pshufd \$0b01001110,$Xi,$T2 # pxor $Xi,$T2 # movdqa $Xmn,$T1 pshufd \$0b01001110,$Xi,$Xmn # pxor $Xi,$Xmn # pclmulqdq \$0x00,$Hkey2,$Xi pclmulqdq \$0x11,$Hkey2,$Xhi pclmulqdq \$0x10,$HK,$T2 pclmulqdq \$0x10,$HK,$Xmn pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi pxor $Xi,$T1 pxor $Xhi,$T1 pxor $T1,$T2 movdqa $T2,$T1 # pxor $T1,$Xmn movdqa $Xmn,$T1 # psrldq \$8,$T1 pslldq \$8,$T2 # pslldq \$8,$Xmn # pxor $T1,$Xhi pxor $T2,$Xi # pxor $Xmn,$Xi # ___ &reduction_alg9 ($Xhi,$Xi); $code.=<<___; Loading