Loading crypto/modes/asm/ghash-x86_64.pl +24 −26 Original line number Diff line number Diff line Loading @@ -59,11 +59,11 @@ # longer. A CPU with higher pclmulqdq issue rate would also benefit # from higher aggregate factor... # # Westmere 1.76(+14%) # Sandy Bridge 1.79(+9%) # Ivy Bridge 1.79(+8%) # Westmere 1.78(+13%) # Sandy Bridge 1.80(+8%) # Ivy Bridge 1.80(+7%) # Haswell 0.55(+93%) (if system doesn't support AVX) # Bulldozer 1.52(+25%) # Bulldozer 1.49(+27%) # March 2013 # Loading Loading @@ -673,8 +673,8 @@ $code.=<<___; pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey2,$Xl pclmulqdq \$0x11,$Hkey2,$Xh xorps $Xl,$Xln pclmulqdq \$0x10,$HK,$Xm xorps $Xl,$Xln xorps $Xh,$Xhn movups 0x50($Htbl),$HK xorps $Xm,$Xmn Loading @@ -692,8 +692,8 @@ $code.=<<___; pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 pclmulqdq \$0x11,$Hkey3,$Xh xorps $Xl,$Xln pclmulqdq \$0x00,$HK,$Xm xorps $Xl,$Xln xorps $Xh,$Xhn lea 0x40($inp),$inp Loading @@ -711,23 +711,23 @@ $code.=<<___; xorps $Xln,$Xi movdqu 0x20($inp),$Xln movdqa $Xl,$Xh pshufd \$0b01001110,$Xl,$Xm pclmulqdq \$0x10,$HK,$T1 pshufd \$0b01001110,$Xl,$Xm xorps $Xhn,$Xhi pxor $Xl,$Xm pshufb $T3,$Xln movups 0x20($Htbl),$HK pclmulqdq \$0x00,$Hkey,$Xl xorps $Xmn,$T1 movdqa $Xln,$Xhn pclmulqdq \$0x00,$Hkey,$Xl pshufd \$0b01001110,$Xln,$Xmn pxor $Xi,$T1 # aggregated Karatsuba post-processing pxor $Xln,$Xmn movdqa $Xln,$Xhn pxor $Xhi,$T1 # pxor $Xln,$Xmn movdqa $T1,$T2 # pslldq \$8,$T1 pclmulqdq \$0x11,$Hkey,$Xh pslldq \$8,$T1 psrldq \$8,$T2 # pxor $T1,$Xi movdqa .L7_mask(%rip),$T1 Loading @@ -736,8 +736,8 @@ $code.=<<___; pand $Xi,$T1 # 1st phase pshufb $T1,$T2 # pclmulqdq \$0x00,$HK,$Xm pxor $Xi,$T2 # pclmulqdq \$0x00,$HK,$Xm psllq \$57,$T2 # movdqa $T2,$T1 # pslldq \$8,$T2 Loading @@ -764,32 +764,31 @@ $code.=<<___; movdqa $Xl,$Xh pxor $Xm,$Xmn pshufd \$0b01001110,$Xl,$Xm pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey3,$Xl pxor $T2,$Xi # pxor $T1,$Xhi pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey3,$Xl psrlq \$1,$Xi # pxor $Xhi,$Xi # movdqa $Xi,$Xhi pclmulqdq \$0x11,$Hkey3,$Xh xorps $Xl,$Xln pxor $Xhi,$Xi # pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 pclmulqdq \$0x00,$HK,$Xm xorps $Xh,$Xhn movdqa $Xi,$Xhi pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 lea 0x40($inp),$inp sub \$0x40,$len jnc .Lmod4_loop .Ltail4x: pclmulqdq \$0x00,$Hkey4,$Xi xorps $Xm,$Xmn pclmulqdq \$0x11,$Hkey4,$Xhi xorps $Xln,$Xi pclmulqdq \$0x10,$HK,$T1 xorps $Xm,$Xmn xorps $Xln,$Xi xorps $Xhn,$Xhi pxor $Xi,$Xhi # aggregated Karatsuba post-processing pxor $Xmn,$T1 Loading Loading @@ -852,13 +851,13 @@ $code.=<<___; pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi movdqu ($inp),$Xhn # Ii movdqu ($inp),$T2 # Ii pxor $Xi,$T1 # aggregated Karatsuba post-processing pshufb $T3,$Xhn pshufb $T3,$T2 movdqu 16($inp),$Xln # Ii+1 pxor $Xhi,$T1 pxor $Xhn,$Xhi # "Ii+Xi", consume early pxor $T2,$Xhi # "Ii+Xi", consume early pxor $T1,$Xmn pshufb $T3,$Xln movdqa $Xmn,$T1 # Loading @@ -885,9 +884,9 @@ $code.=<<___; pxor $T1,$Xhi # pxor $Xhn,$Xmn # pclmulqdq \$0x11,$Hkey,$Xhn ####### movdqa $Xi,$T2 # 2nd phase psrlq \$1,$Xi pclmulqdq \$0x11,$Hkey,$Xhn ####### pxor $T2,$Xhi # pxor $Xi,$T2 psrlq \$5,$Xi Loading @@ -896,7 +895,6 @@ $code.=<<___; psrlq \$1,$Xi # pclmulqdq \$0x00,$HK,$Xmn ####### pxor $Xhi,$Xi # .byte 0x66,0x90 sub \$0x20,$len ja .Lmod_loop Loading Loading
crypto/modes/asm/ghash-x86_64.pl +24 −26 Original line number Diff line number Diff line Loading @@ -59,11 +59,11 @@ # longer. A CPU with higher pclmulqdq issue rate would also benefit # from higher aggregate factor... # # Westmere 1.76(+14%) # Sandy Bridge 1.79(+9%) # Ivy Bridge 1.79(+8%) # Westmere 1.78(+13%) # Sandy Bridge 1.80(+8%) # Ivy Bridge 1.80(+7%) # Haswell 0.55(+93%) (if system doesn't support AVX) # Bulldozer 1.52(+25%) # Bulldozer 1.49(+27%) # March 2013 # Loading Loading @@ -673,8 +673,8 @@ $code.=<<___; pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey2,$Xl pclmulqdq \$0x11,$Hkey2,$Xh xorps $Xl,$Xln pclmulqdq \$0x10,$HK,$Xm xorps $Xl,$Xln xorps $Xh,$Xhn movups 0x50($Htbl),$HK xorps $Xm,$Xmn Loading @@ -692,8 +692,8 @@ $code.=<<___; pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 pclmulqdq \$0x11,$Hkey3,$Xh xorps $Xl,$Xln pclmulqdq \$0x00,$HK,$Xm xorps $Xl,$Xln xorps $Xh,$Xhn lea 0x40($inp),$inp Loading @@ -711,23 +711,23 @@ $code.=<<___; xorps $Xln,$Xi movdqu 0x20($inp),$Xln movdqa $Xl,$Xh pshufd \$0b01001110,$Xl,$Xm pclmulqdq \$0x10,$HK,$T1 pshufd \$0b01001110,$Xl,$Xm xorps $Xhn,$Xhi pxor $Xl,$Xm pshufb $T3,$Xln movups 0x20($Htbl),$HK pclmulqdq \$0x00,$Hkey,$Xl xorps $Xmn,$T1 movdqa $Xln,$Xhn pclmulqdq \$0x00,$Hkey,$Xl pshufd \$0b01001110,$Xln,$Xmn pxor $Xi,$T1 # aggregated Karatsuba post-processing pxor $Xln,$Xmn movdqa $Xln,$Xhn pxor $Xhi,$T1 # pxor $Xln,$Xmn movdqa $T1,$T2 # pslldq \$8,$T1 pclmulqdq \$0x11,$Hkey,$Xh pslldq \$8,$T1 psrldq \$8,$T2 # pxor $T1,$Xi movdqa .L7_mask(%rip),$T1 Loading @@ -736,8 +736,8 @@ $code.=<<___; pand $Xi,$T1 # 1st phase pshufb $T1,$T2 # pclmulqdq \$0x00,$HK,$Xm pxor $Xi,$T2 # pclmulqdq \$0x00,$HK,$Xm psllq \$57,$T2 # movdqa $T2,$T1 # pslldq \$8,$T2 Loading @@ -764,32 +764,31 @@ $code.=<<___; movdqa $Xl,$Xh pxor $Xm,$Xmn pshufd \$0b01001110,$Xl,$Xm pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey3,$Xl pxor $T2,$Xi # pxor $T1,$Xhi pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey3,$Xl psrlq \$1,$Xi # pxor $Xhi,$Xi # movdqa $Xi,$Xhi pclmulqdq \$0x11,$Hkey3,$Xh xorps $Xl,$Xln pxor $Xhi,$Xi # pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 pclmulqdq \$0x00,$HK,$Xm xorps $Xh,$Xhn movdqa $Xi,$Xhi pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 lea 0x40($inp),$inp sub \$0x40,$len jnc .Lmod4_loop .Ltail4x: pclmulqdq \$0x00,$Hkey4,$Xi xorps $Xm,$Xmn pclmulqdq \$0x11,$Hkey4,$Xhi xorps $Xln,$Xi pclmulqdq \$0x10,$HK,$T1 xorps $Xm,$Xmn xorps $Xln,$Xi xorps $Xhn,$Xhi pxor $Xi,$Xhi # aggregated Karatsuba post-processing pxor $Xmn,$T1 Loading Loading @@ -852,13 +851,13 @@ $code.=<<___; pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi movdqu ($inp),$Xhn # Ii movdqu ($inp),$T2 # Ii pxor $Xi,$T1 # aggregated Karatsuba post-processing pshufb $T3,$Xhn pshufb $T3,$T2 movdqu 16($inp),$Xln # Ii+1 pxor $Xhi,$T1 pxor $Xhn,$Xhi # "Ii+Xi", consume early pxor $T2,$Xhi # "Ii+Xi", consume early pxor $T1,$Xmn pshufb $T3,$Xln movdqa $Xmn,$T1 # Loading @@ -885,9 +884,9 @@ $code.=<<___; pxor $T1,$Xhi # pxor $Xhn,$Xmn # pclmulqdq \$0x11,$Hkey,$Xhn ####### movdqa $Xi,$T2 # 2nd phase psrlq \$1,$Xi pclmulqdq \$0x11,$Hkey,$Xhn ####### pxor $T2,$Xhi # pxor $Xi,$T2 psrlq \$5,$Xi Loading @@ -896,7 +895,6 @@ $code.=<<___; psrlq \$1,$Xi # pclmulqdq \$0x00,$HK,$Xmn ####### pxor $Xhi,$Xi # .byte 0x66,0x90 sub \$0x20,$len ja .Lmod_loop Loading