Commit 1ea8ae50 authored by Andy Polyakov
Browse files

poly1305/asm/poly1305-*.pl: flip horizontal add and reduction.



Formally, only the 32-bit AVX2 code path needs this, but I choose to
harmonize all vector code paths.

RT#4346
Reviewed-by: Richard Levitte <levitte@openssl.org>
parent bdbd3aea
Loading
Loading
Loading
Loading
+9 −9
Original line number Diff line number Diff line
@@ -1056,6 +1056,15 @@ poly1305_blocks_neon:
	vmlal.u32	$D2,$H3#lo,$S4

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	$D3#lo,$D3#lo,$D3#hi
	vadd.i64	$D0#lo,$D0#lo,$D0#hi
	vadd.i64	$D4#lo,$D4#lo,$D4#hi
	vadd.i64	$D1#lo,$D1#lo,$D1#hi
	vadd.i64	$D2#lo,$D2#lo,$D2#hi

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

@@ -1086,15 +1095,6 @@ poly1305_blocks_neon:
	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
	 vadd.i64	$D4,$D4,$T1		@ h3 -> h4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	$D2#lo,$D2#lo,$D2#hi
	vadd.i64	$D0#lo,$D0#lo,$D0#hi
	vadd.i64	$D3#lo,$D3#lo,$D3#hi
	vadd.i64	$D1#lo,$D1#lo,$D1#hi
	vadd.i64	$D4#lo,$D4#lo,$D4#hi

	cmp		$len,#0
	bne		.Leven

+13 −13
Original line number Diff line number Diff line
@@ -790,6 +790,19 @@ poly1305_blocks_neon:
	umlal	$ACC2,$IN01_4,${S3}

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	$ACC3,$ACC3,$ACC3
	 ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	$ACC0,$ACC0,$ACC0
	 ldp	d10,d11,[sp,#32]
	addp	$ACC4,$ACC4,$ACC4
	 ldp	d12,d13,[sp,#48]
	addp	$ACC1,$ACC1,$ACC1
	 ldp	d14,d15,[sp,#64]
	addp	$ACC2,$ACC2,$ACC2

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

@@ -821,19 +834,6 @@ poly1305_blocks_neon:
	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1
	 add	$ACC4,$ACC4,$T1.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	$ACC2,$ACC2,$ACC2
	 ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	$ACC0,$ACC0,$ACC0
	 ldp	d10,d11,[sp,#32]
	addp	$ACC1,$ACC1,$ACC1
	 ldp	d12,d13,[sp,#48]
	addp	$ACC3,$ACC3,$ACC3
	 ldp	d14,d15,[sp,#64]
	addp	$ACC4,$ACC4,$ACC4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

+31 −28
Original line number Diff line number Diff line
@@ -536,6 +536,8 @@ my $base = shift; $base = "esp" if (!defined($base));
			     },"edx");

sub lazy_reduction {
my $extra = shift;

	################################################################
	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	# and P. Schwabe
@@ -543,6 +545,7 @@ sub lazy_reduction {
	 &movdqa	($T0,$D3);
	 &pand		($D3,$MASK);
	 &psrlq		($T0,26);
	 &$extra	()				if (defined($extra));
	 &paddq		($T0,$D4);			# h3 -> h4
	&movdqa		($T1,$D0);
	&pand		($D0,$MASK);
@@ -1091,21 +1094,21 @@ my $addr = shift;

&set_label("short_tail");

	&lazy_reduction	();

	################################################################
	# horizontal addition

	&pshufd		($T1,$D4,0b01001110);
	&pshufd		($T0,$D3,0b01001110);
	&paddq		($D4,$T1);
	&paddq		($D3,$T0);
	&pshufd		($T1,$D0,0b01001110);
	&pshufd		($T0,$D1,0b01001110);
	&paddd		($D0,$T1);
	&paddq		($D0,$T1);
	&paddq		($D1,$T0);
	&pshufd		($T1,$D2,0b01001110);
	&paddd		($D1,$T0);
	&pshufd		($T0,$D3,0b01001110);
	&paddd		($D2,$T1);
	&pshufd		($T1,$D4,0b01001110);
	&paddd		($D3,$T0);
	&paddd		($D4,$T1);
	#&paddq		($D2,$T1);

	&lazy_reduction	(sub { &paddq ($D2,$T1) });

&set_label("done");
	&movd		(&DWP(-16*3+4*0,"edi"),$D0);	# store hash value
@@ -1113,8 +1116,8 @@ my $addr = shift;
	&movd		(&DWP(-16*3+4*2,"edi"),$D2);
	&movd		(&DWP(-16*3+4*3,"edi"),$D3);
	&movd		(&DWP(-16*3+4*4,"edi"),$D4);
&set_label("nodata");
	&mov	("esp","ebp");
&set_label("nodata");
&function_end("_poly1305_blocks_sse2");

&align	(32);
@@ -1435,7 +1438,7 @@ sub X { my $reg=shift; $reg=~s/^ymm/xmm/; $reg; }
	&test	("eax","eax");				# is_base2_26?
	&jz	(&label("enter_blocks"));

&set_label("enter_avx2",16);
&set_label("enter_avx2");
	&vzeroupper	();

	&call	(&label("pic_point"));
@@ -1731,31 +1734,31 @@ sub vlazy_reduction {

	&vpmuladd	(sub {	my $i=shift; &QWP(4+32*$i-128,"edx");	});

	&vlazy_reduction();

	################################################################
	# horizontal addition

	&vpsrldq	($T0,$D4,8);
	&vpsrldq	($T1,$D3,8);
	&vpaddq		($D4,$D4,$T0);
	&vpsrldq	($T0,$D0,8);
	&vpaddq		($D3,$D3,$T1);
	&vpsrldq	($T1,$D1,8);
	&vpaddq		($D0,$D0,$T0);
	&vpsrldq	($T0,$D2,8);
	&vpaddq		($D1,$D1,$T1);
	&vpsrldq	($T1,$D3,8);
	&vpermq		($T1,$D4,2);			# keep folding
	&vpaddq		($D2,$D2,$T0);
	&vpsrldq	($T0,$D4,8);
	&vpaddq		($D3,$D3,$T1);
	&vpermq		($T1,$D0,2);			# keep folding
	&vpaddq		($D4,$D4,$T0);
	&vpermq		($T0,$D3,2);
	&vpaddq		($D4,$D4,$T1);
	&vpermq		($T1,$D0,2);
	&vpaddq		($D3,$D3,$T0);
	&vpermq		($T0,$D1,2);
	&vpaddq		($D0,$D0,$T1);
	&vpermq		($T1,$D2,2);
	&vpaddq		($D1,$D1,$T0);
	&vpermq		($T0,$D3,2);
	&vpaddq		($D2,$D2,$T1);
	&vpermq		($T1,$D4,2);
	&vpaddq		($D3,$D3,$T0);
	&vpaddq		($D4,$D4,$T1);

	&vlazy_reduction();

	&cmp		("ecx",0);
	&je		(&label("done"));
@@ -1772,14 +1775,14 @@ sub vlazy_reduction {
	&jmp		(&label("even"));

&set_label("done",16);
	&vmovd		(&DWP(-16*3+4*0,"edi"),"xmm0");	# store hash value
	&vmovd		(&DWP(-16*3+4*1,"edi"),"xmm1");
	&vmovd		(&DWP(-16*3+4*2,"edi"),"xmm2");
	&vmovd		(&DWP(-16*3+4*3,"edi"),"xmm3");
	&vmovd		(&DWP(-16*3+4*4,"edi"),"xmm4");
	&vmovd		(&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value
	&vmovd		(&DWP(-16*3+4*1,"edi"),&X($D1));
	&vmovd		(&DWP(-16*3+4*2,"edi"),&X($D2));
	&vmovd		(&DWP(-16*3+4*3,"edi"),&X($D3));
	&vmovd		(&DWP(-16*3+4*4,"edi"),&X($D4));
	&vzeroupper	();
&set_label("nodata");
	&mov	("esp","ebp");
&set_label("nodata");
&function_end("_poly1305_blocks_avx2");
}
&set_label("const_sse2",64);
+44 −44
Original line number Diff line number Diff line
@@ -1197,6 +1197,20 @@ $code.=<<___;
	vpaddq		$T3,$D0,$D0		# d0 += h1*s4

.Lshort_tail_avx:
	################################################################
	# horizontal addition

	vpsrldq		\$8,$D4,$T4
	vpsrldq		\$8,$D3,$T3
	vpsrldq		\$8,$D1,$T1
	vpsrldq		\$8,$D0,$T0
	vpsrldq		\$8,$D2,$T2
	vpaddq		$T3,$D3,$D3
	vpaddq		$T4,$D4,$D4
	vpaddq		$T0,$D0,$D0
	vpaddq		$T1,$D1,$D1
	vpaddq		$T2,$D2,$D2

	################################################################
	# lazy reduction

@@ -1231,25 +1245,11 @@ $code.=<<___;
	vpand		$MASK,$D3,$D3
	vpaddq		$H3,$D4,$D4		# h3 -> h4

	################################################################
	# horizontal addition

	vpsrldq		\$8,$D2,$T2
	vpsrldq		\$8,$D0,$T0
	vpsrldq		\$8,$D1,$T1
	vpsrldq		\$8,$D3,$T3
	vpsrldq		\$8,$D4,$T4
	vpaddq		$T2,$D2,$H2
	vpaddq		$T0,$D0,$H0
	vpaddq		$T1,$D1,$H1
	vpaddq		$T3,$D3,$H3
	vpaddq		$T4,$D4,$H4

	vmovd		$H0,`4*0-48-64`($ctx)	# save partially reduced
	vmovd		$H1,`4*1-48-64`($ctx)
	vmovd		$H2,`4*2-48-64`($ctx)
	vmovd		$H3,`4*3-48-64`($ctx)
	vmovd		$H4,`4*4-48-64`($ctx)
	vmovd		$D0,`4*0-48-64`($ctx)	# save partially reduced
	vmovd		$D1,`4*1-48-64`($ctx)
	vmovd		$D2,`4*2-48-64`($ctx)
	vmovd		$D3,`4*3-48-64`($ctx)
	vmovd		$D4,`4*4-48-64`($ctx)
___
$code.=<<___	if ($win64);
	vmovdqa		0x50(%r11),%xmm6
@@ -1887,6 +1887,31 @@ $code.=<<___;
	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4

	################################################################
	# horizontal addition

	vpsrldq		\$8,$D1,$T1
	vpsrldq		\$8,$H2,$T2
	vpsrldq		\$8,$H3,$T3
	vpsrldq		\$8,$H4,$T4
	vpsrldq		\$8,$H0,$T0
	vpaddq		$T1,$D1,$D1
	vpaddq		$T2,$H2,$H2
	vpaddq		$T3,$H3,$H3
	vpaddq		$T4,$H4,$H4
	vpaddq		$T0,$H0,$H0

	vpermq		\$0x2,$H3,$T3
	vpermq		\$0x2,$H4,$T4
	vpermq		\$0x2,$H0,$T0
	vpermq		\$0x2,$D1,$T1
	vpermq		\$0x2,$H2,$T2
	vpaddq		$T3,$H3,$H3
	vpaddq		$T4,$H4,$H4
	vpaddq		$T0,$H0,$H0
	vpaddq		$T1,$D1,$D1
	vpaddq		$T2,$H2,$H2

	################################################################
	# lazy reduction

@@ -1921,31 +1946,6 @@ $code.=<<___;
	vpand		$MASK,$H3,$H3
	vpaddq		$D3,$H4,$H4		# h3 -> h4

	################################################################
	# horizontal addition

	vpsrldq		\$8,$H2,$T2
	vpsrldq		\$8,$H0,$T0
	vpsrldq		\$8,$H1,$T1
	vpsrldq		\$8,$H3,$T3
	vpsrldq		\$8,$H4,$T4
	vpaddq		$T2,$H2,$H2
	vpaddq		$T0,$H0,$H0
	vpaddq		$T1,$H1,$H1
	vpaddq		$T3,$H3,$H3
	vpaddq		$T4,$H4,$H4

	vpermq		\$0x2,$H2,$T2
	vpermq		\$0x2,$H0,$T0
	vpermq		\$0x2,$H1,$T1
	vpermq		\$0x2,$H3,$T3
	vpermq		\$0x2,$H4,$T4
	vpaddq		$T2,$H2,$H2
	vpaddq		$T0,$H0,$H0
	vpaddq		$T1,$H1,$H1
	vpaddq		$T3,$H3,$H3
	vpaddq		$T4,$H4,$H4

	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
	vmovd		%x#$H1,`4*1-48-64`($ctx)
	vmovd		%x#$H2,`4*2-48-64`($ctx)
+31 −0
Original line number Diff line number Diff line
@@ -667,6 +667,20 @@ static const struct poly1305_test poly1305_tests[] = {
     "95d5c005503e510d8cd0aa072c4a4d06""6eabc52d11653df47fbf63ab198bcc26",
     "f248312e578d9d58f8b7bb4d19105431"
    },
    /*
     * AVX2 in poly1305-x86.pl failed this with 176+32 split
     */
    {
    "248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd"
    "2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e8"
    "74cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c"
    "8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936a"
    "ff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a37"
    "09894e4eb0a4eedc4ae19468e66b81f2"
    "71351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb",
    "000102030405060708090a0b0c0d0e0f""00000000000000000000000000000000",
    "bc939bc5281480fa99c6d68c258ec42f"
    },
    /*
     * test vectors from Google
     */
@@ -844,6 +858,23 @@ int main()
                printf("\n");
                return 1;
            }

            for (half = 16; half < inlen; half += 16) {
                Poly1305_Init(&poly1305, key);
                Poly1305_Update(&poly1305, in, half);
                Poly1305_Update(&poly1305, in+half, inlen-half);
                Poly1305_Final(&poly1305, out);

                if (memcmp(out, expected, sizeof(expected)) != 0) {
                    printf("Poly1305 test #%d/%d failed.\n", i, half);
                    printf("got:      ");
                    hexdump(out, sizeof(out));
                    printf("\nexpected: ");
                    hexdump(expected, sizeof(expected));
                    printf("\n");
                    return 1;
                }
            }
        }

        free(in);