Commit 619b9466 authored by Andy Polyakov
Browse files

Add support for Intel SHA extension.

parent fd2309aa
Loading
Loading
Loading
Loading
+228 −22
Original line number Diff line number Diff line
@@ -118,7 +118,9 @@ $code.=<<___;
aesni_cbc_sha1_enc:
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11
	bt	\$61,%r11		# check SHA bit
	jc	aesni_cbc_sha1_enc_shaext
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r11d		# mask AVX bit
@@ -200,7 +202,7 @@ $code.=<<___;
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	mov	$key,%r15
	lea	112($key),%r15			# size optimization
	movdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
@@ -209,7 +211,7 @@ my $rounds="${ivp}d";
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240($key),$rounds
	mov	240-112($key),$rounds
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
@@ -243,8 +245,8 @@ $code.=<<___;
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	movups	($key),$rndkey0		# $key[0]
	movups	16($key),$rndkey[0]	# forward reference
	movups	-112($key),$rndkey0	# $key[0]
	movups	16-112($key),$rndkey[0]	# forward reference
	jmp	.Loop_ssse3
___

@@ -261,31 +263,31 @@ ___
___
      $code.=<<___;
	xorps		$in,$iv
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*$k`($key),$rndkey[1]
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)`($key),$rndkey[1]
	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)`($key),$rndkey[0]
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)`($key),$rndkey[1]
	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)`($key),$rndkey[0]
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16($key),$rndkey[1]		# forward reference
	movups		16-112($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*$k`($key),$rndkey[1]
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
@@ -1041,7 +1043,7 @@ $code.=<<___;
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	mov	$key,%r15
	lea	112($key),%r15			# size optimization
	vmovdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
@@ -1050,8 +1052,7 @@ my $rounds="${ivp}d";
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240($key),$rounds
	add	\$112,$key		# size optimization
	mov	240-112($key),$rounds
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
@@ -1651,11 +1652,180 @@ K_XX_XX:
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0

.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
						{{{
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$rounds="%r11d";

($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
@rndkey=("%xmm0","%xmm1");
$r=0;

my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
my @MSG=map("%xmm$_",(3..6));

$code.=<<___;
.type	aesni_cbc_sha1_enc_shaext,\@function,6
.align	32
aesni_cbc_sha1_enc_shaext:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
___
# Win64 only: spill %xmm6-%xmm15 before they are used as scratch.
# The ten 16-byte save slots are addressed downwards from %rax and reach
# as low as -8-10*16(%rax), so the stack pointer has to be lowered by the
# same 8+10*16 bytes; reserving less would leave the spilled registers
# below %rsp.  %rax is presumably the stack pointer on entry (it is what
# the epilogue copies back into %rsp) -- TODO confirm where it is set.
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0x50(%rip),$BSWAP	# byte-n-word swap

	mov	240($key),$rounds
	sub	$in0,$out
	movups	($key),$rndkey0			# $key[0]
	movups	16($key),$rndkey[0]		# forward reference
	lea	112($key),$key			# size optimization

	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	pshufd	\$0b00011011,$E,$E		# flip word order
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
___
	&$aesenc();
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$E,$E_SAVE		# offload $E
	pshufb		$BSWAP,@MSG[0]
	movdqu		0x10($inp),@MSG[1]
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
	&$aesenc();
$code.=<<___;
	pshufb		$BSWAP,@MSG[1]

	paddd		@MSG[0],$E
	movdqu		0x20($inp),@MSG[2]
	lea		0x40($inp),$inp
	pxor		$E_SAVE,@MSG[0]		# black magic
___
	&$aesenc();
$code.=<<___;
	pxor		$E_SAVE,@MSG[0]		# black magic
	movdqa		$ABCD,$E_
	pshufb		$BSWAP,@MSG[2]
	sha1rnds4	\$0,$E,$ABCD		# 0-3
	sha1nexte	@MSG[1],$E_
___
	&$aesenc();
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqu		-0x10($inp),@MSG[3]
	movdqa		$ABCD,$E
	pshufb		$BSWAP,@MSG[3]
___
	&$aesenc();
$code.=<<___;
	sha1rnds4	\$0,$E_,$ABCD		# 4-7
	sha1nexte	@MSG[2],$E
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
___
	&$aesenc();

for($i=2;$i<20-4;$i++) {
$code.=<<___;
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 8-11
	sha1nexte	@MSG[3],$E_
___
	&$aesenc();
$code.=<<___;
	sha1msg2	@MSG[3],@MSG[0]
	pxor		@MSG[3],@MSG[1]
	sha1msg1	@MSG[3],@MSG[2]
___
	($E,$E_)=($E_,$E);
	push(@MSG,shift(@MSG));

	&$aesenc();
}
$code.=<<___;
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[3],$E_
	sha1msg2	@MSG[3],@MSG[0]
	pxor		@MSG[3],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[0],$E
	sha1msg2	@MSG[0],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	movdqa		$E_SAVE,@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[1],$E_
___
	&$aesenc();
$code.=<<___;
	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$MSG[0],$E
___
	while($r<40)	{ &$aesenc(); }		# remaining aesenc's
$code.=<<___;
	dec		$len

	paddd		$ABCD_SAVE,$ABCD
	movups		$iv,48($out,$in0)	# write output
	lea		64($in0),$in0
	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movups	$iv,($ivp)			# write IV
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-10*16(%rax),%xmm6
	movaps	-8-9*16(%rax),%xmm7
	movaps	-8-8*16(%rax),%xmm8
	movaps	-8-7*16(%rax),%xmm9
	movaps	-8-6*16(%rax),%xmm10
	movaps	-8-5*16(%rax),%xmm11
	movaps	-8-4*16(%rax),%xmm12
	movaps	-8-3*16(%rax),%xmm13
	movaps	-8-2*16(%rax),%xmm14
	movaps	-8-1*16(%rax),%xmm15
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
___
						}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
@@ -1793,12 +1963,43 @@ sub rex {

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    push @opcode,$rex|0x40	if($rex);
    unshift @opcode,$rex|0x40	if($rex);
}

# Turn the operand string of "sha1rnds4 \$imm,%xmmS,%xmmD" into a ".byte"
# directive carrying the raw encoding (0F 3A CC /r ib, REX-prefixed by
# rex() when needed).  Operand strings of any other shape are handed back
# as a plain "sha1rnds4" mnemonic line.
sub sha1rnds4 {
    my $operands = $_[0];

    # not "imm,xmm,xmm": leave the instruction for the assembler
    return "sha1rnds4\t".$operands
	unless ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/);

    my ($imm,$src,$dst) = ($1,$2,$3);
    my @bytes = (0x0f,0x3a,0xcc);

    rex(\@bytes,$dst,$src);
    push @bytes, 0xc0|($src&7)|(($dst&7)<<3);		# ModR/M
    # immediates with a leading 0 (0x.., 0b.., octal) go through oct()
    push @bytes, ($imm =~ /^0/ ? oct($imm) : $imm);

    return ".byte\t".join(',',@bytes);
}

# Encode one of the two-operand SHA-1 helper instructions (sha1nexte,
# sha1msg1, sha1msg2) given as mnemonic plus "%xmmS,%xmmD" operand string.
# Returns a ".byte" directive with the raw 0F 38 xx /r encoding, or the
# original "mnemonic<TAB>operands" text when the mnemonic is unknown or
# the operands are not two %xmm registers.
sub sha1op38 {
    my ($mnemonic,$operands) = @_;
    my %third_byte = (
	sha1nexte => 0xc8,
	sha1msg1  => 0xc9,
	sha1msg2  => 0xca,
    );
    my $op = $third_byte{$mnemonic};

    if (defined($op) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($src,$dst) = ($1,$2);
	my @bytes = (0x0f,0x38);

	rex(\@bytes,$dst,$src);
	return ".byte\t".join(',',@bytes,$op,
			      0xc0|($src&7)|(($dst&7)<<3));	# ModR/M
    }

    return $mnemonic."\t".$operands;
}

sub aesni {
  my $line=shift;
  my @opcode=(0x66);
  my @opcode=(0x0f,0x38);

    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
@@ -1807,15 +2008,20 @@ sub aesni {
	);
	return undef if (!defined($opcodelet{$1}));
	rex(\@opcode,$3,$2);
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	unshift @opcode,0x66;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;

print $code;
	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo		or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo		or
	s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;

	print $_,"\n";
}
close STDOUT;
+322 −2
Original line number Diff line number Diff line
@@ -112,8 +112,13 @@ $code.=<<___ if ($avx);
	cmp	\$0,`$win64?"%rcx":"%rdi"`
	je	.Lprobe
	mov	0(%r11),%eax
	mov	4(%r11),%r10d
	mov	8(%r11),%r11d
	mov	4(%r11),%r10

	bt	\$61,%r10			# check for SHA
	jc	${func}_shaext

	mov	%r10,%r11
	shr	\$32,%r11

	test	\$`1<<11`,%r10d			# check for XOP
	jnz	${func}_xop
@@ -1196,6 +1201,288 @@ $code.=<<___;
.size	${func}_avx2,.-${func}_avx2
___
}}
}}
{{
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");
my $r=0;
my $sn=0;

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));

# Append the next slice of the interleaved AES-CBC encryption to $code and
# advance the slice counter $r.  Ten consecutive calls cover one 16-byte
# block; the callers keep invoking it until $r reaches 40, i.e. four
# blocks per loop iteration.  Round keys are addressed as offset-112 from
# $key because $key is advanced by 112 before the loop ("size
# optimization").
my $aesenc=sub {
  use integer;
  # $n: which block of the current group, $k: step within that block
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      # first step: load input block $n and xor it with $rndkey0
      $code.=<<___;
	movups		`16*$n`($in0),$in		# load input
	xorps		$rndkey0,$in
___
      # every block but the first also stores the previous block's result
      $code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      # chain into $iv, prefetch the next round key, run the first round
      $code.=<<___;
	xorps		$in,$iv
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    } elsif ($k==9) {
      # last step: $sn keeps the local label unique per emitted copy.
      # The emitted code compares $rounds with 11 and performs no, two or
      # four additional aesenc rounds before aesenclast (presumably the
      # three AES key sizes -- TODO confirm), then reloads the key at
      # 16-112($key) for the next block.
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference
	nop
___
    } else {
      # middle steps: prefetch the following round key, run one round
      $code.=<<___;
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    }
    # swap the two round-key registers: the key just loaded is used next
    $r++;	unshift(@rndkey,pop(@rndkey));
};

$code.=<<___;
.type	${func}_shaext,\@function,6
.align	32
${func}_shaext:
	mov	%rsp,%rax
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	push	%rbx
___
# Win64 only: spill %xmm6-%xmm15 before they are used as scratch.
# %rax holds the stack pointer on entry and %rbx has just been pushed, so
# %rsp is %rax-8 here.  The ten 16-byte save slots reach down to
# -8-10*16(%rax); lowering %rsp by the full 10*16 bytes puts it exactly at
# the lowest slot, so none of the spilled registers ends up below %rsp.
$code.=<<___ if ($win64);
	lea	`-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	lea		K256+0x80(%rip),$Tbl
	movdqu		($ctx),$ABEF		# DCBA
	movdqu		16($ctx),$CDGH		# HGFE
	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask

	mov		240($key),$rounds
	sub		$in0,$out
	movups		($key),$rndkey0		# $key[0]
	movups		16($key),$rndkey[0]	# forward reference
	lea		112($key),$key		# size optimization

	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa		$TMP,$BSWAP		# offload
	palignr		\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH		# CDGH

	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	movdqu		($inp),@MSG[0]
	movdqu		0x10($inp),@MSG[1]
	movdqu		0x20($inp),@MSG[2]
	pshufb		$TMP,@MSG[0]
	movdqu		0x30($inp),@MSG[3]

	movdqa		0*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	pshufb		$TMP,@MSG[1]
	movdqa		$CDGH,$CDGH_SAVE	# offload
	movdqa		$ABEF,$ABEF_SAVE	# offload
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 0-3
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		1*32-0x80($Tbl),$Wi
	paddd		@MSG[1],$Wi
	pshufb		$TMP,@MSG[2]
	lea		0x40($inp),$inp
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 4-7
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		2*32-0x80($Tbl),$Wi
	paddd		@MSG[2],$Wi
	pshufb		$TMP,@MSG[3]
	sha256msg1	@MSG[1],@MSG[0]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 8-11
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[3],$TMP
	palignr		\$4,@MSG[2],$TMP
	paddd		$TMP,@MSG[0]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		3*32-0x80($Tbl),$Wi
	paddd		@MSG[3],$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 12-15
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	movdqa		@MSG[0],$TMP
	palignr		\$4,@MSG[3],$TMP
	paddd		$TMP,@MSG[1]
	sha256rnds2	$CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
	&$aesenc()	if (($r%10)==0);
$code.=<<___;
	movdqa		$i*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 16-19...
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[1],$TMP
	palignr		\$4,@MSG[0],$TMP
	paddd		$TMP,@MSG[2]
___
	&$aesenc();
	&$aesenc()	if ($r==19);
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF
___
	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqa		13*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 52-55
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[1],$TMP
	palignr		\$4,@MSG[0],$TMP
	paddd		$TMP,@MSG[2]
___
	&$aesenc();
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		14*32-0x80($Tbl),$Wi
	paddd		@MSG[1],$Wi
	sha256msg2	@MSG[1],@MSG[2]
	movdqa		$BSWAP,$TMP
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 56-59
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		15*32-0x80($Tbl),$Wi
	paddd		@MSG[2],$Wi
___
	&$aesenc();
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 60-63
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF
	#pxor		$CDGH,$rndkey0		# black magic
___
	while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
$code.=<<___;
	#xorps		$CDGH,$rndkey0		# black magic
	paddd		$CDGH_SAVE,$CDGH
	paddd		$ABEF_SAVE,$ABEF

	dec		$len
	movups		$iv,48($out,$in0)	# write output
	lea		64($in0),$in0
	jnz		.Loop_shaext

	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF		# DCBA
	palignr		\$8,$TMP,$CDGH		# HGFE

	movups		$iv,($ivp)		# write IV
	movdqu		$ABEF,($ctx)
	movdqu		$CDGH,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-10*16(%rax),%xmm6
	movaps	-8-9*16(%rax),%xmm7
	movaps	-8-8*16(%rax),%xmm8
	movaps	-8-7*16(%rax),%xmm9
	movaps	-8-6*16(%rax),%xmm10
	movaps	-8-5*16(%rax),%xmm11
	movaps	-8-4*16(%rax),%xmm12
	movaps	-8-3*16(%rax),%xmm13
	movaps	-8-2*16(%rax),%xmm14
	movaps	-8-1*16(%rax),%xmm15
.Lepilogue_shaext:
___
$code.=<<___;
	mov	-8(%rax),%rbx
	mov	%rax,%rsp
	ret
.size	${func}_shaext,.-${func}_shaext
___
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
@@ -1347,6 +1634,39 @@ $code.=<<___ if ($avx>1);
___
}

####################################################################
# Prepend a REX prefix to a list of opcode bytes when an extended
# register (number 8 or above) is involved.
#   1st argument: reference to the array of opcode bytes, modified in place
#   $dst: register number that goes into ModR/M.reg (sets REX.R when >= 8)
#   $src: register number that goes into ModR/M.r/m (sets REX.B when >= 8)
# Nothing is added when both registers are below 8.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @$opcode,$rex|0x40	if($rex);
}

{
  # third opcode byte of the 0F 38 xx encodings handled below
  my %opcodelet = (
		"sha256rnds2" => 0xcb,
  		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

  # Encode a SHA-256 extension instruction as raw bytes.
  #   $instr:    mnemonic (sha256rnds2, sha256msg1 or sha256msg2)
  #   $operands: operand text, expected to contain "%xmmS,%xmmD"
  # Returns a ".byte" directive with the encoding (REX-prefixed when a
  # register number is 8 or above), or "mnemonic<TAB>operands" unchanged
  # when the mnemonic is unknown or the operands are not two %xmm
  # registers.
  sub sha256op38 {
    my ($instr,$operands) = @_;

    # Register numbers may have two digits: the SHA-256 code path keeps
    # its message schedule in %xmm10-%xmm13, so matching a single digit
    # would either miss those operands or truncate the register number.
    if (defined($opcodelet{$instr}) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my ($src,$dst)=($1,$2);
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$dst,$src);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$operands;
    }
  }
}

# Post-process the generated text and write it out:
#  1. evaluate every `...` expression embedded in the assembly,
#  2. replace sha256* instructions with raw bytes where sha256op38 can
#     encode them,
#  3. print the result.
# Output is buffered, so a write failure may only surface when STDOUT is
# closed; treat that as fatal instead of silently emitting truncated code.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
+120 −0
Original line number Diff line number Diff line
@@ -79,6 +79,10 @@
# strongly, it's probably more appropriate to discuss possibility of
# using vector rotate XOP on AMD...

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
@@ -303,6 +307,7 @@ if ($alt) {

&function_begin("sha1_block_data_order");
if ($xmm) {
  &static_label("shaext_shortcut");
  &static_label("ssse3_shortcut");
  &static_label("avx_shortcut")		if ($ymm);
  &static_label("K_XX_XX");
@@ -317,8 +322,11 @@ if ($xmm) {
	&mov	($D,&DWP(4,$T));
	&test	($D,1<<9);		# check SSSE3 bit
	&jz	(&label("x86"));
	&mov	($C,&DWP(8,$T));
	&test	($A,1<<24);		# check FXSR bit
	&jz	(&label("x86"));
	&test	($C,1<<29);		# check SHA bit
	&jnz	(&label("shaext_shortcut"));
	if ($ymm) {
		&and	($D,1<<28);		# mask AVX bit
		&and	($A,1<<30);		# mask "Intel CPU" bit
@@ -397,6 +405,117 @@ if ($xmm) {
&function_end("sha1_block_data_order");

if ($xmm) {
{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("edi","esi","ecx");
my ($ABCD,$E,$E_,$BSWAP)=map("xmm$_",(0..3));
my @MSG=map("xmm$_",(4..7));

# Emit the raw encoding of "sha1rnds4 dst,src,imm" (0F 3A CC /r ib) via
# data_byte.  Only xmm0-xmm7 register operands are encodable; for any
# other operand pair nothing is emitted.
sub sha1rnds4 {
    my ($dst,$src,$imm)=@_;
    my ($reg,$rm) = ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/);

    &data_byte(0x0f,0x3a,0xcc,0xc0|($reg<<3)|$rm,$imm)	if (defined($rm));
}
# Emit the raw encoding of a two-operand SHA-1 helper instruction
# (0F 38 <opcodelet> /r) via data_byte.  $opcodelet is the third opcode
# byte; only xmm0-xmm7 register operands are encodable, anything else
# emits nothing.
sub sha1op38 {
    my ($opcodelet,$dst,$src)=@_;
    my ($reg,$rm) = ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/);

    &data_byte(0x0f,0x38,$opcodelet,0xc0|($reg<<3)|$rm)	if (defined($rm));
}
# Thin wrappers around sha1op38: each one supplies the third opcode byte
# of its instruction and passes the (dst,src) operands straight through.
sub sha1nexte	{ sha1op38(0xc8,@_); }
sub sha1msg1	{ sha1op38(0xc9,@_); }
sub sha1msg2	{ sha1op38(0xca,@_); }

&function_begin("_sha1_block_data_order_shaext");
	&call	(&label("pic_point"));	# make it PIC!
	&set_label("pic_point");
	&blindpop($tmp1);
	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
&set_label("shaext_shortcut");
	&mov	($ctx,&wparam(0));
	&mov	("ebx","esp");
	&mov	($inp,&wparam(1));
	&mov	($num,&wparam(2));
	&sub	("esp",32);

	# Load the hash state: four words at offset 0 into $ABCD and the
	# single word at offset 16 into $E.  movd transfers 32 bits, so its
	# memory operand is a DWP, matching the DWP store of $E at the end
	# of the function.
	&movdqu	($ABCD,&QWP(0,$ctx));
	&movd	($E,&DWP(16,$ctx));
	&and	("esp",-32);			# align the scratch area
	&movdqa	($BSWAP,&QWP(0x50,$tmp1));	# byte-n-word swap

	&movdqu	(@MSG[0],&QWP(0,$inp));
	&pshufd	($ABCD,$ABCD,0b00011011);	# flip word order
	&movdqu	(@MSG[1],&QWP(0x10,$inp));
	&pshufd	($E,$E,0b00011011);		# flip word order
	&movdqu	(@MSG[2],&QWP(0x20,$inp));
	&pshufb	(@MSG[0],$BSWAP);
	&movdqu	(@MSG[3],&QWP(0x30,$inp));
	&pshufb	(@MSG[1],$BSWAP);
	&pshufb	(@MSG[2],$BSWAP);
	&pshufb	(@MSG[3],$BSWAP);
	&jmp	(&label("loop_shaext"));

&set_label("loop_shaext",16);
	&dec		($num);
	&lea		("eax",&DWP(0x40,$inp));
	&movdqa		(&QWP(0,"esp"),$E);	# offload $E
	&paddd		($E,@MSG[0]);
	&cmovne		($inp,"eax");
	&movdqa		(&QWP(16,"esp"),$ABCD);	# offload $ABCD

for($i=0;$i<20-4;$i+=2) {
	&sha1msg1	(@MSG[0],@MSG[1]);
	&movdqa		($E_,$ABCD);
	&sha1rnds4	($ABCD,$E,int($i/5));	# 0-3...
	&sha1nexte	($E_,@MSG[1]);
	&pxor		(@MSG[0],@MSG[2]);
	&sha1msg1	(@MSG[1],@MSG[2]);
	&sha1msg2	(@MSG[0],@MSG[3]);

	&movdqa		($E,$ABCD);
	&sha1rnds4	($ABCD,$E_,int(($i+1)/5));
	&sha1nexte	($E,@MSG[2]);
	&pxor		(@MSG[1],@MSG[3]);
	&sha1msg2	(@MSG[1],@MSG[0]);

	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
	&movdqu		(@MSG[0],&QWP(0,$inp));
	&movdqa		($E_,$ABCD);
	&sha1rnds4	($ABCD,$E,3);		# 64-67
	&sha1nexte	($E_,@MSG[1]);
	&movdqu		(@MSG[1],&QWP(0x10,$inp));
	&pshufb		(@MSG[0],$BSWAP);

	&movdqa		($E,$ABCD);
	&sha1rnds4	($ABCD,$E_,3);		# 68-71
	&sha1nexte	($E,@MSG[2]);
	&movdqu		(@MSG[2],&QWP(0x20,$inp));
	&pshufb		(@MSG[1],$BSWAP);

	&movdqa		($E_,$ABCD);
	&sha1rnds4	($ABCD,$E,3);		# 72-75
	&sha1nexte	($E_,@MSG[3]);
	&movdqu		(@MSG[3],&QWP(0x30,$inp));
	&pshufb		(@MSG[2],$BSWAP);

	&movdqa		($E,$ABCD);
	&sha1rnds4	($ABCD,$E_,3);		# 76-79
	&movdqa		($E_,&QWP(0,"esp"));
	&pshufb		(@MSG[3],$BSWAP);
	&sha1nexte	($E,$E_);
	&paddd		($ABCD,&QWP(16,"esp"));

	&jnz		(&label("loop_shaext"));

	# Undo the word-order flip applied on entry, write the updated
	# state back to the context and restore the stack pointer that was
	# saved in ebx.  Each emitter call is its own statement: without the
	# terminating semicolon the following "&movd" would be parsed as a
	# bitwise AND of two calls.
	&pshufd	($ABCD,$ABCD,0b00011011);
	&pshufd	($E,$E,0b00011011);
	&movdqu	(&QWP(0,$ctx),$ABCD);
	&movd	(&DWP(16,$ctx),$E);
	&mov	("esp","ebx");
&function_end("_sha1_block_data_order_shaext");
}
######################################################################
# The SSSE3 implementation.
#
@@ -1340,6 +1459,7 @@ sub Xtail_avx()
&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc);	# K_40_59
&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6);	# K_60_79
&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# pbswap mask
&data_byte(0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0);
}
&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");

+509 −2

File changed.

Preview size limit exceeded, changes collapsed.

+199 −3

File changed.

Preview size limit exceeded, changes collapsed.

Loading