sha/asm/sha1-armv4-large.pl: add NEON and ARMv8 code paths. (9250a306) · Commits · CYBER - Cyber Security / TS 103 523 MSP / ETS / ETS OpenSSL

crypto/sha/asm/sha1-armv4-large.pl

+432 −8

Original line number	Diff line number	Diff line
		#!/usr/bin/env perl

		# ====================================================================
		# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
		# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
		# project. The module is, however, dual licensed under OpenSSL and
		# CRYPTOGAMS licenses depending on where you obtain it. For further
		# details see http://www.openssl.org/~appro/cryptogams/.
		@@ -52,6 +52,20 @@
		# Profiler-assisted and platform-specific optimization resulted in 10%
		# improvement on Cortex A8 core and 12.2 cycles per byte.

		# September 2013.
		#
		# Add NEON implementation (see sha1-586.pl for background info). On
		# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
		# faster than integer-only code. Because [fully unrolled] NEON code
		# is ~2.5x larger and there are some redundant instructions executed
		# when processing last block, improvement is not as big for smallest
		# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
		# byte, which is also >80% faster than integer-only code.

		# May 2014.
		#
		# Add ARMv8 code path performing at 2.35 cpb on Apple A7.

		# Consume command-line arguments until one looks like a file name
		# ("name.ext": word characters and dashes, a dot, then an extension).
		# The scan also stops at the first false argument (e.g. end of @ARGV).
		while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
		# Redirect STDOUT to that name. The result of open is not checked, so a
		# failed open (including an empty $output) is silent.
		open STDOUT,">$output";

		@@ -153,12 +167,22 @@ $code=<<___;
		#include "arm_arch.h"

		.text
		.code 32

		.global sha1_block_data_order
		.type sha1_block_data_order,%function

		.align 2
		.align 5
		sha1_block_data_order:
		#if __ARM_ARCH__>=7
		sub r3,pc,#8 @ sha1_block_data_order
		ldr r12,.LOPENSSL_armcap
		ldr r12,[r3,r12] @ OPENSSL_armcap_P
		tst r12,#8
		bne .LARMv8
		tst r12,#1
		bne .LNEON
		#endif
		stmdb sp!,{r4-r12,lr}
		add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
		ldmia $ctx,{$a,$b,$c,$d,$e}
		@@ -233,16 +257,416 @@ $code.=<<___;
		moveq pc,lr @ be binary compatible with V4, yet
		bx lr @ interoperable with Thumb ISA:-)
		#endif
		.align 2
		.size sha1_block_data_order,.-sha1_block_data_order

		.align 5
		.LK_00_19: .word 0x5a827999
		.LK_20_39: .word 0x6ed9eba1
		.LK_40_59: .word 0x8f1bbcdc
		.LK_60_79: .word 0xca62c1d6
		.size sha1_block_data_order,.-sha1_block_data_order
		.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
		.align 2
		.LOPENSSL_armcap:
		.word OPENSSL_armcap_P-sha1_block_data_order
		.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
		.align 5
		___
		#####################################################################
		# NEON stuff
		#
		{{{
		# Working-variable list rotated by the body_* snippets. $a..$e are not
		# declared in this block, so they come from the enclosing scope (not
		# visible in this hunk).
		my @V=($a,$b,$c,$d,$e);
		# Integer registers r8..r12 and r14, named for their role in the NEON path.
		my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
		# Step counter tested modulo 5 and modulo 4 by the Xupdate_* subs.
		my $Xi=4;
		# Eight q registers holding message-schedule vectors; rotated by Xupdate_*.
		my @X=map("q$_",(8..11,0..3));
		# Two temporary q registers.
		my @Tx=("q12","q13");
		# $K is loaded from the table addressed by $K_XX_XX; $zero is cleared
		# with veor in the prologue.
		my ($K,$zero)=("q14","q15");
		# Round counter advanced by every body_* snippet; used for stack offsets.
		my $j=0;

		# Catch-all for calls to undefined subs such as &vext_8(...) or &add(...):
		# the sub name becomes the instruction mnemonic (first "_" turned into
		# "."), the arguments become the comma-separated operand list, and the
		# resulting line is appended to $code. A purely numeric last argument is
		# emitted as an immediate ("#N").
		sub AUTOLOAD() # thunk [simplified] x86-style perlasm
		{
		    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;	# strip the package qualifier
		    $mnemonic =~ s/_/\./;			# vext_8 -> vext.8
		    my $final = pop;
		    if ($final*1 eq $final) {
			$final = "#$final";
		    }
		    $code .= "\t" . $mnemonic . "\t" . join(',', @_, $final) . "\n";
		}

		# Returns the instruction snippets for one integer round that uses the
		# F_00_19 function. Each list element is a string of Perl source; the
		# caller eval's them one at a time so they can be interleaved with NEON
		# instructions. The first element also loads ($a..$e) from @V, and the
		# last one increments $j and rotates @V for the next round.
		sub body_00_19 () {
		(
		'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
		'&bic ($t0,$d,$b)',
		'&add ($e,$e,$Ki)', # e+=X[i]+K
		'&and ($t1,$c,$b)',
		'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
		'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
		'&eor ($t1,$t1,$t0)', # F_00_19
		'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
		'&add ($e,$e,$t1);'. # e+=F_00_19
		'$j++; unshift(@V,pop(@V));'
		)
		}
		# Returns the instruction snippets for one integer round that uses the
		# F_20_39 function (three-way XOR of b, c and d). Same contract as
		# body_00_19: strings to be eval'ed in order, with the first loading
		# ($a..$e) from @V and the last advancing $j and rotating @V. The load
		# of the next X[i]+K word is suppressed once $j reaches 79.
		sub body_20_39 () {
		(
		'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
		'&eor ($t0,$b,$d)',
		'&add ($e,$e,$Ki)', # e+=X[i]+K
		'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
		'&eor ($t1,$t0,$c)', # F_20_39
		'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
		'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
		'&add ($e,$e,$t1);'. # e+=F_20_39
		'$j++; unshift(@V,pop(@V));'
		)
		}
		# Returns the instruction snippets for one integer round that uses the
		# F_40_59 function, accumulated into e in two parts: (c AND d) and
		# ((c XOR d) AND b). Same contract as body_00_19: strings to be eval'ed
		# in order, with the first loading ($a..$e) from @V and the last
		# advancing $j and rotating @V.
		sub body_40_59 () {
		(
		'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
		'&add ($e,$e,$Ki)', # e+=X[i]+K
		'&and ($t0,$c,$d)',
		'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
		'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
		'&eor ($t1,$c,$d)',
		'&add ($e,$e,$t0)',
		'&and ($t1,$t1,$b)',
		'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
		'&add ($e,$e,$t1);'. # e+=F_40_59
		'$j++; unshift(@V,pop(@V));'
		)
		}

		# Emits the NEON instructions that produce the next schedule vector in
		# @X[0], interleaved with the integer instructions of four rounds.
		#
		# Argument: a reference to one of the body_* subs. It is called four
		# times up front to collect the round snippets into @insns; each
		# eval(shift(@insns)) below emits the next integer instruction between
		# two NEON ones. Whatever is left in @insns is flushed at the end.
		#
		# Side effects: appends to $code (through the AUTOLOAD thunk and the
		# eval'ed snippets), increments $Xi and rotates @X by one register.
		sub Xupdate_16_31 ()
		{ use integer;
		my $body = shift;
		my @insns = (&$body,&$body,&$body,&$body);
		# Declared so the eval'ed snippets assign to these lexicals.
		my ($a,$b,$c,$d,$e);

		&vext_8 (@X[0],@X[-4&7],@X[-3&7],8); # compose "X[-14]" in "X[0]"
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
		&vadd_i32 (@Tx[1],@X[-1&7],$K);
		eval(shift(@insns));
		# Fetch the next constant on every fifth step.
		&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
		eval(shift(@insns));
		&vext_8 (@Tx[0],@X[-1&7],$zero,4); # "X[-3]", 3 words
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
		&veor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
		eval(shift(@insns));
		eval(shift(@insns));
		&veor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
		eval(shift(@insns));
		eval(shift(@insns));
		&veor (@Tx[0],@Tx[0],@X[0]); # "X[0]"^="X[-3]"^"X[-8]"
		eval(shift(@insns));
		eval(shift(@insns));
		&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
		# Rewind the transfer pointer by 64 bytes on every fourth step.
		&sub ($Xfer,$Xfer,64) if ($Xi%4==0);
		eval(shift(@insns));
		eval(shift(@insns));
		&vext_8 (@Tx[1],$zero,@Tx[0],4); # "X[0]"<<96, extract one dword
		eval(shift(@insns));
		eval(shift(@insns));
		&vadd_i32 (@X[0],@Tx[0],@Tx[0]);
		eval(shift(@insns));
		eval(shift(@insns));
		&vsri_32 (@X[0],@Tx[0],31); # "X[0]"<<<=1
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
		&vshr_u32 (@Tx[0],@Tx[1],30);
		eval(shift(@insns));
		eval(shift(@insns));
		&vshl_u32 (@Tx[1],@Tx[1],2);
		eval(shift(@insns));
		eval(shift(@insns));
		&veor (@X[0],@X[0],@Tx[0]);
		eval(shift(@insns));
		eval(shift(@insns));
		&veor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2

		foreach (@insns) { eval; } # remaining instructions [if any]

		$Xi++; push(@X,shift(@X)); # "rotate" X[]
		}

		# Emits the shorter NEON schedule update used after the first four
		# steps, interleaved with the integer instructions of four rounds.
		#
		# Argument: a reference to one of the body_* subs, called four times to
		# fill @insns; each eval(shift(@insns)) emits the next integer
		# instruction. Leftover snippets are flushed at the end.
		#
		# Side effects: appends to $code, increments $Xi and rotates @X by one
		# register.
		sub Xupdate_32_79 ()
		{ use integer;
		my $body = shift;
		my @insns = (&$body,&$body,&$body,&$body);
		# Declared so the eval'ed snippets assign to these lexicals.
		my ($a,$b,$c,$d,$e);

		&vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8); # compose "X[-6]"
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
		&veor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
		eval(shift(@insns));
		eval(shift(@insns));
		&veor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
		eval(shift(@insns));
		eval(shift(@insns));
		&vadd_i32 (@Tx[1],@X[-1&7],$K);
		eval(shift(@insns));
		# Fetch the next constant on every fifth step.
		&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
		eval(shift(@insns));
		&veor (@Tx[0],@Tx[0],@X[0]); # "X[-6]"^="X[0]"
		eval(shift(@insns));
		eval(shift(@insns));
		&vshr_u32 (@X[0],@Tx[0],30);
		eval(shift(@insns));
		eval(shift(@insns));
		&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
		# Rewind the transfer pointer by 64 bytes on every fourth step.
		&sub ($Xfer,$Xfer,64) if ($Xi%4==0);
		eval(shift(@insns));
		eval(shift(@insns));
		&vsli_32 (@X[0],@Tx[0],2); # "X[0]"="X[-6]"<<<2

		foreach (@insns) { eval; } # remaining instructions [if any]

		$Xi++; push(@X,shift(@X)); # "rotate" X[]
		}

		# Emits the final X[]+K transfer of the current block and starts loading
		# the next input block, interleaved with the integer instructions of
		# four rounds.
		#
		# Argument: a reference to one of the body_* subs, called four times to
		# fill @insns. Unlike the Xupdate_* subs this one does not rotate @X; it
		# resets $Xi to 0 for the Xloop calls that follow.
		sub Xuplast_80 ()
		{ use integer;
		my $body = shift;
		my @insns = (&$body,&$body,&$body,&$body);
		# Declared so the eval'ed snippets assign to these lexicals.
		my ($a,$b,$c,$d,$e);

		&vadd_i32 (@Tx[1],@X[-1&7],$K);
		eval(shift(@insns));
		eval(shift(@insns));
		&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!");
		&sub ($Xfer,$Xfer,64);

		# Compare the input pointer with the end pointer; the resulting flags
		# drive the conditional subeq here.
		&teq ($inp,$len);
		&sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
		&subeq ($inp,$inp,64); # reload last block to avoid SEGV
		&vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
		eval(shift(@insns));
		eval(shift(@insns));
		&vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!");
		eval(shift(@insns));
		eval(shift(@insns));
		&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!"); # load K_00_19
		eval(shift(@insns));
		eval(shift(@insns));
		&vrev32_8 (@X[-4&7],@X[-4&7]);

		foreach (@insns) { eval; } # remaining instructions

		$Xi=0;
		}

		# Emits the byte-swap, K addition and store for one vector of the freshly
		# loaded block, interleaved with the integer instructions of four rounds.
		#
		# Argument: a reference to one of the body_* subs, called four times to
		# fill @insns. Registers are selected by indexing @X with $Xi, which is
		# incremented on exit; @X itself is not rotated here.
		sub Xloop()
		{ use integer;
		my $body = shift;
		my @insns = (&$body,&$body,&$body,&$body);
		# Declared so the eval'ed snippets assign to these lexicals.
		my ($a,$b,$c,$d,$e);

		&vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]);
		eval(shift(@insns));
		eval(shift(@insns));
		&vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K);
		eval(shift(@insns));
		eval(shift(@insns));
		&vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU

		foreach (@insns) { eval; }

		$Xi++;
		}

		$code.=<<___;
		#if __ARM_ARCH__>=7
		.fpu neon

		.type sha1_block_data_order_neon,%function
		.align 4
		sha1_block_data_order_neon:
		.LNEON:
		stmdb sp!,{r4-r12,lr}
		add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
		@ dmb @ errata #451034 on early Cortex A8
		@ vstmdb sp!,{d8-d15} @ ABI specification says so
		mov $saved_sp,sp
		sub sp,sp,#64 @ alloca
		adr $K_XX_XX,.LK_00_19
		bic sp,sp,#15 @ align for 128-bit stores

		ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
		mov $Xfer,sp

		vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
		veor $zero,$zero,$zero
		vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]!
		vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19
		vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on
		vrev32.8 @X[-3&7],@X[-3&7] @ big-endian...
		vrev32.8 @X[-2&7],@X[-2&7]
		vadd.i32 @X[0],@X[-4&7],$K
		vrev32.8 @X[-1&7],@X[-1&7]
		vadd.i32 @X[1],@X[-3&7],$K
		vst1.32 {@X[0]},[$Xfer,:128]!
		vadd.i32 @X[2],@X[-2&7],$K
		vst1.32 {@X[1]},[$Xfer,:128]!
		vst1.32 {@X[2]},[$Xfer,:128]!
		ldr $Ki,[sp] @ big RAW stall

		.Loop_neon:
		___
		# Unrolled loop body: 20 generator calls, each pulling four rounds from
		# the body_* sub it is given, i.e. 80 rounds per block.
		# Five calls with body_00_19 (four full updates, then the short update).
		&Xupdate_16_31(\&body_00_19);
		&Xupdate_16_31(\&body_00_19);
		&Xupdate_16_31(\&body_00_19);
		&Xupdate_16_31(\&body_00_19);
		&Xupdate_32_79(\&body_00_19);
		# Five calls with body_20_39.
		&Xupdate_32_79(\&body_20_39);
		&Xupdate_32_79(\&body_20_39);
		&Xupdate_32_79(\&body_20_39);
		&Xupdate_32_79(\&body_20_39);
		&Xupdate_32_79(\&body_20_39);
		# Five calls with body_40_59.
		&Xupdate_32_79(\&body_40_59);
		&Xupdate_32_79(\&body_40_59);
		&Xupdate_32_79(\&body_40_59);
		&Xupdate_32_79(\&body_40_59);
		&Xupdate_32_79(\&body_40_59);
		# Five calls with body_20_39 again; the last four overlap the rounds with
		# the final transfer and the load/pre-processing of the next block.
		&Xupdate_32_79(\&body_20_39);
		&Xuplast_80(\&body_20_39);
		&Xloop(\&body_20_39);
		&Xloop(\&body_20_39);
		&Xloop(\&body_20_39);
		$code.=<<___;
		ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context
		add $a,$a,$Ki
		ldr $Ki,[$ctx,#16]
		add $b,$b,$t0
		add $c,$c,$t1
		add $d,$d,$Xfer
		moveq sp,$saved_sp
		add $e,$e,$Ki
		ldrne $Ki,[sp]
		stmia $ctx,{$a,$b,$c,$d,$e}
		addne $Xfer,sp,#3*16
		bne .Loop_neon

		@ vldmia sp!,{d8-d15}
		ldmia sp!,{r4-r12,pc}
		.size sha1_block_data_order_neon,.-sha1_block_data_order_neon
		#endif
		___
		}}}
		#####################################################################
		# ARMv8 stuff
		#
		{{{
		# q0..q3: state vector, the E word, and two alternating E temporaries
		# (swapped after every generated step in the loop below).
		my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
		# q4..q7: the four message vectors, rotated by the generator loop.
		my @MSG=map("q$_",(4..7));
		# q8..q11: one register per round constant, filled from .LK_00_19 onwards.
		my @Kxx=map("q$_",(8..11));
		# q12..q14: two alternating W+K temporaries and the saved copy of $ABCD.
		my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));

		$code.=<<___;
		#if __ARM_ARCH__>=7
		.type sha1_block_data_order_armv8,%function
		.align 5
		sha1_block_data_order_armv8:
		.LARMv8:
		vstmdb sp!,{d8-d15} @ ABI specification says so

		veor $E,$E,$E
		adr r3,.LK_00_19
		vld1.32 {$ABCD},[$ctx]!
		vld1.32 {$E\[0]},[$ctx]
		sub $ctx,$ctx,#16
		vld1.32 {@Kxx[0]\[]},[r3,:32]!
		vld1.32 {@Kxx[1]\[]},[r3,:32]!
		vld1.32 {@Kxx[2]\[]},[r3,:32]!
		vld1.32 {@Kxx[3]\[]},[r3,:32]

		.Loop_v8:
		vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
		vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
		vrev32.8 @MSG[0],@MSG[0]
		vrev32.8 @MSG[1],@MSG[1]

		vadd.i32 $W0,@Kxx[0],@MSG[0]
		vrev32.8 @MSG[2],@MSG[2]
		vmov $ABCD_SAVE,$ABCD @ offload
		subs $len,$len,#1

		vadd.i32 $W1,@Kxx[0],@MSG[1]
		vrev32.8 @MSG[3],@MSG[3]
		sha1h $E1,$ABCD @ 0
		sha1c $ABCD,$E,$W0
		vadd.i32 $W0,@Kxx[$j],@MSG[2]
		sha1su0 @MSG[0],@MSG[1],@MSG[2]
		___
		# Generate steps 1..16. Step 0 is written out by hand before this loop
		# and the remaining steps after it.
		for ($j=0,$i=1;$i<20-3;$i++) {
		# Pick the sha1c/sha1p/sha1m/sha1p variant from the step number: the
		# list index $i/5 is truncated to an integer.
		my $f=("c","p","m","p")[$i/5];
		$code.=<<___;
		sha1h $E0,$ABCD @ $i
		sha1$f $ABCD,$E1,$W1
		vadd.i32 $W1,@Kxx[$j],@MSG[3]
		sha1su1 @MSG[0],@MSG[3]
		___
		# The last generated step omits the sha1su0 instruction.
		$code.=<<___ if ($i<20-4);
		sha1su0 @MSG[1],@MSG[2],@MSG[3]
		___
		# Swap the alternating temporaries and rotate the message registers so
		# the same template serves the next step.
		($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
		# Move on to the next constant register when ($i+3) is a multiple of 5,
		# i.e. at steps 2, 7 and 12.
		push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
		}
		$code.=<<___;
		sha1h $E0,$ABCD @ $i
		sha1p $ABCD,$E1,$W1
		vadd.i32 $W1,@Kxx[$j],@MSG[3]

		sha1h $E1,$ABCD @ 18
		sha1p $ABCD,$E0,$W0

		sha1h $E0,$ABCD @ 19
		sha1p $ABCD,$E1,$W1

		vadd.i32 $E,$E,$E0
		vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
		bne .Loop_v8

		vst1.32 {$ABCD},[$ctx]!
		vst1.32 {$E\[0]},[$ctx]

		vldmia sp!,{d8-d15}
		bx lr
		.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
		#endif
		___
		}}}
		$code.=<<___;
		.comm OPENSSL_armcap_P,4,4
		___

		# Base encodings of the SHA-1 crypto-extension instructions, with all
		# register fields zero. Kept private to unsha1 by the enclosing block.
		{ my %opcode = (
		"sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
		"sha1m" => 0xf2200c40, "sha1su0" => 0xf2300c40,
		"sha1h" => 0xf3b902c0, "sha1su1" => 0xf3ba0380 );

		# Translate one SHA-1 instruction into a raw ".long" directive so the
		# output assembles with toolchains that do not know the mnemonics.
		#
		#   $mnemonic  key of %opcode, e.g. "sha1c"
		#   $arg       operand text, "qD,qM" or "qD,qN,qM"; whitespace after
		#              the commas is optional
		#
		# Returns the directive followed by the original instruction as an
		# assembler comment, or a false value when $arg does not match.
		sub unsha1 {
		    my ($mnemonic,$arg)=@_;

		    # $1 = destination, $2 = optional middle operand, $3 = last operand.
		    # Bits 0-2 and bit 3 of each register number go to separate fields.
		    $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o
		    &&
		    sprintf ".long\t0x%08x\t@ %s %s",
			$opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					  |(($2&7)<<17)|(($2&8)<<4)
					  |(($3&7)<<1) |(($3&8)<<2),
			$mnemonic,$arg;
		}
		}

		# Post-process the generated assembly line by line and print it.
		foreach (split($/,$code)) {
		    # Rewrite q-register shorthand as d registers: "{qN[]}" becomes the
		    # pair d(2N) and d(2N+1) in all-lanes form, "{qN[0]}" becomes
		    # lane 0 of d(2N).
		    s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo or
		    s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;

		    # Replace SHA-1 mnemonics with their raw encodings.
		    s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;

		    s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4

		    print $_,$/;
		}

		# NOTE(review): these two statements rewrite "bx lr" in the whole buffer
		# and print it in one go, although the foreach loop just before them has
		# already printed every line. Presumably they are the removed side of
		# this diff view; confirm before treating the page as a runnable file.
		$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
		print $code;
		close STDOUT; # enforce flush

crypto/sha/asm/sha256-armv4.pl

+118 −5

Original line number	Diff line number	Diff line
		@@ -31,6 +31,10 @@
		# code (meaning that latter performs sub-optimally, nothing was done
		# about it).

		# May 2014.
		#
		# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

		# Consume command-line arguments until one looks like a file name
		# ("name.ext": word characters and dashes, a dot, then an extension).
		# The scan also stops at the first false argument (e.g. end of @ARGV).
		while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
		# Redirect STDOUT to that name. The result of open is not checked, so a
		# failed open (including an empty $output) is silent.
		open STDOUT,">$output";

		@@ -185,6 +189,8 @@ sha256_block_data_order:
		#if __ARM_ARCH__>=7
		ldr r12,.LOPENSSL_armcap
		ldr r12,[r3,r12] @ OPENSSL_armcap_P
		tst r12,#8
		bne .LARMv8
		tst r12,#1
		bne .LNEON
		#endif
		@@ -241,6 +247,7 @@ $code.=<<___;
		moveq pc,lr @ be binary compatible with V4, yet
		bx lr @ interoperable with Thumb ISA:-)
		#endif
		.size sha256_block_data_order,.-sha256_block_data_order
		___
		######################################################################
		# NEON stuff
		@@ -418,7 +425,10 @@ sub body_00_15 () {
		$code.=<<___;
		#if __ARM_ARCH__>=7
		.fpu neon

		.type sha256_block_data_order_neon,%function
		.align 4
		sha256_block_data_order_neon:
		.LNEON:
		stmdb sp!,{r4-r12,lr}

		@@ -521,17 +531,120 @@ $code.=<<___;
		bne .L_00_48

		ldmia sp!,{r4-r12,pc}
		.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
		#endif
		___
		}}}
		######################################################################
		# ARMv8 stuff
		#
		{{{
		# q0..q2: the two state halves and a scratch copy of $ABCD taken before
		# each sha256h/sha256h2 pair.
		my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
		# q8..q11: the four message vectors, rotated by the generator loop.
		my @MSG=map("q$_",(8..11));
		# q12..q15: two alternating W+K temporaries and the saved state halves.
		my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
		# Pointer that walks the constant table; the generated code derives it
		# from r3 and rewinds it at the end of every block.
		my $Ktbl="r3";

		$code.=<<___;
		.size sha256_block_data_order,.-sha256_block_data_order
		.asciz "SHA256 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
		#if __ARM_ARCH__>=7
		.type sha256_block_data_order_armv8,%function
		.align 5
		sha256_block_data_order_armv8:
		.LARMv8:
		vld1.32 {$ABCD,$EFGH},[$ctx]
		sub $Ktbl,r3,#sha256_block_data_order-K256

		.Loop_v8:
		vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
		vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
		vld1.32 {$W0},[$Ktbl]!
		vrev32.8 @MSG[0],@MSG[0]
		vrev32.8 @MSG[1],@MSG[1]
		vrev32.8 @MSG[2],@MSG[2]
		vrev32.8 @MSG[3],@MSG[3]
		vmov $ABCD_SAVE,$ABCD @ offload
		vmov $EFGH_SAVE,$EFGH
		teq $inp,$len
		___
		# Generate the first 12 steps; each one loads the next constants, adds
		# them to a message vector, and issues both a schedule update and a
		# sha256h/sha256h2 pair. The last four steps are written out by hand
		# after this loop, without the schedule-update instructions.
		for($i=0;$i<12;$i++) {
		$code.=<<___;
		vld1.32 {$W1},[$Ktbl]!
		vadd.i32 $W0,$W0,@MSG[0]
		sha256su0 @MSG[0],@MSG[1]
		vmov $abcd,$ABCD
		sha256h $ABCD,$EFGH,$W0
		sha256h2 $EFGH,$abcd,$W0
		sha256su1 @MSG[0],@MSG[2],@MSG[3]
		___
		# Swap the alternating temporaries and rotate the message registers so
		# the same template serves the next step.
		($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
		}
		$code.=<<___;
		vld1.32 {$W1},[$Ktbl]!
		vadd.i32 $W0,$W0,@MSG[0]
		vmov $abcd,$ABCD
		sha256h $ABCD,$EFGH,$W0
		sha256h2 $EFGH,$abcd,$W0

		vld1.32 {$W0},[$Ktbl]!
		vadd.i32 $W1,$W1,@MSG[1]
		vmov $abcd,$ABCD
		sha256h $ABCD,$EFGH,$W1
		sha256h2 $EFGH,$abcd,$W1

		vld1.32 {$W1},[$Ktbl]
		vadd.i32 $W0,$W0,@MSG[2]
		sub $Ktbl,$Ktbl,#256-16 @ rewind
		vmov $abcd,$ABCD
		sha256h $ABCD,$EFGH,$W0
		sha256h2 $EFGH,$abcd,$W0

		vadd.i32 $W1,$W1,@MSG[3]
		vmov $abcd,$ABCD
		sha256h $ABCD,$EFGH,$W1
		sha256h2 $EFGH,$abcd,$W1

		vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
		vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
		bne .Loop_v8

		vst1.32 {$ABCD,$EFGH},[$ctx]

		bx lr
		.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
		#endif
		___
		}}}
		$code.=<<___;
		.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
		.align 2
		.comm OPENSSL_armcap_P,4,4
		___

		# NOTE(review): these three statements evaluate backtick expressions and
		# rewrite "bx lr" in the whole buffer, then print it in one go; the
		# foreach loop further down does the same work per line and prints
		# again. Presumably these are the removed side of this diff view;
		# confirm before treating the page as a runnable file.
		$code =~ s/\`([^\`]*)\`/eval $1/gem;
		$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
		print $code;
		# Base encodings of the SHA-256 crypto-extension instructions, with all
		# register fields zero. Kept private to unsha256 by the enclosing block.
		{ my %opcode = (
		"sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
		"sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );

		# Translate one SHA-256 instruction into a raw ".long" directive so the
		# output assembles with toolchains that do not know the mnemonics.
		#
		#   $mnemonic  key of %opcode, e.g. "sha256h"
		#   $arg       operand text, "qD,qM" or "qD,qN,qM"; whitespace after
		#              the commas is optional
		#
		# Returns the directive followed by the original instruction as an
		# assembler comment, or a false value when $arg does not match.
		sub unsha256 {
		    my ($mnemonic,$arg)=@_;

		    # $1 = destination, $2 = optional middle operand, $3 = last operand.
		    # Bits 0-2 and bit 3 of each register number go to separate fields.
		    $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o
		    &&
		    sprintf ".long\t0x%08x\t@ %s %s",
			$opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					  |(($2&7)<<17)|(($2&8)<<4)
					  |(($3&7)<<1) |(($3&8)<<2),
			$mnemonic,$arg;
		}
		}

		# Post-process the generated assembly line by line and print it.
		foreach (split($/,$code)) {

		# Replace each backtick-quoted expression with the result of eval'ing it.
		s/\`([^\`]*)\`/eval $1/geo;

		# Replace SHA-256 mnemonics with their raw encodings.
		s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

		s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4

		print $_,"\n";
		}

		close STDOUT; # enforce flush