Commit 46a2b338 authored by Andy Polyakov

sha512-ia64.pl: 15-20% performance improvement.

parent e09039c0
+182 −169
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -48,16 +48,22 @@
#     because on Itanium 1 stall on MM result is accompanied by
#     pipeline flush, which takes 6 cycles:-(
#
# Resulting performance numbers for 900MHz Itanium 2 system:
# June 2012
#
# The 'numbers' are in 1000s of bytes per second processed.
# type     16 bytes    64 bytes   256 bytes  1024 bytes  8192 bytes
# sha1(*)   6210.14k   20376.30k   52447.83k   85870.05k  105478.12k
# sha256    7476.45k   20572.05k   41538.34k   56062.29k   62093.18k
# sha512    4996.56k   20026.28k   47597.20k   85278.79k  111501.31k
# Improve performance by 15-20%. See the "rules of engagement"
# above. Contemporary cores are equipped with an additional shifter,
# so that they should perform even better than below, presumably
# by ~10%.
#
# (*) SHA1 numbers are for HP-UX compiler and are presented purely
#     for reference purposes. I bet it can be improved too...
######################################################################
# Current performance in cycles per processed byte for Itanium 2
# pre-9000 series [little-endian] system:
#
# SHA1(*)	5.7
# SHA256	12.6
# SHA512	6.7
#
# (*) SHA1 result is presented purely for reference purposes.
#
# To generate code, pass the file name with either 256 or 512 in its
# name and compiler flags.
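#
# For instance, a typical invocation might look like the following (an
# illustrative sketch only; the exact flags are supplied by the OpenSSL
# build system, not by this comment):
#
#	perl sha512-ia64.pl sha512-ia64.s $(CFLAGS)	# emit SHA-512 code
#	perl sha512-ia64.pl sha256-ia64.s $(CFLAGS)	# emit SHA-256 code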
@@ -106,8 +112,8 @@ if (!defined($big_endian))
             {	$big_endian=(unpack('L',pack('N',1))==1);  }

$code=<<___;
.ident  \"$output, version 1.1\"
.ident  \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
.ident  \"$output, version 2.0\"
.ident  \"IA-64 ISA artwork by Andy Polyakov <appro\@openssl.org>\"
.explicit
.text

@@ -115,26 +121,25 @@ pfssave=r2;
lcsave=r3;
prsave=r14;
K=r15;
A=r16;	B=r17;	C=r18;	D=r19;
E=r20;	F=r21;	G=r22;	H=r23;
A_=r16; B_=r17; C_=r18; D_=r19;
E_=r20; F_=r21; G_=r22; H_=r23;
T1=r24;	T2=r25;
s0=r26;	s1=r27;	t0=r28;	t1=r29;
Ktbl=r30;
ctx=r31;	// 1st arg
input=r48;	// 2nd arg
num=r49;	// 3rd arg
sgm0=r50;	sgm1=r51;	// small constants
A_=r54;	B_=r55;	C_=r56;	D_=r57;
E_=r58;	F_=r59;	G_=r60;	H_=r61;
input=r56;	// 2nd arg
num=r57;	// 3rd arg
sgm0=r58;	sgm1=r59;	// small constants

// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host])
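// Reference sketch (added for clarity, not part of the original file): the
// round loops below implement the standard FIPS 180-4 compression step
//
//	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
//	T2 = Sigma0(a) + Maj(a,b,c)
//	(a,b,c,d,e,f,g,h) = (T1+T2,a,b,c,d+T1,e,f,g)
//
// where, for SHA-512, Sigma1(x)=ROTR(x,14)^ROTR(x,18)^ROTR(x,41),
// Sigma0(x)=ROTR(x,28)^ROTR(x,34)^ROTR(x,39), and the message schedule uses
// sigma0(x)=ROTR(x,1)^ROTR(x,8)^(x>>7), sigma1(x)=ROTR(x,19)^ROTR(x,61)^(x>>6),
// matching the ROTR(...) annotations scattered through the code.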
.global	$func#
.proc	$func#
.align	32
.skip	16
$func:
	.prologue
	.save	ar.pfs,pfssave
{ .mmi;	alloc	pfssave=ar.pfs,3,27,0,16
{ .mmi;	alloc	pfssave=ar.pfs,3,25,0,24
	$ADDP	ctx=0,r32		// 1st arg
	.save	ar.lc,lcsave
	mov	lcsave=ar.lc	}
@@ -145,11 +150,9 @@ $func:

	.body
{ .mib;	add	r8=0*$SZ,ctx
	add	r9=1*$SZ,ctx
	brp.loop.imp	.L_first16,.L_first16_end-16	}
	add	r9=1*$SZ,ctx	}
{ .mib;	add	r10=2*$SZ,ctx
	add	r11=3*$SZ,ctx
	brp.loop.imp	.L_rest,.L_rest_end-16		};;
	add	r11=3*$SZ,ctx	};;

// load A-H
.Lpic_point:
@@ -164,7 +167,7 @@ $func:
	add	Ktbl=($TABLE#-.Lpic_point),Ktbl		}
{ .mmi;	$LDW	G_=[r10]
	$LDW	H_=[r11]
	cmp.ne	p0,p16=0,r0	};;	// used in sha256_block
	cmp.ne	p0,p16=0,r0	};;
___
$code.=<<___ if ($BITS==64);
{ .mii;	and	r8=7,input
@@ -179,50 +182,26 @@ $code.=<<___ if ($BITS==64);
___
$code.=<<___;
.L_outer:
.rotr	X[16]
{ .mmi;	mov	A=A_
	mov	B=B_
.rotr	R[8],X[16]
A=R[0]; B=R[1]; C=R[2]; D=R[3]; E=R[4]; F=R[5]; G=R[6]; H=R[7]
{ .mmi;	ld1	X[15]=[input],$SZ		// eliminated in sha512
	mov	A=A_
	mov	ar.lc=14	}
{ .mmi;	mov	C=C_
	mov	D=D_
	mov	E=E_		}
{ .mmi;	mov	F=F_
	mov	G=G_
	mov	ar.ec=2		}
{ .mmi;	ld1	X[15]=[input],$SZ		// eliminated in 64-bit
{ .mmi;	mov	B=B_
	mov	C=C_
	mov	D=D_		}
{ .mmi;	mov	E=E_
	mov	F=F_
	mov	ar.ec=2		};;
{ .mmi;	mov	G=G_
	mov	H=H_
	mov	sgm1=$sigma1[2]	};;

___
$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
.align	32
.L_first16:
{ .mmi;		add	r9=1-$SZ,input
		add	r10=2-$SZ,input
		add	r11=3-$SZ,input	};;
{ .mmi;		ld1	r9=[r9]
		ld1	r10=[r10]
		dep.z	$t1=E,32,32	}
{ .mmi;		$LDW	K=[Ktbl],$SZ
		ld1	r11=[r11]
		zxt4	E=E		};;
{ .mii;		or	$t1=$t1,E
		dep	X[15]=X[15],r9,8,8
		dep	r11=r10,r11,8,8	};;
{ .mmi;		and	T1=F,E
		and	T2=A,B
		dep	X[15]=X[15],r11,16,16	}
{ .mmi;		andcm	r8=G,E
		and	r9=A,C
		mux2	$t0=A,0x44	};;	// copy lower half to upper
{ .mmi;	(p16)	ld1	X[15-1]=[input],$SZ	// prefetch
		xor	T1=T1,r8		// T1=((e & f) ^ (~e & g))
		_rotr	r11=$t1,$Sigma1[0] }	// ROTR(e,14)
{ .mib;		and	r10=B,C
		xor	T2=T2,r9	};;
	mov	sgm1=$sigma1[2]	}
{ .mib;	mov	r8=0
	add	r9=1-$SZ,input
	brp.loop.imp	.L_first16,.L_first16_end-16	};;
___
$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
// in 64-bit mode I load whole X[16] at once and take care of alignment...
// in sha512 case I load whole X[16] at once and take care of alignment...
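// (Editorial note, not from the original: when the input pointer is off by N
// bytes, every 64-bit word straddles two aligned loads; the .L1byte-.L7byte
// paths below stitch each pair back together with a single shrp, using a
// shift count of 8*N or its complement 64-8*N depending on byte order.)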
{ .mmi;	add	r8=1*$SZ,input
	add	r9=2*$SZ,input
	add	r10=3*$SZ,input		};;
@@ -248,7 +227,9 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
	$LDW	X[ 2]=[r8],4*$SZ
(p15)	br.cond.dpnt.many	.L7byte	};;
{ .mmb;	$LDW	X[ 1]=[r9],4*$SZ
	$LDW	X[ 0]=[r10],4*$SZ
	$LDW	X[ 0]=[r10],4*$SZ	}
{ .mib;	mov	r8=0
	mux1	X[15]=X[15],\@rev		// eliminated on big-endian
	br.many	.L_first16		};;
.L1byte:
{ .mmi;	$LDW	X[13]=[r9],4*$SZ
@@ -281,7 +262,9 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
	shrp	X[ 3]=X[ 3],X[ 2],56	}
{ .mii;	shrp	X[ 2]=X[ 2],X[ 1],56
	shrp	X[ 1]=X[ 1],X[ 0],56	}
{ .mib;	shrp	X[ 0]=X[ 0],T1,56
{ .mib;	shrp	X[ 0]=X[ 0],T1,56	}
{ .mib;	mov	r8=0
	mux1	X[15]=X[15],\@rev		// eliminated on big-endian
	br.many	.L_first16		};;
.L2byte:
{ .mmi;	$LDW	X[11]=[input],4*$SZ
@@ -313,7 +296,9 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
	shrp	X[ 2]=X[ 2],X[ 1],48	}
{ .mii;	shrp	X[ 1]=X[ 1],X[ 0],48
	shrp	X[ 0]=X[ 0],T1,48	}
{ .mfb;	br.many	.L_first16		};;
{ .mib;	mov	r8=0
	mux1	X[15]=X[15],\@rev		// eliminated on big-endian
	br.many	.L_first16		};;
.L3byte:
{ .mmi;	$LDW	X[ 9]=[r9],4*$SZ
	$LDW	X[ 8]=[r10],4*$SZ
@@ -341,7 +326,9 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
	shrp	X[ 3]=X[ 3],X[ 2],40	}
{ .mii;	shrp	X[ 2]=X[ 2],X[ 1],40
	shrp	X[ 1]=X[ 1],X[ 0],40	}
{ .mib;	shrp	X[ 0]=X[ 0],T1,40
{ .mib;	shrp	X[ 0]=X[ 0],T1,40	}
{ .mib;	mov	r8=0
	mux1	X[15]=X[15],\@rev		// eliminated on big-endian
	br.many	.L_first16		};;
.L4byte:
{ .mmi;	$LDW	X[ 7]=[input],4*$SZ
@@ -369,7 +356,9 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
	shrp	X[ 2]=X[ 2],X[ 1],32	}
{ .mii;	shrp	X[ 1]=X[ 1],X[ 0],32
	shrp	X[ 0]=X[ 0],T1,32	}
{ .mfb;	br.many	.L_first16		};;
{ .mib;	mov	r8=0
	mux1	X[15]=X[15],\@rev		// eliminated on big-endian
	br.many	.L_first16		};;
.L5byte:
{ .mmi;	$LDW	X[ 5]=[r9],4*$SZ
	$LDW	X[ 4]=[r10],4*$SZ
@@ -393,7 +382,9 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
	shrp	X[ 3]=X[ 3],X[ 2],24	}
{ .mii;	shrp	X[ 2]=X[ 2],X[ 1],24
	shrp	X[ 1]=X[ 1],X[ 0],24	}
{ .mib;	shrp	X[ 0]=X[ 0],T1,24
{ .mib;	shrp	X[ 0]=X[ 0],T1,24	}
{ .mib;	mov	r8=0
	mux1	X[15]=X[15],\@rev		// eliminated on big-endian
	br.many	.L_first16		};;
.L6byte:
{ .mmi;	$LDW	X[ 3]=[input],4*$SZ
@@ -417,7 +408,9 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
	shrp	X[ 2]=X[ 2],X[ 1],16	}
{ .mii;	shrp	X[ 1]=X[ 1],X[ 0],16
	shrp	X[ 0]=X[ 0],T1,16	}
{ .mfb;	br.many	.L_first16		};;
{ .mib;	mov	r8=0
	mux1	X[15]=X[15],\@rev		// eliminated on big-endian
	br.many	.L_first16		};;
.L7byte:
{ .mmi;	$LDW	X[ 1]=[r9],4*$SZ
	$LDW	X[ 0]=[r10],4*$SZ
@@ -437,89 +430,113 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
	shrp	X[ 3]=X[ 3],X[ 2],8	}
{ .mii;	shrp	X[ 2]=X[ 2],X[ 1],8
	shrp	X[ 1]=X[ 1],X[ 0],8	}
{ .mib;	shrp	X[ 0]=X[ 0],T1,8
	br.many	.L_first16		};;
{ .mib;	shrp	X[ 0]=X[ 0],T1,8	}
{ .mib;	mov	r8=0
	mux1	X[15]=X[15],\@rev	};;	// eliminated on big-endian

.align	32
.L_first16:
{ .mmi;		$LDW	K=[Ktbl],$SZ
		and	T1=F,E
		and	T2=A,B		}
{ .mmi;		//$LDW	X[15]=[input],$SZ	// X[i]=*input++
		add	A=A,r8			// H+=Sigma(0) from the past
		_rotr	r10=$t1,$Sigma1[0]  }	// ROTR(e,14)
{ .mmi;		and	T1=F,E
		andcm	r8=G,E
		and	r9=A,C		};;
	(p16)	mux1	X[14]=X[14],\@rev   };;	// eliminated on big-endian
{ .mmi;		and	T2=A,B
		and	r9=A,C
		_rotr	r11=$t1,$Sigma1[1]  }	// ROTR(e,18)
{ .mmi;		xor	T1=T1,r8		// T1=((e & f) ^ (~e & g))
		and	r10=B,C
		_rotr	r11=$t1,$Sigma1[0] }	// ROTR(e,14)
{ .mmi;		xor	T2=T2,r9
		mux1	X[15]=X[15],\@rev };;	// eliminated in big-endian
		and	r8=B,C		    };;
___
$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
.align	32
.L_first16:
{ .mmi;		add	A=A,r8			// H+=Sigma(0) from the past
		add	r10=2-$SZ,input
		add	r11=3-$SZ,input	};;
{ .mmi;		ld1	r9=[r9]
		ld1	r10=[r10]
		dep.z	$t1=E,32,32	}
{ .mmi;		ld1	r11=[r11]
		$LDW	K=[Ktbl],$SZ
		zxt4	E=E		};;
{ .mii;		or	$t1=$t1,E
		dep	X[15]=X[15],r9,8,8
		mux2	$t0=A,0x44	};;	// copy lower half to upper
{ .mmi;		and	T1=F,E
		andcm	r8=G,E
		dep	r11=r10,r11,8,8	};;
{ .mmi;		and	T2=A,B
		and	r9=A,C
		dep	X[15]=X[15],r11,16,16	};;
{ .mmi;	(p16)	ld1	X[15-1]=[input],$SZ	// prefetch
		xor	T1=T1,r8		// T1=((e & f) ^ (~e & g))
		_rotr	r10=$t1,$Sigma1[0] }	// ROTR(e,14)
{ .mmi;		and	r8=B,C
		_rotr	r11=$t1,$Sigma1[1] };;	// ROTR(e,18)
___
$code.=<<___;
{ .mib;		add	T1=T1,H			// T1=Ch(e,f,g)+h
		_rotr	r8=$t1,$Sigma1[1] }	// ROTR(e,18)
{ .mib;		xor	T2=T2,r10		// T2=((a & b) ^ (a & c) ^ (b & c))
		mov	H=G		};;
{ .mib;		xor	r11=r8,r11
		_rotr	r9=$t1,$Sigma1[2] }	// ROTR(e,41)
{ .mib;		mov	G=F
		mov	F=E		};;
{ .mib;		xor	r9=r9,r11		// r9=Sigma1(e)
		_rotr	r10=$t0,$Sigma0[0] }	// ROTR(a,28)
{ .mib;		add	T1=T1,K			// T1=Ch(e,f,g)+h+K512[i]
		mov	E=D		};;
{ .mib;		add	T1=T1,r9		// T1+=Sigma1(e)
		_rotr	r11=$t0,$Sigma0[1] }	// ROTR(a,34)
{ .mib;		mov	D=C
		mov	C=B		};;
{ .mib;		add	T1=T1,X[15]		// T1+=X[i]
		_rotr	r8=$t0,$Sigma0[2] }	// ROTR(a,39)
{ .mib;		xor	r10=r10,r11
		mux2	X[15]=X[15],0x44 };;	// eliminated in 64-bit
{ .mmi;		xor	r10=r8,r10		// r10=Sigma0(a)
		mov	B=A
		add	A=T1,T2		};;
{ .mib;		add	E=E,T1
		add	A=A,r10			// T2=Maj(a,b,c)+Sigma0(a)
{ .mmi;		add	T1=T1,H			// T1=Ch(e,f,g)+h
		xor	r10=r10,r11
		_rotr	r11=$t1,$Sigma1[2]  }	// ROTR(e,41)
{ .mmi;		xor	T2=T2,r9
		add	K=K,X[15]	    };;
{ .mmi;		add	T1=T1,K			// T1+=K[i]+X[i]
		xor	T2=T2,r8		// T2=((a & b) ^ (a & c) ^ (b & c))
		_rotr	r8=$t0,$Sigma0[0]   }	// ROTR(a,28)
{ .mmi;		xor	r11=r11,r10		// Sigma1(e)
		_rotr	r9=$t0,$Sigma0[1]   };;	// ROTR(a,34)
{ .mmi;		add	T1=T1,r11		// T1+=Sigma1(e)
		xor	r8=r8,r9
		_rotr	r9=$t0,$Sigma0[2]   };;	// ROTR(a,39)
{ .mmi;		xor	r8=r8,r9		// Sigma0(a)
		add	D=D,T1
		mux2	H=X[15],0x44	    }	// mov H=X[15] in sha512
{ .mib;	(p16)	add	r9=1-$SZ,input		// not used in sha512
		add	X[15]=T1,T2		// H=T1+Maj(a,b,c)
	br.ctop.sptk	.L_first16	    };;
.L_first16_end:

{ .mii;	mov	ar.lc=$rounds-17
	mov	ar.ec=1			};;
{ .mib;	mov	ar.lc=$rounds-17
	brp.loop.imp	.L_rest,.L_rest_end-16		}
{ .mib;	mov	ar.ec=1
	br.many	.L_rest			};;

.align	32
.L_rest:
.rotr	X[16]
{ .mib;		$LDW	K=[Ktbl],$SZ
{ .mmi;		$LDW	K=[Ktbl],$SZ
		add	A=A,r8			// H+=Sigma0(a) from the past
		_rotr	r8=X[15-1],$sigma0[0] }	// ROTR(s0,1)
{ .mib; 	$ADD	X[15]=X[15],X[15-9]	// X[i&0xF]+=X[(i+9)&0xF]
{ .mmi; 	add	X[15]=X[15],X[15-9]	// X[i&0xF]+=X[(i+9)&0xF]
		$SHRU	s0=X[15-1],sgm0	    };;	// s0=X[(i+1)&0xF]>>7
{ .mib;		and	T1=F,E
		_rotr	r9=X[15-1],$sigma0[1] }	// ROTR(s0,8)
{ .mib;		andcm	r10=G,E
		$SHRU	s1=X[15-14],sgm1    };;	// s1=X[(i+14)&0xF]>>6
// Pair of mmi; splits on Itanium 1 and prevents pipeline flush
// upon $SHRU output usage
{ .mmi;		xor	T1=T1,r10		// T1=((e & f) ^ (~e & g))
		xor	r9=r8,r9
		_rotr	r10=X[15-14],$sigma1[0] };;// ROTR(s1,19)
{ .mib;		and	T2=A,B		
		_rotr	r11=X[15-14],$sigma1[1] }// ROTR(s1,61)
{ .mib;		and	r8=A,C		};;
		_rotr	r10=X[15-14],$sigma1[0] }// ROTR(s1,19)
{ .mmi;		and	T2=A,B
		and	r8=A,C
		_rotr	r11=X[15-14],$sigma1[1] };;// ROTR(s1,61)
___
$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
// I adhere to mmi; in order to hold Itanium 1 back and avoid 6 cycle
// pipeline flush in last bundle. Note that even on Itanium2 the
// latter stalls for one clock cycle...
{ .mmi;		xor	s0=s0,r9		// s0=sigma0(X[(i+1)&0xF])
{ .mib;		xor	s0=s0,r9		// s0=sigma0(X[(i+1)&0xF])
		dep.z	$t1=E,32,32	    }
{ .mmi;		xor	r10=r11,r10
{ .mib;		xor	r10=r11,r10
		zxt4	E=E		    };;
{ .mmi;		or	$t1=$t1,E
		xor	s1=s1,r10		// s1=sigma1(X[(i+14)&0xF])
{ .mii;		xor	s1=s1,r10		// s1=sigma1(X[(i+14)&0xF])
		shrp	r9=E,$t1,32+$Sigma1[0]	// ROTR(e,14)
		mux2	$t0=A,0x44	    };;	// copy lower half to upper
// Pair of mmi; splits on Itanium 1 and prevents pipeline flush
// upon mux2 output usage
{ .mmi;		xor	T2=T2,r8
		_rotr	r9=$t1,$Sigma1[0] }	// ROTR(e,14)
		shrp	r8=E,$t1,32+$Sigma1[1]}	// ROTR(e,18)
{ .mmi;		and	r10=B,C
		add	T1=T1,H			// T1=Ch(e,f,g)+h
		$ADD	X[15]=X[15],s0	};;	// X[i&0xF]+=sigma0(X[(i+1)&0xF])
		or	$t1=$t1,E   	    };;
___
$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
{ .mib;		xor	s0=s0,r9		// s0=sigma0(X[(i+1)&0xF])
@@ -527,38 +544,32 @@ $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
{ .mib;		xor	r10=r11,r10
		xor	T2=T2,r8	    };;
{ .mib;		xor	s1=s1,r10		// s1=sigma1(X[(i+14)&0xF])
		add	T1=T1,H		}
		_rotr	r8=$t1,$Sigma1[1]   }	// ROTR(e,18)
{ .mib;		and	r10=B,C
		$ADD	X[15]=X[15],s0	};;	// X[i&0xF]+=sigma0(X[(i+1)&0xF])
		add	T1=T1,H		    };;	// T1+=H
___
$code.=<<___;
{ .mmi;		xor	T2=T2,r10		// T2=((a & b) ^ (a & c) ^ (b & c))
		mov	H=G
		_rotr	r8=$t1,$Sigma1[1] };;	// ROTR(e,18)
{ .mmi;		xor	r11=r8,r9
		$ADD	X[15]=X[15],s1		// X[i&0xF]+=sigma1(X[(i+14)&0xF])
		_rotr	r9=$t1,$Sigma1[2] }	// ROTR(e,41)
{ .mmi;		mov	G=F
		mov	F=E		};;
{ .mib;		xor	r9=r9,r11		// r9=Sigma1(e)
		_rotr	r10=$t0,$Sigma0[0] }	// ROTR(a,28)
{ .mib;		add	T1=T1,K			// T1=Ch(e,f,g)+h+K512[i]
		mov	E=D		};;
{ .mib;		add	T1=T1,r9		// T1+=Sigma1(e)
		_rotr	r11=$t0,$Sigma0[1] }	// ROTR(a,34)
{ .mib;		mov	D=C
		mov	C=B		};;
{ .mmi;		add	T1=T1,X[15]		// T1+=X[i]
		xor	r10=r10,r11
		_rotr	r8=$t0,$Sigma0[2] };;	// ROTR(a,39)
{ .mmi;		xor	r10=r8,r10		// r10=Sigma0(a)
		mov	B=A
		add	A=T1,T2		};;
{ .mib;		add	E=E,T1
		add	A=A,r10			// T2=Maj(a,b,c)+Sigma0(a)
{ .mib;		xor	r9=r9,r8
		_rotr	r8=$t1,$Sigma1[2]   }	// ROTR(e,41)
{ .mib;		xor	T2=T2,r10		// T2=((a & b) ^ (a & c) ^ (b & c))
		add	X[15]=X[15],s0	    };;	// X[i]+=sigma0(X[i+1])
{ .mmi;		xor	r9=r9,r8		// Sigma1(e)
		add	X[15]=X[15],s1		// X[i]+=sigma1(X[i+14])
		_rotr	r8=$t0,$Sigma0[0]   };;	// ROTR(a,28)
{ .mmi;		add	K=K,X[15]
		add	T1=T1,r9		// T1+=Sigma1(e)
		_rotr	r9=$t0,$Sigma0[1]   };;	// ROTR(a,34)
{ .mmi;		add	T1=T1,K			// T1+=K[i]+X[i]
		xor	r8=r8,r9
		_rotr	r9=$t0,$Sigma0[2]   };;	// ROTR(a,39)
{ .mib;		add	D=D,T1
		mux2	H=X[15],0x44	    }	// mov H=X[15] in sha512
{ .mib;		xor	r8=r8,r9		// Sigma0(a)
		add	X[15]=T1,T2		// H=T1+Maj(a,b,c)
	br.ctop.sptk	.L_rest		    };;
.L_rest_end:

{ .mmi;	add	A=A,r8			};;	// H+=Sigma0(a) from the past
{ .mmi;	add	A_=A_,A
	add	B_=B_,B
	add	C_=C_,C			}
@@ -590,17 +601,19 @@ $code.=<<___;
.endp	$func#
___
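# A sketch of what the substitutions below achieve (illustrative only): the
# _rotr pseudo-op is rewritten into a genuine IA-64 shift-right-pair whose two
# sources are the same register, i.e. a rotate:
#
#	_rotr	r8=E,14		->	shrp	r8=E,E,14	# ROTR(E,14)
#
# In 64-bit (SHA-512) mode mux2 degenerates into a plain mov, and on
# big-endian targets the byte-reversing mux1 is not needed and becomes a nop.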

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
foreach(split($/,$code)) {
    s/\`([^\`]*)\`/eval $1/gem;
    s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
    if ($BITS==64) {
    $code =~ s/mux2(\s+)\S+/nop.i$1 0x0/gm;
    $code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm	if ($big_endian);
    $code =~ s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm
	s/mux2(\s+)([^=]+)=([^,]+),\S+/mov$1 $2=$3/gm;
	s/mux1(\s+)\S+/nop.i$1 0x0/gm	if ($big_endian);
	s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm
    						if (!$big_endian);
    $code =~ s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm;
	s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm;
    }

print $code;
    print $_,"\n";
}

print<<___ if ($BITS==32);
.align	64