Sync ASM/modes to add CCM and XTS modes and assembly language optimisation (dc01af77) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/modes/Makefile

+67 −8

Original line number	Diff line number	Diff line
		@@ -10,15 +10,21 @@ CFLAG=-g
		MAKEFILE= Makefile
		AR= ar r

		MODES_ASM_OBJ=

		CFLAGS= $(INCLUDES) $(CFLAG)
		ASFLAGS= $(INCLUDES) $(ASFLAG)
		AFLAGS= $(ASFLAGS)

		GENERAL=Makefile
		TEST=
		APPS=

		LIB=$(TOP)/libcrypto.a
		LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c
		LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o
		LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \
		ccm128.c xts128.c
		LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o \
		ccm128.o xts128.o $(MODES_ASM_OBJ)

		SRC= $(LIBSRC)

		@@ -38,6 +44,24 @@ lib: $(LIBOBJ)
		$(RANLIB) $(LIB) \|\| echo Never mind.
		@touch lib

		# Generate the per-architecture GHASH assembler sources from the perlasm
		# scripts in asm/.  Scripts that accept an output file name are handed $@
		# as an argument; the others write to stdout, which is redirected to $@.
		# The Alpha output is additionally run through the C preprocessor
		# ($(CC) -E) before being stored; "tee $@ > /dev/null" writes the file
		# at the end of that pipeline.
		ghash-ia64.s: asm/ghash-ia64.pl
		$(PERL) asm/ghash-ia64.pl $@ $(CFLAGS)
		ghash-x86.s: asm/ghash-x86.pl
		$(PERL) asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
		ghash-x86_64.s: asm/ghash-x86_64.pl
		$(PERL) asm/ghash-x86_64.pl $(PERLASM_SCHEME) > $@
		ghash-sparcv9.s: asm/ghash-sparcv9.pl
		$(PERL) asm/ghash-sparcv9.pl $@ $(CFLAGS)
		ghash-alpha.s: asm/ghash-alpha.pl
		$(PERL) $< | $(CC) -E - | tee $@ > /dev/null
		ghash-parisc.s: asm/ghash-parisc.pl
		$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@

		# GNU make "catch all"
		ghash-%.S: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@

		ghash-armv4.o: ghash-armv4.S

		files:
		$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO

		@@ -71,12 +95,47 @@ dclean:
		mv -f Makefile.new $(MAKEFILE)

		# Remove generated assembler (*.s), objects and editor/backup residue
		# from this directory and its immediate subdirectories.
		clean:
		rm -f *.s *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff

		# DO NOT DELETE THIS LINE -- make depend depends on it.

		cbc128.o: cbc128.c modes.h
		cfb128.o: cfb128.c modes.h
		ctr128.o: ctr128.c modes.h
		cts128.o: cts128.c modes.h
		ofb128.o: modes.h ofb128.c
		cbc128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
		cbc128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
		cbc128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
		cbc128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
		cbc128.o: ../../include/openssl/symhacks.h cbc128.c modes_lcl.h
		ccm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
		ccm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
		ccm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
		ccm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
		ccm128.o: ../../include/openssl/symhacks.h ccm128.c modes_lcl.h
		cfb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
		cfb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
		cfb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
		cfb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
		cfb128.o: ../../include/openssl/symhacks.h cfb128.c modes_lcl.h
		ctr128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
		ctr128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
		ctr128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
		ctr128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
		ctr128.o: ../../include/openssl/symhacks.h ctr128.c modes_lcl.h
		cts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
		cts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
		cts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
		cts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
		cts128.o: ../../include/openssl/symhacks.h cts128.c modes_lcl.h
		gcm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
		gcm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
		gcm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
		gcm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
		gcm128.o: ../../include/openssl/symhacks.h gcm128.c modes_lcl.h
		ofb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
		ofb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
		ofb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
		ofb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
		ofb128.o: ../../include/openssl/symhacks.h modes_lcl.h ofb128.c
		xts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
		xts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
		xts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
		xts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
		xts128.o: ../../include/openssl/symhacks.h modes_lcl.h xts128.c

crypto/modes/asm/ghash-alpha.pl

0 → 100644

+451 −0

Original line number	Diff line number	Diff line
		#!/usr/bin/env perl
		#
		# ====================================================================
		# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
		# project. The module is, however, dual licensed under OpenSSL and
		# CRYPTOGAMS licenses depending on where you obtain it. For further
		# details see http://www.openssl.org/~appro/cryptogams/.
		# ====================================================================
		#
		# March 2010
		#
		# The module implements "4-bit" GCM GHASH function and underlying
		# single multiplication operation in GF(2^128). "4-bit" means that it
		# uses 256 bytes per-key table [+128 bytes shared table]. Even though
		# loops are aggressively modulo-scheduled in respect to references to
		# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
		# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
		# scheduling "glitch," because uprofile(1) indicates uniform sample
		# distribution, as if all instruction bundles execute in 1.5 cycles.
		# Meaning that it could have been even faster, yet 12 cycles is ~60%
		# better than gcc-generated code and ~80% than code generated by vendor
		# compiler.

		$cnt="v0"; # $0
		$t0="t0";
		$t1="t1";
		$t2="t2";
		$Thi0="t3"; # $4
		$Tlo0="t4";
		$Thi1="t5";
		$Tlo1="t6";
		$rem="t7"; # $8
		#################
		$Xi="a0"; # $16, input argument block
		$Htbl="a1";
		$inp="a2";
		$len="a3";
		$nlo="a4"; # $20
		$nhi="a5";
		$Zhi="t8";
		$Zlo="t9";
		$Xhi="t10"; # $24
		$Xlo="t11";
		$remp="t12";
		$rem_4bit="AT"; # $28

		{ my $N;
		sub loop() {

		$N++;
		$code.=<<___;
		.align 4
		extbl $Xlo,7,$nlo
		and $nlo,0xf0,$nhi
		sll $nlo,4,$nlo
		and $nlo,0xf0,$nlo

		addq $nlo,$Htbl,$nlo
		ldq $Zlo,8($nlo)
		addq $nhi,$Htbl,$nhi
		ldq $Zhi,0($nlo)

		and $Zlo,0x0f,$remp
		sll $Zhi,60,$t0
		lda $cnt,6(zero)
		extbl $Xlo,6,$nlo

		ldq $Tlo1,8($nhi)
		s8addq $remp,$rem_4bit,$remp
		ldq $Thi1,0($nhi)
		srl $Zlo,4,$Zlo

		ldq $rem,0($remp)
		srl $Zhi,4,$Zhi
		xor $t0,$Zlo,$Zlo
		and $nlo,0xf0,$nhi

		xor $Tlo1,$Zlo,$Zlo
		sll $nlo,4,$nlo
		xor $Thi1,$Zhi,$Zhi
		and $nlo,0xf0,$nlo

		addq $nlo,$Htbl,$nlo
		ldq $Tlo0,8($nlo)
		addq $nhi,$Htbl,$nhi
		ldq $Thi0,0($nlo)

		.Looplo$N:
		and $Zlo,0x0f,$remp
		sll $Zhi,60,$t0
		subq $cnt,1,$cnt
		srl $Zlo,4,$Zlo

		ldq $Tlo1,8($nhi)
		xor $rem,$Zhi,$Zhi
		ldq $Thi1,0($nhi)
		s8addq $remp,$rem_4bit,$remp

		ldq $rem,0($remp)
		srl $Zhi,4,$Zhi
		xor $t0,$Zlo,$Zlo
		extbl $Xlo,$cnt,$nlo

		and $nlo,0xf0,$nhi
		xor $Thi0,$Zhi,$Zhi
		xor $Tlo0,$Zlo,$Zlo
		sll $nlo,4,$nlo


		and $Zlo,0x0f,$remp
		sll $Zhi,60,$t0
		and $nlo,0xf0,$nlo
		srl $Zlo,4,$Zlo

		s8addq $remp,$rem_4bit,$remp
		xor $rem,$Zhi,$Zhi
		addq $nlo,$Htbl,$nlo
		addq $nhi,$Htbl,$nhi

		ldq $rem,0($remp)
		srl $Zhi,4,$Zhi
		ldq $Tlo0,8($nlo)
		xor $t0,$Zlo,$Zlo

		xor $Tlo1,$Zlo,$Zlo
		xor $Thi1,$Zhi,$Zhi
		ldq $Thi0,0($nlo)
		bne $cnt,.Looplo$N


		and $Zlo,0x0f,$remp
		sll $Zhi,60,$t0
		lda $cnt,7(zero)
		srl $Zlo,4,$Zlo

		ldq $Tlo1,8($nhi)
		xor $rem,$Zhi,$Zhi
		ldq $Thi1,0($nhi)
		s8addq $remp,$rem_4bit,$remp

		ldq $rem,0($remp)
		srl $Zhi,4,$Zhi
		xor $t0,$Zlo,$Zlo
		extbl $Xhi,$cnt,$nlo

		and $nlo,0xf0,$nhi
		xor $Thi0,$Zhi,$Zhi
		xor $Tlo0,$Zlo,$Zlo
		sll $nlo,4,$nlo

		and $Zlo,0x0f,$remp
		sll $Zhi,60,$t0
		and $nlo,0xf0,$nlo
		srl $Zlo,4,$Zlo

		s8addq $remp,$rem_4bit,$remp
		xor $rem,$Zhi,$Zhi
		addq $nlo,$Htbl,$nlo
		addq $nhi,$Htbl,$nhi

		ldq $rem,0($remp)
		srl $Zhi,4,$Zhi
		ldq $Tlo0,8($nlo)
		xor $t0,$Zlo,$Zlo

		xor $Tlo1,$Zlo,$Zlo
		xor $Thi1,$Zhi,$Zhi
		ldq $Thi0,0($nlo)
		unop


		.Loophi$N:
		and $Zlo,0x0f,$remp
		sll $Zhi,60,$t0
		subq $cnt,1,$cnt
		srl $Zlo,4,$Zlo

		ldq $Tlo1,8($nhi)
		xor $rem,$Zhi,$Zhi
		ldq $Thi1,0($nhi)
		s8addq $remp,$rem_4bit,$remp

		ldq $rem,0($remp)
		srl $Zhi,4,$Zhi
		xor $t0,$Zlo,$Zlo
		extbl $Xhi,$cnt,$nlo

		and $nlo,0xf0,$nhi
		xor $Thi0,$Zhi,$Zhi
		xor $Tlo0,$Zlo,$Zlo
		sll $nlo,4,$nlo


		and $Zlo,0x0f,$remp
		sll $Zhi,60,$t0
		and $nlo,0xf0,$nlo
		srl $Zlo,4,$Zlo

		s8addq $remp,$rem_4bit,$remp
		xor $rem,$Zhi,$Zhi
		addq $nlo,$Htbl,$nlo
		addq $nhi,$Htbl,$nhi

		ldq $rem,0($remp)
		srl $Zhi,4,$Zhi
		ldq $Tlo0,8($nlo)
		xor $t0,$Zlo,$Zlo

		xor $Tlo1,$Zlo,$Zlo
		xor $Thi1,$Zhi,$Zhi
		ldq $Thi0,0($nlo)
		bne $cnt,.Loophi$N


		and $Zlo,0x0f,$remp
		sll $Zhi,60,$t0
		srl $Zlo,4,$Zlo

		ldq $Tlo1,8($nhi)
		xor $rem,$Zhi,$Zhi
		ldq $Thi1,0($nhi)
		s8addq $remp,$rem_4bit,$remp

		ldq $rem,0($remp)
		srl $Zhi,4,$Zhi
		xor $t0,$Zlo,$Zlo

		xor $Tlo0,$Zlo,$Zlo
		xor $Thi0,$Zhi,$Zhi

		and $Zlo,0x0f,$remp
		sll $Zhi,60,$t0
		srl $Zlo,4,$Zlo

		s8addq $remp,$rem_4bit,$remp
		xor $rem,$Zhi,$Zhi

		ldq $rem,0($remp)
		srl $Zhi,4,$Zhi
		xor $Tlo1,$Zlo,$Zlo
		xor $Thi1,$Zhi,$Zhi
		xor $t0,$Zlo,$Zlo
		xor $rem,$Zhi,$Zhi
		___
		}}

		$code=<<___;
		#ifdef __linux__
		#include <asm/regdef.h>
		#else
		#include <asm.h>
		#include <regdef.h>
		#endif

		.text

		.set noat
		.set noreorder
		.globl gcm_gmult_4bit
		.align 4
		.ent gcm_gmult_4bit
		gcm_gmult_4bit:
		.frame sp,0,ra
		.prologue 0

		ldq $Xlo,8($Xi)
		ldq $Xhi,0($Xi)

		br $rem_4bit,.Lpic1
		.Lpic1: lda $rem_4bit,rem_4bit-.Lpic1($rem_4bit)
		___

		&loop();

		$code.=<<___;
		srl $Zlo,24,$t0 # byte swap
		srl $Zlo,8,$t1

		sll $Zlo,8,$t2
		sll $Zlo,24,$Zlo
		zapnot $t0,0x11,$t0
		zapnot $t1,0x22,$t1

		zapnot $Zlo,0x88,$Zlo
		or $t0,$t1,$t0
		zapnot $t2,0x44,$t2

		or $Zlo,$t0,$Zlo
		srl $Zhi,24,$t0
		srl $Zhi,8,$t1

		or $Zlo,$t2,$Zlo
		sll $Zhi,8,$t2
		sll $Zhi,24,$Zhi

		srl $Zlo,32,$Xlo
		sll $Zlo,32,$Zlo

		zapnot $t0,0x11,$t0
		zapnot $t1,0x22,$t1
		or $Zlo,$Xlo,$Xlo

		zapnot $Zhi,0x88,$Zhi
		or $t0,$t1,$t0
		zapnot $t2,0x44,$t2

		or $Zhi,$t0,$Zhi
		or $Zhi,$t2,$Zhi

		srl $Zhi,32,$Xhi
		sll $Zhi,32,$Zhi

		or $Zhi,$Xhi,$Xhi
		stq $Xlo,8($Xi)
		stq $Xhi,0($Xi)

		ret (ra)
		.end gcm_gmult_4bit
		___

		$inhi="s0";
		$inlo="s1";

		$code.=<<___;
		.globl gcm_ghash_4bit
		.align 4
		.ent gcm_ghash_4bit
		gcm_ghash_4bit:
		lda sp,-32(sp)
		stq ra,0(sp)
		stq s0,8(sp)
		stq s1,16(sp)
		.mask 0x04000600,-32
		.frame sp,32,ra
		.prologue 0

		ldq_u $inhi,0($inp)
		ldq_u $Thi0,7($inp)
		ldq_u $inlo,8($inp)
		ldq_u $Tlo0,15($inp)
		ldq $Xhi,0($Xi)
		ldq $Xlo,8($Xi)

		br $rem_4bit,.Lpic2
		.Lpic2: lda $rem_4bit,rem_4bit-.Lpic2($rem_4bit)

		.Louter:
		extql $inhi,$inp,$inhi
		extqh $Thi0,$inp,$Thi0
		or $inhi,$Thi0,$inhi
		lda $inp,16($inp)

		extql $inlo,$inp,$inlo
		extqh $Tlo0,$inp,$Tlo0
		or $inlo,$Tlo0,$inlo
		subq $len,16,$len

		xor $Xlo,$inlo,$Xlo
		xor $Xhi,$inhi,$Xhi
		___

		&loop();

		# Tail of gcm_ghash_4bit, appended after the expanded &loop() body:
		#  - byte-swap Zlo/Zhi into Xlo/Xhi;
		#  - if $len is not yet zero, start loading the next input block and
		#    branch back to .Louter; otherwise fall into .Ldone, store Xi,
		#    restore s0/s1, pop the frame and return;
		#  - emit the rem_4bit table referenced by both entry points.
		# The reload of ra is intentionally left as a C-style comment; the
		# generated text also contains #include directives, so it is expected to
		# be run through the C preprocessor before being assembled.
		$code.=<<___;
		srl $Zlo,24,$t0 # byte swap
		srl $Zlo,8,$t1

		sll $Zlo,8,$t2
		sll $Zlo,24,$Zlo
		zapnot $t0,0x11,$t0
		zapnot $t1,0x22,$t1

		zapnot $Zlo,0x88,$Zlo
		or $t0,$t1,$t0
		zapnot $t2,0x44,$t2

		or $Zlo,$t0,$Zlo
		srl $Zhi,24,$t0
		srl $Zhi,8,$t1

		or $Zlo,$t2,$Zlo
		sll $Zhi,8,$t2
		sll $Zhi,24,$Zhi

		srl $Zlo,32,$Xlo
		sll $Zlo,32,$Zlo
		beq $len,.Ldone

		zapnot $t0,0x11,$t0
		zapnot $t1,0x22,$t1
		or $Zlo,$Xlo,$Xlo
		ldq_u $inhi,0($inp)

		zapnot $Zhi,0x88,$Zhi
		or $t0,$t1,$t0
		zapnot $t2,0x44,$t2
		ldq_u $Thi0,7($inp)

		or $Zhi,$t0,$Zhi
		or $Zhi,$t2,$Zhi
		ldq_u $inlo,8($inp)
		ldq_u $Tlo0,15($inp)

		srl $Zhi,32,$Xhi
		sll $Zhi,32,$Zhi

		or $Zhi,$Xhi,$Xhi
		br zero,.Louter

		.Ldone:
		zapnot $t0,0x11,$t0
		zapnot $t1,0x22,$t1
		or $Zlo,$Xlo,$Xlo

		zapnot $Zhi,0x88,$Zhi
		or $t0,$t1,$t0
		zapnot $t2,0x44,$t2

		or $Zhi,$t0,$Zhi
		or $Zhi,$t2,$Zhi

		srl $Zhi,32,$Xhi
		sll $Zhi,32,$Zhi

		or $Zhi,$Xhi,$Xhi

		stq $Xlo,8($Xi)
		stq $Xhi,0($Xi)

		.set noreorder
		/*ldq ra,0(sp)*/
		ldq s0,8(sp)
		ldq s1,16(sp)
		lda sp,32(sp)
		ret (ra)
		.end gcm_ghash_4bit

		.align 4
		rem_4bit:
		.quad 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
		.quad 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
		.quad 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
		.quad 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
		.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
		.align 4

		___
		# Emit the generated assembler.  An optional first command-line argument
		# names the output file; without one the text goes to the inherited
		# STDOUT (which is how a caller can pipe it into another program).
		$output=shift;
		if ($output) {
			# Three-argument open keeps the file name from being parsed as a mode,
			# and a failure must stop the build rather than go unnoticed.
			open STDOUT,">",$output or die "can't open $output: $!";
		}
		print $code;
		# close flushes buffered data; report write errors such as a full disk.
		close STDOUT or die "error writing output: $!";

crypto/modes/asm/ghash-armv4.pl

0 → 100644

+429 −0

Original line number	Diff line number	Diff line
		#!/usr/bin/env perl
		#
		# ====================================================================
		# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
		# project. The module is, however, dual licensed under OpenSSL and
		# CRYPTOGAMS licenses depending on where you obtain it. For further
		# details see http://www.openssl.org/~appro/cryptogams/.
		# ====================================================================
		#
		# April 2010
		#
		# The module implements "4-bit" GCM GHASH function and underlying
		# single multiplication operation in GF(2^128). "4-bit" means that it
		# uses 256 bytes per-key table [+32 bytes shared table]. There is no
		# experimental performance data available yet. The only approximation
		# that can be made at this point is based on code size. Inner loop is
		# 32 instructions long and on single-issue core should execute in <40
		# cycles. Having verified that gcc 3.4 didn't unroll corresponding
		# loop, this assembler loop body was found to be ~3x smaller than
		# compiler-generated one...
		#
		# July 2010
		#
		# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
		# Cortex A8 core and ~25 cycles per processed byte (which was observed
		# to be ~3 times faster than gcc-generated code:-)
		#
		# February 2011
		#
		# Profiler-assisted and platform-specific optimization resulted in 7%
		# improvement on Cortex A8 core and ~23.5 cycles per byte.
		#
		# March 2011
		#
		# Add NEON implementation featuring polynomial multiplication, i.e. no
		# lookup tables involved. On Cortex A8 it was measured to process one
		# byte in 15 cycles or 55% faster than integer-only code.

		# ====================================================================
		# Note about "528B" variant. In ARM case it makes lesser sense to
		# implement it for following reasons:
		#
		# - performance improvement won't be anywhere near 50%, because 128-
		# bit shift operation is neatly fused with 128-bit xor here, and
		# "528B" variant would eliminate only 4-5 instructions out of 32
		# in the inner loop (meaning that estimated improvement is ~15%);
		# - ARM-based systems are often embedded ones and extra memory
		# consumption might be unappreciated (for so little improvement);
		#
		# Byte order [in]dependence. =========================================
		#
		# Caller is expected to maintain specific dword order in Htable,
		# namely with least significant dword of 128-bit value at lower
		# address. This differs completely from C code and has everything to
		# do with ldm instruction and order in which dwords are "consumed" by
		# algorithm. Byte order within these dwords in turn is whatever
		# native byte order on current platform. See gcm128.c for working
		# example...

		# Consume command-line arguments until one looks like a plain file name
		# ("name.ext"); anything before it is skipped.  If the arguments run out,
		# $output ends up false.
		while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
		# Redirect STDOUT only when a file name was actually found, and abort if
		# the file cannot be created so that a failed run is not mistaken for
		# success.
		if ($output) {
			open STDOUT,">",$output or die "can't open $output: $!";
		}

		$Xi="r0"; # argument block
		$Htbl="r1";
		$inp="r2";
		$len="r3";

		$Zll="r4"; # variables
		$Zlh="r5";
		$Zhl="r6";
		$Zhh="r7";
		$Tll="r8";
		$Tlh="r9";
		$Thl="r10";
		$Thh="r11";
		$nlo="r12";
		################# r13 is stack pointer
		$nhi="r14";
		################# r15 is program counter

		$rem_4bit=$inp; # used in gcm_gmult_4bit
		$cnt=$len;

		# Zsmash(@insns) - append to $code the instructions that write the four
		# 32-bit words $Zll,$Zlh,$Zhl,$Zhh to [$Xi] at byte offsets 12, 8, 4, 0.
		#
		# Three store sequences are emitted under preprocessor conditionals:
		#   __ARM_ARCH__>=7 && __ARMEL__ : "rev" the word, then one "str";
		#   __ARMEB__                    : one "str" with no swap;
		#   otherwise                    : four "strb" byte stores, using
		#                                  $Tlh/$Thl/$Thh as scratch registers.
		#
		# After each word, the next string from the argument list is appended as
		# one extra instruction line, letting the caller interleave its own
		# instructions with the stores.  When the list is exhausted an empty
		# tab-only line is emitted instead.
		#
		# The empty prototype is not enforced because callers use the
		# &Zsmash(...) call form.
		sub Zsmash() {
		# byte offset into Xi for the current word; starts at the last word
		my $i=12;
		# private copy of the caller-supplied instructions, consumed one per word
		my @args=@_;
		for ($Zll,$Zlh,$Zhl,$Zhh) {
		# $_ is the register holding the word being stored
		$code.=<<___;
		#if __ARM_ARCH__>=7 && defined(__ARMEL__)
		rev $_,$_
		str $_,[$Xi,#$i]
		#elif defined(__ARMEB__)
		str $_,[$Xi,#$i]
		#else
		mov $Tlh,$_,lsr#8
		strb $_,[$Xi,#$i+3]
		mov $Thl,$_,lsr#16
		strb $Tlh,[$Xi,#$i+2]
		mov $Thh,$_,lsr#24
		strb $Thl,[$Xi,#$i+1]
		strb $Thh,[$Xi,#$i]
		#endif
		___
		# interleave one caller-supplied instruction after this word's stores
		$code.="\t".shift(@args)."\n";
		# step down to the preceding 32-bit word
		$i-=4;
		}
		}

		$code=<<___;
		#include "arm_arch.h"

		.text
		.code 32

		.type rem_4bit,%object
		.align 5
		rem_4bit:
		.short 0x0000,0x1C20,0x3840,0x2460
		.short 0x7080,0x6CA0,0x48C0,0x54E0
		.short 0xE100,0xFD20,0xD940,0xC560
		.short 0x9180,0x8DA0,0xA9C0,0xB5E0
		.size rem_4bit,.-rem_4bit

		.type rem_4bit_get,%function
		rem_4bit_get:
		sub $rem_4bit,pc,#8
		sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
		b .Lrem_4bit_got
		nop
		.size rem_4bit_get,.-rem_4bit_get

		.global gcm_ghash_4bit
		.type gcm_ghash_4bit,%function
		gcm_ghash_4bit:
		sub r12,pc,#8
		add $len,$inp,$len @ $len to point at the end
		stmdb sp!,{r3-r11,lr} @ save $len/end too
		sub r12,r12,#48 @ &rem_4bit

		ldmia r12,{r4-r11} @ copy rem_4bit ...
		stmdb sp!,{r4-r11} @ ... to stack

		ldrb $nlo,[$inp,#15]
		ldrb $nhi,[$Xi,#15]
		.Louter:
		eor $nlo,$nlo,$nhi
		and $nhi,$nlo,#0xf0
		and $nlo,$nlo,#0x0f
		mov $cnt,#14

		add $Zhh,$Htbl,$nlo,lsl#4
		ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
		add $Thh,$Htbl,$nhi
		ldrb $nlo,[$inp,#14]

		and $nhi,$Zll,#0xf @ rem
		ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
		add $nhi,$nhi,$nhi
		eor $Zll,$Tll,$Zll,lsr#4
		ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
		eor $Zll,$Zll,$Zlh,lsl#28
		ldrb $nhi,[$Xi,#14]
		eor $Zlh,$Tlh,$Zlh,lsr#4
		eor $Zlh,$Zlh,$Zhl,lsl#28
		eor $Zhl,$Thl,$Zhl,lsr#4
		eor $Zhl,$Zhl,$Zhh,lsl#28
		eor $Zhh,$Thh,$Zhh,lsr#4
		eor $nlo,$nlo,$nhi
		and $nhi,$nlo,#0xf0
		and $nlo,$nlo,#0x0f
		eor $Zhh,$Zhh,$Tll,lsl#16

		.Linner:
		add $Thh,$Htbl,$nlo,lsl#4
		and $nlo,$Zll,#0xf @ rem
		subs $cnt,$cnt,#1
		add $nlo,$nlo,$nlo
		ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
		eor $Zll,$Tll,$Zll,lsr#4
		eor $Zll,$Zll,$Zlh,lsl#28
		eor $Zlh,$Tlh,$Zlh,lsr#4
		eor $Zlh,$Zlh,$Zhl,lsl#28
		ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
		eor $Zhl,$Thl,$Zhl,lsr#4
		ldrplb $nlo,[$inp,$cnt]
		eor $Zhl,$Zhl,$Zhh,lsl#28
		eor $Zhh,$Thh,$Zhh,lsr#4

		add $Thh,$Htbl,$nhi
		and $nhi,$Zll,#0xf @ rem
		eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
		add $nhi,$nhi,$nhi
		ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
		eor $Zll,$Tll,$Zll,lsr#4
		ldrplb $Tll,[$Xi,$cnt]
		eor $Zll,$Zll,$Zlh,lsl#28
		eor $Zlh,$Tlh,$Zlh,lsr#4
		ldrh $Tlh,[sp,$nhi]
		eor $Zlh,$Zlh,$Zhl,lsl#28
		eor $Zhl,$Thl,$Zhl,lsr#4
		eor $Zhl,$Zhl,$Zhh,lsl#28
		eorpl $nlo,$nlo,$Tll
		eor $Zhh,$Thh,$Zhh,lsr#4
		andpl $nhi,$nlo,#0xf0
		andpl $nlo,$nlo,#0x0f
		eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
		bpl .Linner

		ldr $len,[sp,#32] @ re-load $len/end
		add $inp,$inp,#16
		mov $nhi,$Zll
		___
		&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
		$code.=<<___;
		bne .Louter

		add sp,sp,#36
		#if __ARM_ARCH__>=5
		ldmia sp!,{r4-r11,pc}
		#else
		ldmia sp!,{r4-r11,lr}
		tst lr,#1
		moveq pc,lr @ be binary compatible with V4, yet
		bx lr @ interoperable with Thumb ISA:-)
		#endif
		.size gcm_ghash_4bit,.-gcm_ghash_4bit

		.global gcm_gmult_4bit
		.type gcm_gmult_4bit,%function
		gcm_gmult_4bit:
		stmdb sp!,{r4-r11,lr}
		ldrb $nlo,[$Xi,#15]
		b rem_4bit_get
		.Lrem_4bit_got:
		and $nhi,$nlo,#0xf0
		and $nlo,$nlo,#0x0f
		mov $cnt,#14

		add $Zhh,$Htbl,$nlo,lsl#4
		ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
		ldrb $nlo,[$Xi,#14]

		add $Thh,$Htbl,$nhi
		and $nhi,$Zll,#0xf @ rem
		ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
		add $nhi,$nhi,$nhi
		eor $Zll,$Tll,$Zll,lsr#4
		ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
		eor $Zll,$Zll,$Zlh,lsl#28
		eor $Zlh,$Tlh,$Zlh,lsr#4
		eor $Zlh,$Zlh,$Zhl,lsl#28
		eor $Zhl,$Thl,$Zhl,lsr#4
		eor $Zhl,$Zhl,$Zhh,lsl#28
		eor $Zhh,$Thh,$Zhh,lsr#4
		and $nhi,$nlo,#0xf0
		eor $Zhh,$Zhh,$Tll,lsl#16
		and $nlo,$nlo,#0x0f

		.Loop:
		add $Thh,$Htbl,$nlo,lsl#4
		and $nlo,$Zll,#0xf @ rem
		subs $cnt,$cnt,#1
		add $nlo,$nlo,$nlo
		ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
		eor $Zll,$Tll,$Zll,lsr#4
		eor $Zll,$Zll,$Zlh,lsl#28
		eor $Zlh,$Tlh,$Zlh,lsr#4
		eor $Zlh,$Zlh,$Zhl,lsl#28
		ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
		eor $Zhl,$Thl,$Zhl,lsr#4
		ldrplb $nlo,[$Xi,$cnt]
		eor $Zhl,$Zhl,$Zhh,lsl#28
		eor $Zhh,$Thh,$Zhh,lsr#4

		add $Thh,$Htbl,$nhi
		and $nhi,$Zll,#0xf @ rem
		eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
		add $nhi,$nhi,$nhi
		ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
		eor $Zll,$Tll,$Zll,lsr#4
		eor $Zll,$Zll,$Zlh,lsl#28
		eor $Zlh,$Tlh,$Zlh,lsr#4
		ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
		eor $Zlh,$Zlh,$Zhl,lsl#28
		eor $Zhl,$Thl,$Zhl,lsr#4
		eor $Zhl,$Zhl,$Zhh,lsl#28
		eor $Zhh,$Thh,$Zhh,lsr#4
		andpl $nhi,$nlo,#0xf0
		andpl $nlo,$nlo,#0x0f
		eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
		bpl .Loop
		___
		&Zsmash();
		$code.=<<___;
		#if __ARM_ARCH__>=5
		ldmia sp!,{r4-r11,pc}
		#else
		ldmia sp!,{r4-r11,lr}
		tst lr,#1
		moveq pc,lr @ be binary compatible with V4, yet
		bx lr @ interoperable with Thumb ISA:-)
		#endif
		.size gcm_gmult_4bit,.-gcm_gmult_4bit
		___
		{
		my $cnt=$Htbl; # $Htbl is used once in the very beginning

		my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
		my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));

		# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
		# in Zo. Or should I say "top bit", because GHASH is specified in
		# reverse bit order? Otherwise straightforward 128-bit H by one input
		# byte multiplication and modulo-reduction, times 16.

		# Register-name helpers used inside the backtick expressions of the NEON
		# code below.  Each takes a register name string and returns "" when the
		# argument does not match.
		#   Dlo("qN") -> "d(2N)"    low  double-word half of quad register N
		#   Dhi("qN") -> "d(2N+1)"  high double-word half of quad register N
		#   Q("dM")   -> "q(M/2)"   quad register containing even-numbered dM
		# Callers use the &name(...) form, which bypasses the empty prototype.
		sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
		sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
		sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }

		$code.=<<___;
		#if __ARM_ARCH__>=7
		.fpu neon

		.global gcm_gmult_neon
		.type gcm_gmult_neon,%function
		.align 4
		gcm_gmult_neon:
		sub $Htbl,#16 @ point at H in GCM128_CTX
		vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
		vmov.i32 $mod,#0xe1 @ our irreducible polynomial
		vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
		vshr.u64 $mod,#32
		vldmia $Htbl,{$Hhi-$Hlo} @ load H
		veor $zero,$zero
		#ifdef __ARMEL__
		vrev64.8 $IN,$IN
		#endif
		veor $Qpost,$Qpost
		veor $R,$R
		mov $cnt,#16
		veor $Z,$Z
		mov $len,#16
		veor $Zo,$Zo
		vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
		b .Linner_neon
		.size gcm_gmult_neon,.-gcm_gmult_neon

		.global gcm_ghash_neon
		.type gcm_ghash_neon,%function
		.align 4
		gcm_ghash_neon:
		vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
		vmov.i32 $mod,#0xe1 @ our irreducible polynomial
		vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
		vshr.u64 $mod,#32
		vldmia $Xi,{$Hhi-$Hlo} @ load H
		veor $zero,$zero
		nop
		#ifdef __ARMEL__
		vrev64.8 $Z,$Z
		#endif
		.Louter_neon:
		vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
		veor $Qpost,$Qpost
		vld1.64 `&Dlo($IN)`,[$inp]!
		veor $R,$R
		mov $cnt,#16
		#ifdef __ARMEL__
		vrev64.8 $IN,$IN
		#endif
		veor $Zo,$Zo
		veor $IN,$Z @ inp^=Xi
		veor $Z,$Z
		vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
		.Linner_neon:
		subs $cnt,$cnt,#1
		vmull.p8 $Qlo,$Hlo,$xi @ H.loXi[i]
		vmull.p8 $Qhi,$Hhi,$xi @ H.hiXi[i]
		vext.8 $IN,$zero,#1 @ IN>>=8

		veor $Z,$Qpost @ modulo-scheduled part
		vshl.i64 `&Dlo("$R")`,#48
		vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
		veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`

		veor `&Dhi("$Z")`,`&Dlo("$R")`
		vuzp.8 $Qlo,$Qhi
		vsli.8 $Zo,$T,#1 @ compose the "carry" byte
		vext.8 $Z,$zero,#1 @ Z>>=8

		vmull.p8 $R,$Zo,$mod @ "carry"0xe1
		vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
		vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
		veor $Z,$Qhi
		bne .Linner_neon

		veor $Z,$Qpost @ modulo-scheduled artefact
		vshl.i64 `&Dlo("$R")`,#48
		veor `&Dhi("$Z")`,`&Dlo("$R")`

		@ finalization, normalize Z:Zo
		vand $Zo,$mod @ suffices to mask the bit
		vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
		vshl.i64 $Z,#1
		subs $len,#16
		vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
		bne .Louter_neon

		#ifdef __ARMEL__
		vrev64.8 $Z,$Z
		#endif
		sub $Xi,#16
		vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
		vst1.64 `&Dlo("$Z")`,[$Xi,:64]

		bx lr
		.size gcm_ghash_neon,.-gcm_ghash_neon
		#endif
		___
		}
		$code.=<<___;
		.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
		.align 2
		___

		# Replace every `...` span in the generated text with the result of
		# evaluating its contents as Perl (this is how &Dlo/&Dhi/&Q calls embedded
		# in the assembler template are expanded).
		$code =~ s/\`([^\`]*)\`/eval $1/gem;
		$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
		print $code;
		# enforce flush, and fail if buffered output could not be written
		close STDOUT or die "error writing output: $!";

crypto/modes/asm/ghash-ia64.pl

0 → 100755

+463 −0

File added.

Preview size limit exceeded, changes collapsed.

crypto/modes/asm/ghash-parisc.pl

0 → 100644

+730 −0

File added.

Preview size limit exceeded, changes collapsed.