Commit dc01af77 authored by Dr. Stephen Henson's avatar Dr. Stephen Henson
Browse files

Sync ASM/modes to add CCM and XTS modes and assembly language optimisation

(from HEAD, original by Andy).
parent 5435d041
Loading
Loading
Loading
Loading
+67 −8
Original line number Diff line number Diff line
@@ -10,15 +10,21 @@ CFLAG=-g
MAKEFILE=	Makefile
AR=		ar r

MODES_ASM_OBJ=

CFLAGS= $(INCLUDES) $(CFLAG)
ASFLAGS= $(INCLUDES) $(ASFLAG)
AFLAGS= $(ASFLAGS)

GENERAL=Makefile
TEST=
APPS=

LIB=$(TOP)/libcrypto.a
LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c
LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o
LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \
	ccm128.c xts128.c
LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o \
	ccm128.o xts128.o $(MODES_ASM_OBJ)

SRC= $(LIBSRC)

@@ -38,6 +44,24 @@ lib: $(LIBOBJ)
	$(RANLIB) $(LIB) || echo Never mind.
	@touch lib

ghash-ia64.s:	asm/ghash-ia64.pl
	$(PERL) asm/ghash-ia64.pl $@ $(CFLAGS)
ghash-x86.s:	asm/ghash-x86.pl
	$(PERL) asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
ghash-x86_64.s:	asm/ghash-x86_64.pl
	$(PERL) asm/ghash-x86_64.pl $(PERLASM_SCHEME) > $@
ghash-sparcv9.s:	asm/ghash-sparcv9.pl
	$(PERL) asm/ghash-sparcv9.pl $@ $(CFLAGS)
ghash-alpha.s:	asm/ghash-alpha.pl
	$(PERL) $< | $(CC) -E - | tee $@ > /dev/null
ghash-parisc.s:	asm/ghash-parisc.pl
	$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@

# GNU make "catch all"
ghash-%.S:	asm/ghash-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@

ghash-armv4.o:	ghash-armv4.S

files:
	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO

@@ -71,12 +95,47 @@ dclean:
	mv -f Makefile.new $(MAKEFILE)

clean:
	rm -f *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
	rm -f *.s *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff

# DO NOT DELETE THIS LINE -- make depend depends on it.

cbc128.o: cbc128.c modes.h
cfb128.o: cfb128.c modes.h
ctr128.o: ctr128.c modes.h
cts128.o: cts128.c modes.h
ofb128.o: modes.h ofb128.c
cbc128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
cbc128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
cbc128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
cbc128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
cbc128.o: ../../include/openssl/symhacks.h cbc128.c modes_lcl.h
ccm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
ccm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
ccm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
ccm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
ccm128.o: ../../include/openssl/symhacks.h ccm128.c modes_lcl.h
cfb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
cfb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
cfb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
cfb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
cfb128.o: ../../include/openssl/symhacks.h cfb128.c modes_lcl.h
ctr128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
ctr128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
ctr128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
ctr128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
ctr128.o: ../../include/openssl/symhacks.h ctr128.c modes_lcl.h
cts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
cts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
cts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
cts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
cts128.o: ../../include/openssl/symhacks.h cts128.c modes_lcl.h
gcm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
gcm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
gcm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
gcm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
gcm128.o: ../../include/openssl/symhacks.h gcm128.c modes_lcl.h
ofb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
ofb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
ofb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
ofb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
ofb128.o: ../../include/openssl/symhacks.h modes_lcl.h ofb128.c
xts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
xts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
xts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
xts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
xts128.o: ../../include/openssl/symhacks.h modes_lcl.h xts128.c
+451 −0
Original line number Diff line number Diff line
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled in respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% than code generated by vendor
# compiler.

$cnt="v0";	# $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3";	# $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7";	# $8
#################
$Xi="a0";	# $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4";	# $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10";	# $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT";	# $28

{ my $N;
  sub loop() {

	$N++;
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}

$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	br	$rem_4bit,.Lpic1
.Lpic1:	lda	$rem_4bit,rem_4bit-.Lpic1($rem_4bit)
___

	&loop();

$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___

$inhi="s0";
$inlo="s1";

$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	br	$rem_4bit,.Lpic2
.Lpic2:	lda	$rem_4bit,rem_4bit-.Lpic2($rem_4bit)

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

	&loop();

$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
rem_4bit:
	.quad	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
	.quad	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
	.quad	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
	.quad	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;
+429 −0
Original line number Diff line number Diff line
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.

# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
#   bit shift operation is neatly fused with 128-bit xor here, and
#   "538B" variant would eliminate only 4-5 instructions out of 32
#   in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
#   consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever
# *native* byte order on current platform. See gcm128.c for working
# example...

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$Xi="r0";	# argument block
$Htbl="r1";
$inp="r2";
$len="r3";

$Zll="r4";	# variables
$Zlh="r5";
$Zhl="r6";
$Zhh="r7";
$Tll="r8";
$Tlh="r9";
$Thl="r10";
$Thh="r11";
$nlo="r12";
################# r13 is stack pointer
$nhi="r14";
################# r15 is program counter

$rem_4bit=$inp;	# used in gcm_gmult_4bit
$cnt=$len;

sub Zsmash() {
  my $i=12;
  my @args=@_;
  for ($Zll,$Zlh,$Zhl,$Zhh) {
    $code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	$_,$_
	str	$_,[$Xi,#$i]
#elif defined(__ARMEB__)
	str	$_,[$Xi,#$i]
#else
	mov	$Tlh,$_,lsr#8
	strb	$_,[$Xi,#$i+3]
	mov	$Thl,$_,lsr#16
	strb	$Tlh,[$Xi,#$i+2]
	mov	$Thh,$_,lsr#24
	strb	$Thl,[$Xi,#$i+1]
	strb	$Thh,[$Xi,#$i]
#endif
___
    $code.="\t".shift(@args)."\n";
    $i-=4;
  }
}

$code=<<___;
#include "arm_arch.h"

.text
.code	32

.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
	sub	$rem_4bit,pc,#8
	sub	$rem_4bit,$rem_4bit,#32	@ &rem_4bit
	b	.Lrem_4bit_got
	nop
.size	rem_4bit_get,.-rem_4bit_get

.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
gcm_ghash_4bit:
	sub	r12,pc,#8
	add	$len,$inp,$len		@ $len to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save $len/end too
	sub	r12,r12,#48		@ &rem_4bit

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	$nlo,[$inp,#15]
	ldrb	$nhi,[$Xi,#15]
.Louter:
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	add	$Thh,$Htbl,$nhi
	ldrb	$nlo,[$inp,#14]

	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	ldrb	$nhi,[$Xi,#14]
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16

.Linner:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
	ldrplb	$nlo,[$inp,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrplb	$Tll,[$Xi,$cnt]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tlh,[sp,$nhi]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eorpl	$nlo,$nlo,$Tll
	eor	$Zhh,$Thh,$Zhh,lsr#4
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	$len,[sp,#32]		@ re-load $len/end
	add	$inp,$inp,#16
	mov	$nhi,$Zll
___
	&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
$code.=<<___;
	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}
	ldrb	$nlo,[$Xi,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	ldrb	$nlo,[$Xi,#14]

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	and	$nhi,$nlo,#0xf0
	eor	$Zhh,$Zhh,$Tll,lsl#16
	and	$nlo,$nlo,#0x0f

.Loop:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
	ldrplb	$nlo,[$Xi,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
___
	&Zsmash();
$code.=<<___;
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
my $cnt=$Htbl;	# $Htbl is used once in the very beginning

my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
my ($Qhi, $Qlo, $Z,  $R, $zero, $Qpost, $IN) = map("q$_",(8..15));

# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
# in Zo. Or should I say "top bit", because GHASH is specified in
# reverse bit order? Otherwise straightforward 128-bt H by one input
# byte multiplication and modulo-reduction, times 16.

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	sub		$Htbl,#16		@ point at H in GCM128_CTX
	vld1.64		`&Dhi("$IN")`,[$Xi,:64]!@ load Xi
	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
	vld1.64		`&Dlo("$IN")`,[$Xi,:64]!
	vshr.u64	$mod,#32
	vldmia		$Htbl,{$Hhi-$Hlo}	@ load H
	veor		$zero,$zero
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	veor		$Qpost,$Qpost
	veor		$R,$R
	mov		$cnt,#16
	veor		$Z,$Z
	mov		$len,#16
	veor		$Zo,$Zo
	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
	b		.Linner_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ load Xi
	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
	vld1.64		`&Dlo("$Z")`,[$Xi,:64]!
	vshr.u64	$mod,#32
	vldmia		$Xi,{$Hhi-$Hlo}		@ load H
	veor		$zero,$zero
	nop
#ifdef __ARMEL__
	vrev64.8	$Z,$Z
#endif
.Louter_neon:
	vld1.64		`&Dhi($IN)`,[$inp]!	@ load inp
	veor		$Qpost,$Qpost
	vld1.64		`&Dlo($IN)`,[$inp]!
	veor		$R,$R
	mov		$cnt,#16
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	veor		$Zo,$Zo
	veor		$IN,$Z			@ inp^=Xi
	veor		$Z,$Z
	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
.Linner_neon:
	subs		$cnt,$cnt,#1
	vmull.p8	$Qlo,$Hlo,$xi		@ H.loXi[i]
	vmull.p8	$Qhi,$Hhi,$xi		@ H.hiXi[i]
	vext.8		$IN,$zero,#1		@ IN>>=8

	veor		$Z,$Qpost		@ modulo-scheduled part
	vshl.i64	`&Dlo("$R")`,#48
	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
	veor		$T,`&Dlo("$Qlo")`,`&Dlo("$Z")`

	veor		`&Dhi("$Z")`,`&Dlo("$R")`
	vuzp.8		$Qlo,$Qhi
	vsli.8		$Zo,$T,#1		@ compose the "carry" byte
	vext.8		$Z,$zero,#1		@ Z>>=8

	vmull.p8	$R,$Zo,$mod		@ "carry"0xe1
	vshr.u8		$Zo,$T,#7		@ save Z's bottom bit
	vext.8		$Qpost,$Qlo,$zero,#1	@ Qlo>>=8
	veor		$Z,$Qhi
	bne		.Linner_neon

	veor		$Z,$Qpost		@ modulo-scheduled artefact
	vshl.i64	`&Dlo("$R")`,#48
	veor		`&Dhi("$Z")`,`&Dlo("$R")`

	@ finalization, normalize Z:Zo
	vand		$Zo,$mod		@ suffices to mask the bit
	vshr.u64	`&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
	vshl.i64	$Z,#1
	subs		$len,#16
	vorr		$Z,`&Q("$Zo")`		@ Z=Z:Zo<<1
	bne		.Louter_neon

#ifdef __ARMEL__
	vrev64.8	$Z,$Z
#endif
	sub		$Xi,#16	
	vst1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ write out Xi
	vst1.64		`&Dlo("$Z")`,[$Xi,:64]

	bx	lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT; # enforce flush
+463 −0

File added.

Preview size limit exceeded, changes collapsed.

+730 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading