crypto/chacha/asm/chacha-s390x.pl: add vx code path. (f760137b) · Commits · CYBER - Cyber Security / TS 103 523 MSP / TLMSP / TLMSP OpenSSL

crypto/chacha/asm/chacha-s390x.pl

+557 −259

Original line number	Diff line number	Diff line
		#! /usr/bin/env perl
		# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
		# Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
		#
		# Licensed under the Apache License 2.0 (the "License"). You may not use
		# this file except in compliance with the License. You can obtain a copy
		@@ -20,41 +20,46 @@
		#
		# 3 times faster than compiler-generated code.

		$flavour = shift;
		#
		# August 2018
		#
		# Add vx code path.
		#
		# Copyright IBM Corp. 2018
		# Author: Patrick Steuer <patrick.steuer@de.ibm.com>

		use strict;
		use FindBin qw($Bin);
		use lib "$Bin/../..";
		use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE);

		my $flavour = shift;

		my ($z,$SIZE_T);
		if ($flavour =~ /3[12]/) {
		$z=0; # S/390 ABI
		$SIZE_T=4;
		$g="";
		} else {
		$z=1; # zSeries ABI
		$SIZE_T=8;
		$g="g";
		}

		my $output;
		while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
		open STDOUT,">$output";

		sub AUTOLOAD() # thunk [simplified] x86-style perlasm
		{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
		$code .= "\t$opcode\t".join(',',@_)."\n";
		}

		my $sp="%r15";

		my $stdframe=16*$SIZE_T+4*8;
		my $frame=$stdframe+4*20;

		my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));

		my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
		my @t=map("%r$_",(8,9));
		my @v=map("%v$_",(16..31));

		sub ROUND {
		my ($a0,$b0,$c0,$d0)=@_;
		my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
		my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
		my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
		my ($xc,$xc_)=map("\"$_\"",@t);
		my @x=map("\"$_\"",@x);
		my ($xc,$xc_)=map("$_",@t);

		# Consider order in which variables are addressed by their
		# index:
		@@ -78,249 +83,542 @@ my @x=map("\"$_\"",@x);
		# 'c' stores and loads in the middle, but none in the beginning
		# or end.

		(
		"&alr (@x[$a0],@x[$b0])", # Q1
		"&alr (@x[$a1],@x[$b1])", # Q2
		"&xr (@x[$d0],@x[$a0])",
		"&xr (@x[$d1],@x[$a1])",
		"&rll (@x[$d0],@x[$d0],16)",
		"&rll (@x[$d1],@x[$d1],16)",

		"&alr ($xc,@x[$d0])",
		"&alr ($xc_,@x[$d1])",
		"&xr (@x[$b0],$xc)",
		"&xr (@x[$b1],$xc_)",
		"&rll (@x[$b0],@x[$b0],12)",
		"&rll (@x[$b1],@x[$b1],12)",

		"&alr (@x[$a0],@x[$b0])",
		"&alr (@x[$a1],@x[$b1])",
		"&xr (@x[$d0],@x[$a0])",
		"&xr (@x[$d1],@x[$a1])",
		"&rll (@x[$d0],@x[$d0],8)",
		"&rll (@x[$d1],@x[$d1],8)",

		"&alr ($xc,@x[$d0])",
		"&alr ($xc_,@x[$d1])",
		"&xr (@x[$b0],$xc)",
		"&xr (@x[$b1],$xc_)",
		"&rll (@x[$b0],@x[$b0],7)",
		"&rll (@x[$b1],@x[$b1],7)",

		"&stm ($xc,$xc_,'$stdframe+48+4$c0($sp)')", # reload pair of 'c's
		"&lm ($xc,$xc_,'$stdframe+48+4$c2($sp)')",

		"&alr (@x[$a2],@x[$b2])", # Q3
		"&alr (@x[$a3],@x[$b3])", # Q4
		"&xr (@x[$d2],@x[$a2])",
		"&xr (@x[$d3],@x[$a3])",
		"&rll (@x[$d2],@x[$d2],16)",
		"&rll (@x[$d3],@x[$d3],16)",

		"&alr ($xc,@x[$d2])",
		"&alr ($xc_,@x[$d3])",
		"&xr (@x[$b2],$xc)",
		"&xr (@x[$b3],$xc_)",
		"&rll (@x[$b2],@x[$b2],12)",
		"&rll (@x[$b3],@x[$b3],12)",

		"&alr (@x[$a2],@x[$b2])",
		"&alr (@x[$a3],@x[$b3])",
		"&xr (@x[$d2],@x[$a2])",
		"&xr (@x[$d3],@x[$a3])",
		"&rll (@x[$d2],@x[$d2],8)",
		"&rll (@x[$d3],@x[$d3],8)",

		"&alr ($xc,@x[$d2])",
		"&alr ($xc_,@x[$d3])",
		"&xr (@x[$b2],$xc)",
		"&xr (@x[$b3],$xc_)",
		"&rll (@x[$b2],@x[$b2],7)",
		"&rll (@x[$b3],@x[$b3],7)"
		);
		alr (@x[$a0],@x[$b0]); # Q1
		alr (@x[$a1],@x[$b1]); # Q2
		xr (@x[$d0],@x[$a0]);
		xr (@x[$d1],@x[$a1]);
		rll (@x[$d0],@x[$d0],16);
		rll (@x[$d1],@x[$d1],16);

		alr ($xc,@x[$d0]);
		alr ($xc_,@x[$d1]);
		xr (@x[$b0],$xc);
		xr (@x[$b1],$xc_);
		rll (@x[$b0],@x[$b0],12);
		rll (@x[$b1],@x[$b1],12);

		alr (@x[$a0],@x[$b0]);
		alr (@x[$a1],@x[$b1]);
		xr (@x[$d0],@x[$a0]);
		xr (@x[$d1],@x[$a1]);
		rll (@x[$d0],@x[$d0],8);
		rll (@x[$d1],@x[$d1],8);

		alr ($xc,@x[$d0]);
		alr ($xc_,@x[$d1]);
		xr (@x[$b0],$xc);
		xr (@x[$b1],$xc_);
		rll (@x[$b0],@x[$b0],7);
		rll (@x[$b1],@x[$b1],7);

		stm ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)"); # reload pair of 'c's
		lm ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");

		alr (@x[$a2],@x[$b2]); # Q3
		alr (@x[$a3],@x[$b3]); # Q4
		xr (@x[$d2],@x[$a2]);
		xr (@x[$d3],@x[$a3]);
		rll (@x[$d2],@x[$d2],16);
		rll (@x[$d3],@x[$d3],16);

		alr ($xc,@x[$d2]);
		alr ($xc_,@x[$d3]);
		xr (@x[$b2],$xc);
		xr (@x[$b3],$xc_);
		rll (@x[$b2],@x[$b2],12);
		rll (@x[$b3],@x[$b3],12);

		alr (@x[$a2],@x[$b2]);
		alr (@x[$a3],@x[$b3]);
		xr (@x[$d2],@x[$a2]);
		xr (@x[$d3],@x[$a3]);
		rll (@x[$d2],@x[$d2],8);
		rll (@x[$d3],@x[$d3],8);

		alr ($xc,@x[$d2]);
		alr ($xc_,@x[$d3]);
		xr (@x[$b2],$xc);
		xr (@x[$b3],$xc_);
		rll (@x[$b2],@x[$b2],7);
		rll (@x[$b3],@x[$b3],7);
		}

		sub VX_ROUND {
			# Emit one vectorized round. The caller passes the (a,b,c,d) indices
			# of the first quarter-round; the other three index tuples are
			# derived below. Every step is issued for all four tuples before
			# the next step starts, so the four quarter-rounds are interleaved.
			# Indices select entries of the file-level @v register list.
			my @quads=([@_[0..3]]);

			# Each following tuple advances every index by one position inside
			# its aligned group of four (wrapping around within that group).
			foreach (1..3) {
				push(@quads,[map(($_&~3)+(($_+1)&3),@{$quads[-1]})]);
			}

			# One row per quarter-round step, as positions into a tuple
			# (0=a, 1=b, 2=c, 3=d):
			#   [sum, addend, mixed, rotate]
			#   sum   += addend
			#   mixed ^= sum
			#   mixed  = rotate-left(mixed, rotate)
			my @steps=(
				[0,1,3,16],
				[2,3,1,12],
				[0,1,3, 8],
				[2,3,1, 7],
			);

			foreach my $step (@steps) {
				my ($sum,$addend,$mixed,$rotate)=@$step;

				foreach my $q (@quads) {
					vaf ($v[$q->[$sum]],$v[$q->[$sum]],$v[$q->[$addend]]);
				}
				foreach my $q (@quads) {
					vx ($v[$q->[$mixed]],$v[$q->[$mixed]],$v[$q->[$sum]]);
				}
				foreach my $q (@quads) {
					verllf ($v[$q->[$mixed]],$v[$q->[$mixed]],$rotate);
				}
			}
		}

		PERLASM_BEGIN($output);

		INCLUDE ("s390x_arch.h");
		TEXT ();

		################
		# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
		# const unsigned int key[8], const unsigned int counter[4])
		{
		my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));

		# VX CODE PATH
		{
		my $off=$z*8*16+8; # offset(initial state)
		my $frame=$stdframe+4*16+$off;

		GLOBL ("ChaCha20_ctr32");
		TYPE ("ChaCha20_ctr32","\@function");
		ALIGN (32);
		LABEL ("ChaCha20_ctr32");
		larl ("%r1","OPENSSL_s390xcap_P");

		lghi ("%r0",64);
		&{$z? \&cgr:\&cr} ($len,"%r0");
		jle ("_s390x_chacha_novx");

		lg ("%r0","S390X_STFLE+16(%r1)");
		tmhh ("%r0",0x4000); # check for vector facility
		jz ("_s390x_chacha_novx");

		if (!$z) {
		llgfr ($len,$len);
		std ("%f4","16*$SIZE_T+2*8($sp)");
		std ("%f6","16*$SIZE_T+3*8($sp)");
		}
		&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");

		lghi ("%r1",-$frame);
		lgr ("%r0",$sp);
		la ($sp,"0(%r1,$sp)"); # allocate stack frame

		larl ("%r7",".Lsigma");
		&{$z? \&stg:\&st} ("%r0","0($sp)"); # backchain

		vstm ("%v8","%v15","8($sp)") if ($z);

		vlm ("%v1","%v2","0($key)"); # load key
		vl ("%v0","0(%r7)"); # load sigma constant
		vl ("%v3","0($counter)"); # load iv (counter||nonce)
		l ("%r0","0($counter)"); # load counter
		vstm ("%v0","%v3","$off($sp)"); # copy initial state to stack

		srlg ("%r1",$len,8);
		ltgr ("%r1","%r1");
		jz (".Lvx_4x_done");

		ALIGN (16); # process 4 64-byte blocks
		LABEL (".Lvx_4x");
		vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
		# state
		vl ("%v31","16(%r7)");
		vaf ("%v12","%v12","%v31"); # increment counter

		vlr (@v[$_],"%v$_") for (0..15); # copy initial state

		lhi ("%r6",10);
		j (".Loop_vx_4x");

		ALIGN (16);
		LABEL (".Loop_vx_4x");
		VX_ROUND( 0, 4, 8,12); # column round
		VX_ROUND( 0, 5,10,15); # diagonal round
		brct ("%r6",".Loop_vx_4x");

		vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
		# state (mod 32)
		vlm ("%v6","%v7","32(%r7)"); # load vperm operands

		for (0..3) { # blocks 1,2
		vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
		vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
		vperm ("%v".($_+ 8),"%v0","%v1","%v6");
		vperm ("%v".($_+12),"%v0","%v1","%v7");
		}
		vlm ("%v0","%v7","0($inp)"); # load in
		vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
		vstm ("%v0","%v7","0($out)"); # store out

		vlm ("%v6","%v7","32(%r7)"); # restore vperm operands

		for (0..3) { # blocks 3,4
		vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
		vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
		vperm ("%v".($_+ 8),"%v0","%v1","%v6");
		vperm ("%v".($_+12),"%v0","%v1","%v7");
		}
		vlm ("%v0","%v7","128($inp)"); # load in
		vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
		vstm ("%v0","%v7","128($out)"); # store out

		ahi ("%r0",4);
		st ("%r0","48+$off($sp)"); # update initial state

		la ($inp,"256($inp)");
		la ($out,"256($out)");
		brctg ("%r1",".Lvx_4x");

		ALIGN (16);
		LABEL (".Lvx_4x_done");
		lghi ("%r1",0xff);
		ngr ($len,"%r1");
		jnz (".Lvx_rem");

		ALIGN (16);
		LABEL (".Lvx_done");
		vzero ("%v$_") for (16..31); # wipe ks and key copy
		vstm ("%v16","%v17","16+$off($sp)");
		vlm ("%v8","%v15","8($sp)") if ($z);

		la ($sp,"$frame($sp)");
		&{$z? \&lmg:\&lm} ("%r6","%r7","6*$SIZE_T($sp)");

		if (!$z) {
		ld ("%f4","16*$SIZE_T+2*8($sp)");
		ld ("%f6","16*$SIZE_T+3*8($sp)");
		vzero ("%v$_") for (8..15);
		}
		br ("%r14");
		ALIGN (16);
		LABEL (".Lvx_rem");
		lhi ("%r0",64);

		sr ($len,"%r0");
		brc (2,".Lvx_rem_g64"); # cc==2?

		lghi ("%r1",-$stdframe);

		la ($counter,"48+$off($sp)"); # load updated iv
		ar ($len,"%r0"); # restore len

		lgr ("%r7",$counter);
		&{$z? \&stg:\&st} ("%r14","14*$SIZE_T+$frame($sp)");
		la ($sp,"0(%r1,$sp)");

		bras ("%r14","_s390x_chacha_novx");

		la ($sp,"$stdframe($sp)");
		&{$z? \&lg:\&l} ("%r14","14*$SIZE_T+$frame($sp)");
		lgr ($counter,"%r7");
		j (".Lvx_done");

		ALIGN (16);
		LABEL (".Lvx_rem_g64");
		vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
		# state
		vl ("%v31","16(%r7)");
		vaf ("%v12","%v12","%v31"); # increment counter

		$code.=<<___;
		.text

		.globl ChaCha20_ctr32
		.type ChaCha20_ctr32,\@function
		.align 32
		ChaCha20_ctr32:
		lt${g}r $len,$len # $len==0?
		bzr %r14
		a${g}hi $len,-64
		l${g}hi %r1,-$frame
		stm${g} %r6,%r15,`6*$SIZE_T`($sp)
		sl${g}r $out,$inp # difference
		la $len,0($inp,$len) # end of input minus 64
		larl %r7,.Lsigma
		lgr %r0,$sp
		la $sp,0(%r1,$sp)
		st${g} %r0,0($sp)

		lmg %r8,%r11,0($key) # load key
		lmg %r12,%r13,0($counter) # load counter
		lmg %r6,%r7,0(%r7) # load sigma constant

		la %r14,0($inp)
		st${g} $out,$frame+3*$SIZE_T($sp)
		st${g} $len,$frame+4*$SIZE_T($sp)
		stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack
		srlg @x[12],%r12,32 # 32-bit counter value
		j .Loop_outer

		.align 16
		.Loop_outer:
		lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7]
		lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11]
		lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15]
		stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11]
		lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9]
		st @x[12],$stdframe+4*12($sp) # save counter
		st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer
		lhi %r14,10
		j .Loop

		.align 4
		.Loop:
		___
		foreach (&ROUND(0, 4, 8,12)) { eval; }
		foreach (&ROUND(0, 5,10,15)) { eval; }
		$code.=<<___;
		brct %r14,.Loop

		l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer
		stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9]
		lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp)

		al @x[0],$stdframe+4*0($sp) # accumulate key schedule
		al @x[1],$stdframe+4*1($sp)
		al @x[2],$stdframe+4*2($sp)
		al @x[3],$stdframe+4*3($sp)
		al @x[4],$stdframe+4*4($sp)
		al @x[5],$stdframe+4*5($sp)
		al @x[6],$stdframe+4*6($sp)
		al @x[7],$stdframe+4*7($sp)
		lrvr @x[0],@x[0]
		lrvr @x[1],@x[1]
		lrvr @x[2],@x[2]
		lrvr @x[3],@x[3]
		lrvr @x[4],@x[4]
		lrvr @x[5],@x[5]
		lrvr @x[6],@x[6]
		lrvr @x[7],@x[7]
		al @x[12],$stdframe+4*12($sp)
		al @x[13],$stdframe+4*13($sp)
		al @x[14],$stdframe+4*14($sp)
		al @x[15],$stdframe+4*15($sp)
		lrvr @x[12],@x[12]
		lrvr @x[13],@x[13]
		lrvr @x[14],@x[14]
		lrvr @x[15],@x[15]

		la @t[0],0(@t[0],%r14) # reconstruct output pointer
		cl${g}r %r14,@t[1]
		jh .Ltail

		x @x[0],4*0(%r14) # xor with input
		x @x[1],4*1(%r14)
		st @x[0],4*0(@t[0]) # store output
		x @x[2],4*2(%r14)
		st @x[1],4*1(@t[0])
		x @x[3],4*3(%r14)
		st @x[2],4*2(@t[0])
		x @x[4],4*4(%r14)
		st @x[3],4*3(@t[0])
		lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11]
		x @x[5],4*5(%r14)
		st @x[4],4*4(@t[0])
		x @x[6],4*6(%r14)
		al @x[0],$stdframe+4*8($sp)
		st @x[5],4*5(@t[0])
		x @x[7],4*7(%r14)
		al @x[1],$stdframe+4*9($sp)
		st @x[6],4*6(@t[0])
		x @x[12],4*12(%r14)
		al @x[2],$stdframe+4*10($sp)
		st @x[7],4*7(@t[0])
		x @x[13],4*13(%r14)
		al @x[3],$stdframe+4*11($sp)
		st @x[12],4*12(@t[0])
		x @x[14],4*14(%r14)
		st @x[13],4*13(@t[0])
		x @x[15],4*15(%r14)
		st @x[14],4*14(@t[0])
		lrvr @x[0],@x[0]
		st @x[15],4*15(@t[0])
		lrvr @x[1],@x[1]
		lrvr @x[2],@x[2]
		lrvr @x[3],@x[3]
		lhi @x[12],1
		x @x[0],4*8(%r14)
		al @x[12],$stdframe+4*12($sp) # increment counter
		x @x[1],4*9(%r14)
		st @x[0],4*8(@t[0])
		x @x[2],4*10(%r14)
		st @x[1],4*9(@t[0])
		x @x[3],4*11(%r14)
		st @x[2],4*10(@t[0])
		st @x[3],4*11(@t[0])

		cl${g}r %r14,@t[1] # done yet?
		la %r14,64(%r14)
		jl .Loop_outer

		.Ldone:
		xgr %r0,%r0
		xgr %r1,%r1
		xgr %r2,%r2
		xgr %r3,%r3
		stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy
		stmg %r0,%r3,$stdframe+4*12($sp)

		lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
		br %r14

		.align 16
		.Ltail:
		la @t[1],64($t[1])
		stm @x[0],@x[7],$stdframe+4*0($sp)
		sl${g}r @t[1],%r14
		lm @x[0],@x[3],$stdframe+4*8+4*8($sp)
		l${g}hi @x[6],0
		stm @x[12],@x[15],$stdframe+4*12($sp)
		al @x[0],$stdframe+4*8($sp)
		al @x[1],$stdframe+4*9($sp)
		al @x[2],$stdframe+4*10($sp)
		al @x[3],$stdframe+4*11($sp)
		lrvr @x[0],@x[0]
		lrvr @x[1],@x[1]
		lrvr @x[2],@x[2]
		lrvr @x[3],@x[3]
		stm @x[0],@x[3],$stdframe+4*8($sp)

		.Loop_tail:
		llgc @x[4],0(@x[6],%r14)
		llgc @x[5],$stdframe(@x[6],$sp)
		xr @x[5],@x[4]
		stc @x[5],0(@x[6],@t[0])
		la @x[6],1(@x[6])
		brct @t[1],.Loop_tail

		j .Ldone
		.size ChaCha20_ctr32,.-ChaCha20_ctr32

		.align 32
		.Lsigma:
		.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
		.asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
		.align 4
		___

		foreach (split("\n",$code)) {
		s/\`([^\`]*)\`/eval $1/ge;

		print $_,"\n";
		vlr (@v[$_],"%v$_") for (0..15); # state = initial state

		lhi ("%r6",10);
		j (".Loop_vx_rem");

		ALIGN (16);
		LABEL (".Loop_vx_rem");
		VX_ROUND( 0, 4, 8,12); # column round
		VX_ROUND( 0, 5,10,15); # diagonal round
		brct ("%r6",".Loop_vx_rem");

		vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
		# state (mod 32)
		vlm ("%v6","%v7","32(%r7)"); # load vperm operands

		for (0..3) { # blocks 1,2
		vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
		vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
		vperm ("%v".($_+8),"%v0","%v1","%v6");
		vperm ("%v".($_+12),"%v0","%v1","%v7");
		}
		vlm ("%v0","%v3","0($inp)"); # load in
		vx ("%v$_","%v$_","%v".($_+8)) for (0..3); # out = in ^ ks
		vstm ("%v0","%v3","0($out)"); # store out

		la ($inp,"64($inp)");
		la ($out,"64($out)");

		sr ($len,"%r0");
		brc (4,".Lvx_tail"); # cc==4?

		vlm ("%v0","%v3","0($inp)"); # load in
		vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
		vstm ("%v0","%v3","0($out)"); # store out
		jz (".Lvx_done");

		for (0..3) { # blocks 3,4
		vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
		vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
		vperm ("%v".($_+12),"%v0","%v1","%v6");
		vperm ("%v".($_+8),"%v0","%v1","%v7");
		}
		la ($inp,"64($inp)");
		la ($out,"64($out)");

		sr ($len,"%r0");
		brc (4,".Lvx_tail"); # cc==4?

		vlm ("%v0","%v3","0($inp)"); # load in
		vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
		vstm ("%v0","%v3","0($out)"); # store out
		jz (".Lvx_done");

		la ($inp,"64($inp)");
		la ($out,"64($out)");

		sr ($len,"%r0");
		vlr ("%v".($_+4),"%v$_") for (8..11);
		j (".Lvx_tail");

		ALIGN (16);
		LABEL (".Lvx_tail");
		ar ($len,"%r0"); # restore $len
		ahi ($len,-1);

		lhi ("%r0",16);
		for (0..2) {
		vll ("%v0",$len,($_*16)."($inp)");
		vx ("%v0","%v0","%v".($_+12));
		vstl ("%v0",$len,($_*16)."($out)");
		sr ($len,"%r0");
		brc (4,".Lvx_done"); # cc==4?
		}
		vll ("%v0",$len,"3*16($inp)");
		vx ("%v0","%v0","%v15");
		vstl ("%v0",$len,"3*16($out)");
		j (".Lvx_done");
		SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
		}

		# NOVX CODE PATH
		{
		my $frame=$stdframe+4*20;

		TYPE ("_s390x_chacha_novx","\@function");
		ALIGN (32);
		LABEL ("_s390x_chacha_novx");
		&{$z? \&ltgr:\&ltr} ($len,$len); # $len==0?
		bzr ("%r14");
		&{$z? \&aghi:\&ahi} ($len,-64);
		&{$z? \&lghi:\&lhi} ("%r1",-$frame);
		&{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)");
		&{$z? \&slgr:\&slr} ($out,$inp); # difference
		la ($len,"0($inp,$len)"); # end of input minus 64
		larl ("%r7",".Lsigma");
		lgr ("%r0",$sp);
		la ($sp,"0(%r1,$sp)");
		&{$z? \&stg:\&st} ("%r0","0($sp)");

		lmg ("%r8","%r11","0($key)"); # load key
		lmg ("%r12","%r13","0($counter)"); # load counter
		lmg ("%r6","%r7","0(%r7)"); # load sigma constant

		la ("%r14","0($inp)");
		&{$z? \&stg:\&st} ($out,"$frame+3*$SIZE_T($sp)");
		&{$z? \&stg:\&st} ($len,"$frame+4*$SIZE_T($sp)");
		stmg ("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
		srlg (@x[12],"%r12",32); # 32-bit counter value
		j (".Loop_outer");

		ALIGN (16);
		LABEL (".Loop_outer");
		lm (@x[0],@x[7],"$stdframe+4*0($sp)"); # load x[0]-x[7]
		lm (@t[0],@t[1],"$stdframe+4*10($sp)"); # load x[10]-x[11]
		lm (@x[13],@x[15],"$stdframe+4*13($sp)"); # load x[13]-x[15]
		stm (@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
		lm (@t[0],@t[1],"$stdframe+4*8($sp)"); # load x[8]-x[9]
		st (@x[12],"$stdframe+4*12($sp)"); # save counter
		&{$z? \&stg:\&st} ("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
		lhi ("%r14",10);
		j (".Loop");

		ALIGN (4);
		LABEL (".Loop");
		ROUND (0, 4, 8,12);
		ROUND (0, 5,10,15);
		brct ("%r14",".Loop");

		&{$z? \&lg:\&l} ("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
		stm (@t[0],@t[1],"$stdframe+4*8+4*8($sp)"); # offload x[8]-x[9]
		&{$z? \&lmg:\&lm} (@t[0],@t[1],"$frame+3*$SIZE_T($sp)");

		al (@x[0],"$stdframe+4*0($sp)"); # accumulate key schedule
		al (@x[1],"$stdframe+4*1($sp)");
		al (@x[2],"$stdframe+4*2($sp)");
		al (@x[3],"$stdframe+4*3($sp)");
		al (@x[4],"$stdframe+4*4($sp)");
		al (@x[5],"$stdframe+4*5($sp)");
		al (@x[6],"$stdframe+4*6($sp)");
		al (@x[7],"$stdframe+4*7($sp)");
		lrvr (@x[0],@x[0]);
		lrvr (@x[1],@x[1]);
		lrvr (@x[2],@x[2]);
		lrvr (@x[3],@x[3]);
		lrvr (@x[4],@x[4]);
		lrvr (@x[5],@x[5]);
		lrvr (@x[6],@x[6]);
		lrvr (@x[7],@x[7]);
		al (@x[12],"$stdframe+4*12($sp)");
		al (@x[13],"$stdframe+4*13($sp)");
		al (@x[14],"$stdframe+4*14($sp)");
		al (@x[15],"$stdframe+4*15($sp)");
		lrvr (@x[12],@x[12]);
		lrvr (@x[13],@x[13]);
		lrvr (@x[14],@x[14]);
		lrvr (@x[15],@x[15]);

		la (@t[0],"0(@t[0],%r14)"); # reconstruct output pointer
		&{$z? \&clgr:\&clr} ("%r14",@t[1]);
		jh (".Ltail");

		x (@x[0],"4*0(%r14)"); # xor with input
		x (@x[1],"4*1(%r14)");
		st (@x[0],"4*0(@t[0])"); # store output
		x (@x[2],"4*2(%r14)");
		st (@x[1],"4*1(@t[0])");
		x (@x[3],"4*3(%r14)");
		st (@x[2],"4*2(@t[0])");
		x (@x[4],"4*4(%r14)");
		st (@x[3],"4*3(@t[0])");
		lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); # load x[8]-x[11]
		x (@x[5],"4*5(%r14)");
		st (@x[4],"4*4(@t[0])");
		x (@x[6],"4*6(%r14)");
		al (@x[0],"$stdframe+4*8($sp)");
		st (@x[5],"4*5(@t[0])");
		x (@x[7],"4*7(%r14)");
		al (@x[1],"$stdframe+4*9($sp)");
		st (@x[6],"4*6(@t[0])");
		x (@x[12],"4*12(%r14)");
		al (@x[2],"$stdframe+4*10($sp)");
		st (@x[7],"4*7(@t[0])");
		x (@x[13],"4*13(%r14)");
		al (@x[3],"$stdframe+4*11($sp)");
		st (@x[12],"4*12(@t[0])");
		x (@x[14],"4*14(%r14)");
		st (@x[13],"4*13(@t[0])");
		x (@x[15],"4*15(%r14)");
		st (@x[14],"4*14(@t[0])");
		lrvr (@x[0],@x[0]);
		st (@x[15],"4*15(@t[0])");
		lrvr (@x[1],@x[1]);
		lrvr (@x[2],@x[2]);
		lrvr (@x[3],@x[3]);
		lhi (@x[12],1);
		x (@x[0],"4*8(%r14)");
		al (@x[12],"$stdframe+4*12($sp)"); # increment counter
		x (@x[1],"4*9(%r14)");
		st (@x[0],"4*8(@t[0])");
		x (@x[2],"4*10(%r14)");
		st (@x[1],"4*9(@t[0])");
		x (@x[3],"4*11(%r14)");
		st (@x[2],"4*10(@t[0])");
		st (@x[3],"4*11(@t[0])");

		&{$z? \&clgr:\&clr} ("%r14",@t[1]); # done yet?
		la ("%r14","64(%r14)");
		jl (".Loop_outer");

		LABEL (".Ldone");
		xgr ("%r0","%r0");
		xgr ("%r1","%r1");
		xgr ("%r2","%r2");
		xgr ("%r3","%r3");
		stmg ("%r0","%r3","$stdframe+4*4($sp)"); # wipe key copy
		stmg ("%r0","%r3","$stdframe+4*12($sp)");

		&{$z? \&lmg:\&lm} ("%r6","%r15","$frame+6*$SIZE_T($sp)");
		br ("%r14");

		ALIGN (16);
		LABEL (".Ltail");
		la (@t[1],"64($t[1])");
		stm (@x[0],@x[7],"$stdframe+4*0($sp)");
		&{$z? \&slgr:\&slr} (@t[1],"%r14");
		lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
		&{$z? \&lghi:\&lhi} (@x[6],0);
		stm (@x[12],@x[15],"$stdframe+4*12($sp)");
		al (@x[0],"$stdframe+4*8($sp)");
		al (@x[1],"$stdframe+4*9($sp)");
		al (@x[2],"$stdframe+4*10($sp)");
		al (@x[3],"$stdframe+4*11($sp)");
		lrvr (@x[0],@x[0]);
		lrvr (@x[1],@x[1]);
		lrvr (@x[2],@x[2]);
		lrvr (@x[3],@x[3]);
		stm (@x[0],@x[3],"$stdframe+4*8($sp)");

		LABEL (".Loop_tail");
		llgc (@x[4],"0(@x[6],%r14)");
		llgc (@x[5],"$stdframe(@x[6],$sp)");
		xr (@x[5],@x[4]);
		stc (@x[5],"0(@x[6],@t[0])");
		la (@x[6],"1(@x[6])");
		brct (@t[1],".Loop_tail");

		j (".Ldone");
		SIZE ("_s390x_chacha_novx",".-_s390x_chacha_novx");
		}
		}
		close STDOUT;
		################

		ALIGN (64);
		LABEL (".Lsigma");
		LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma
		LONG (0x00000000,0x00000001,0x00000002,0x00000003); # vaf counter increment
		LONG (0x03020100,0x07060504,0x13121110,0x17161514); # vperm serialization
		LONG (0x0b0a0908,0x0f0e0d0c,0x1b1a1918,0x1f1e1d1c); # vperm serialization
		ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
		ALIGN (4);

		PERLASM_END();

crypto/chacha/build.info

+1 −0

Original line number	Diff line number	Diff line
		@@ -9,6 +9,7 @@ GENERATE[chacha-armv4.S]=asm/chacha-armv4.pl $(PERLASM_SCHEME)
		INCLUDE[chacha-armv4.o]=..
		GENERATE[chacha-armv8.S]=asm/chacha-armv8.pl $(PERLASM_SCHEME)
		INCLUDE[chacha-armv8.o]=..
		INCLUDE[chacha-s390x.o]=..

		BEGINRAW[Makefile(unix)]
		##### CHACHA assembler implementations