Commit 50452b2e authored by Andy Polyakov's avatar Andy Polyakov
Browse files

e_padlock: add CTR mode.

parent d18762f7
Loading
Loading
Loading
Loading
+11 −10
Original line number Diff line number Diff line
@@ -183,7 +183,7 @@ my ($mode,$opcode) = @_;
&set_label("${mode}_pic_point");
	&lea	($ctx,&DWP(16,$ctx));	# control word
	&xor	("eax","eax");
					if ($mode eq "ctr16") {
					if ($mode eq "ctr32") {
	&movq	("mm0",&QWP(-16,$ctx));	# load [upper part of] counter
					} else {
	&xor	("ebx","ebx");
@@ -216,7 +216,7 @@ my ($mode,$opcode) = @_;
	&mov	(&DWP(8,"ebp"),$len);
	&mov	($len,$chunk);
	&mov	(&DWP(12,"ebp"),$chunk);	# chunk
						if ($mode eq "ctr16") {
						if ($mode eq "ctr32") {
	&mov	("ecx",&DWP(-4,$ctx));
	&xor	($out,$out);
	&mov	("eax",&DWP(-8,$ctx));		# borrow $len
@@ -257,7 +257,7 @@ my ($mode,$opcode) = @_;
						}
	&mov	($out,&DWP(0,"ebp"));		# restore parameters
	&mov	($chunk,&DWP(12,"ebp"));
						if ($mode eq "ctr16") {
						if ($mode eq "ctr32") {
	&mov	($inp,&DWP(4,"ebp"));
	&xor	($len,$len);
&set_label("${mode}_xor");
@@ -284,7 +284,7 @@ my ($mode,$opcode) = @_;
	&sub	($len,$chunk);
	&mov	($chunk,$PADLOCK_CHUNK);
	&jnz	(&label("${mode}_loop"));
						if ($mode ne "ctr16") {
						if ($mode ne "ctr32") {
	&test	($out,0x0f);			# out_misaligned
	&jz	(&label("${mode}_done"));
						}
@@ -296,7 +296,7 @@ my ($mode,$opcode) = @_;
	&data_byte(0xf3,0xab);			# rep stosl
&set_label("${mode}_done");
	&lea	("esp",&DWP(24,"ebp"));
						if ($mode ne "ctr16") {
						if ($mode ne "ctr32") {
	&jmp	(&label("${mode}_exit"));

&set_label("${mode}_aligned",16);
@@ -311,7 +311,7 @@ my ($mode,$opcode) = @_;
&set_label("${mode}_exit");			}
	&mov	("eax",1);
	&lea	("esp",&DWP(4,"esp"));		# popf
	&emms	()				if ($mode eq "ctr16");
	&emms	()				if ($mode eq "ctr32");
&set_label("${mode}_abort");
&function_end("padlock_${mode}_encrypt");
}
@@ -320,10 +320,11 @@ my ($mode,$opcode) = @_;
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr16",0xc8);	# yes, it implements own ctr with ecb opcode,
				# because hardware ctr was introduced later
				# and even has errata on certain CPU stepping.
				# own implementation *always* works...
&generate_mode("ctr32",0xc8);	# yes, it implements own CTR with ECB opcode,
				# because hardware CTR was introduced later
				# and even has errata on certain C7 stepping.
				# own implementation *always* works, though
				# ~15% slower than dedicated hardware...

&function_begin_B("padlock_xstore");
	&push	("edi");
+64 −3
Original line number Diff line number Diff line
@@ -9,7 +9,8 @@

# September 2011
#
# Assembler helpers for Padlock engine.
# Assembler helpers for Padlock engine. See also e_padlock-x86.pl for
# details.

$flavour = shift;
$output  = shift;
@@ -26,7 +27,7 @@ open STDOUT,"| $^X $xlate $flavour $output";

$code=".text\n";

$PADLOCK_CHUNK=512;	# Must be a power of 2 larger than 16
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

$ctx="%rdx";
$out="%rdi";
@@ -234,9 +235,23 @@ padlock_${mode}_encrypt:
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
___
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	jz	.L${mode}_loop
	shl	\$4,%eax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
___
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
@@ -261,6 +276,16 @@ $code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_corr
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_corr:
___
$code.=<<___;
	mov	%r8,$out		# restore paramters
	mov	%r11,$chunk
@@ -295,6 +320,29 @@ $code.=<<___;

.align	16
.L${mode}_aligned:
___
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	mov	\$`16*0x10000`,$chunk
	bswap	%eax
	cmp	$len,$chunk
	cmova	$len,$chunk
	neg	%eax
	and	\$0xffff,%eax
	jz	.L${mode}_aligned_loop
	shl	\$4,%eax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	jmp	.L${mode}_aligned_loop
.align	16
.L${mode}_aligned_loop:
	cmp	$len,$chunk
	cmova	$len,$chunk
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
@@ -304,6 +352,19 @@ $code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r11,$chunk		# restore paramters
	mov	%r10,$len
	sub	$chunk,$len
	mov	\$`16*0x10000`,$chunk
	jnz	.L${mode}_aligned_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
@@ -320,7 +381,7 @@ ___
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr16",0xd8);
&generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
+45 −3
Original line number Diff line number Diff line
@@ -76,6 +76,7 @@
#endif
#include <openssl/rand.h>
#include <openssl/err.h>
#include <openssl/modes.h>

#ifndef OPENSSL_NO_HW
#ifndef OPENSSL_NO_HW_PADLOCK
@@ -337,16 +338,19 @@ static int padlock_cipher_nids[] = {
	NID_aes_128_cbc,
	NID_aes_128_cfb,
	NID_aes_128_ofb,
	NID_aes_128_ctr,

	NID_aes_192_ecb,
	NID_aes_192_cbc,
	NID_aes_192_cfb,
	NID_aes_192_ofb,
	NID_aes_192_ctr,

	NID_aes_256_ecb,
	NID_aes_256_cbc,
	NID_aes_256_cfb,
	NID_aes_256_ofb,
	NID_aes_256_ctr
};
static int padlock_cipher_nids_num = (sizeof(padlock_cipher_nids)/
				      sizeof(padlock_cipher_nids[0]));
@@ -505,10 +509,35 @@ padlock_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
	return 1;
}

/* Adapter handed to CRYPTO_ctr128_encrypt_ctr32() by padlock_ctr_cipher().
   It stores the supplied counter block in ctx->iv and then calls
   padlock_ctr32_encrypt() with the length converted from a block
   count to a byte count.  Note the (in,out) -> (out,in) argument
   swap between this function and the routine it calls. */
static void padlock_ctr32_encrypt_glue(const unsigned char *in,
			unsigned char *out, size_t blocks,
			struct padlock_cipher_data *ctx,
			const unsigned char *ivec)
{
	/* length in bytes of the run of whole blocks to process */
	const size_t byte_len = AES_BLOCK_SIZE*blocks;

	/* make the caller's counter block the context's current IV */
	memcpy(ctx->iv, ivec, AES_BLOCK_SIZE);

	padlock_ctr32_encrypt(out, in, ctx, byte_len);
}

/* CTR-mode cipher callback: processes nbytes from in_arg into out_arg
   by delegating to CRYPTO_ctr128_encrypt_ctr32(), with
   padlock_ctr32_encrypt_glue() as the block routine and the aligned
   PadLock cipher data as its key argument.  Always returns 1. */
static int
padlock_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,
		   const unsigned char *in_arg, size_t nbytes)
{
	struct padlock_cipher_data *aligned = ALIGNED_CIPHER_DATA(ctx);
	/* ctx->num is passed by pointer below, so work on a local
	   unsigned copy and write it back afterwards */
	unsigned int ctr_num = ctx->num;

	CRYPTO_ctr128_encrypt_ctr32(in_arg, out_arg, nbytes,
			aligned,
			ctx->iv,
			ctx->buf,
			&ctr_num,
			(ctr128_f)padlock_ctr32_encrypt_glue);

	/* save the value left by the call for the next invocation */
	ctx->num = (size_t)ctr_num;

	return 1;
}

#define EVP_CIPHER_block_size_ECB	AES_BLOCK_SIZE
#define EVP_CIPHER_block_size_CBC	AES_BLOCK_SIZE
#define EVP_CIPHER_block_size_OFB	1
#define EVP_CIPHER_block_size_CFB	1
#define EVP_CIPHER_block_size_CTR	1

/* Declaring so many ciphers by hand would be a pain.
   Instead introduce a bit of preprocessor magic :-) */
@@ -533,16 +562,19 @@ DECLARE_AES_EVP(128,ecb,ECB);
DECLARE_AES_EVP(128,cbc,CBC);
DECLARE_AES_EVP(128,cfb,CFB);
DECLARE_AES_EVP(128,ofb,OFB);
DECLARE_AES_EVP(128,ctr,CTR);

DECLARE_AES_EVP(192,ecb,ECB);
DECLARE_AES_EVP(192,cbc,CBC);
DECLARE_AES_EVP(192,cfb,CFB);
DECLARE_AES_EVP(192,ofb,OFB);
DECLARE_AES_EVP(192,ctr,CTR);

DECLARE_AES_EVP(256,ecb,ECB);
DECLARE_AES_EVP(256,cbc,CBC);
DECLARE_AES_EVP(256,cfb,CFB);
DECLARE_AES_EVP(256,ofb,OFB);
DECLARE_AES_EVP(256,ctr,CTR);

static int
padlock_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid)
@@ -567,6 +599,9 @@ padlock_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid
	  case NID_aes_128_ofb:
	    *cipher = &padlock_aes_128_ofb;
	    break;
	  case NID_aes_128_ctr:
	    *cipher = &padlock_aes_128_ctr;
	    break;

	  case NID_aes_192_ecb:
	    *cipher = &padlock_aes_192_ecb;
@@ -580,6 +615,9 @@ padlock_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid
	  case NID_aes_192_ofb:
	    *cipher = &padlock_aes_192_ofb;
	    break;
	  case NID_aes_192_ctr:
	    *cipher = &padlock_aes_192_ctr;
	    break;

	  case NID_aes_256_ecb:
	    *cipher = &padlock_aes_256_ecb;
@@ -593,6 +631,9 @@ padlock_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid
	  case NID_aes_256_ofb:
	    *cipher = &padlock_aes_256_ofb;
	    break;
	  case NID_aes_256_ctr:
	    *cipher = &padlock_aes_256_ctr;
	    break;

	  default:
	    /* Sorry, we don't support this NID */
@@ -610,6 +651,7 @@ padlock_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key,
{
	struct padlock_cipher_data *cdata;
	int key_len = EVP_CIPHER_CTX_key_length(ctx) * 8;
	unsigned long mode = EVP_CIPHER_CTX_mode(ctx);

	if (key==NULL) return 0;	/* ERROR */

@@ -617,7 +659,7 @@ padlock_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key,
	memset(cdata, 0, sizeof(struct padlock_cipher_data));

	/* Prepare Control word. */
	if (EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_OFB_MODE)
	if (mode == EVP_CIPH_OFB_MODE || mode == EVP_CIPH_CTR_MODE)
		cdata->cword.b.encdec = 0;
	else
		cdata->cword.b.encdec = (ctx->encrypt == 0);
@@ -640,8 +682,8 @@ padlock_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key,
			   and is listed as hardware errata. They most
			   likely will fix it at some point and then
			   a check for stepping would be due here. */
			if ((EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_ECB_MODE ||
			     EVP_CIPHER_CTX_mode(ctx) == EVP_CIPH_CBC_MODE)
			if ((mode == EVP_CIPH_ECB_MODE ||
			     mode == EVP_CIPH_CBC_MODE)
			    && !enc)
				AES_set_decrypt_key(key, key_len, &cdata->ks);
			else