Commit 0edb109f authored by Andy Polyakov's avatar Andy Polyakov
Browse files

evp/e_chacha20_poly1305.c: further improve small-fragment TLS performance.



Improvement coefficients vary with TLS fragment length and platform, on
most Intel processors maximum improvement is ~50%, while on Ryzen - 80%.
The "secret" is new dedicated ChaCha20_128 code path and vectorized xor
helpers.

Reviewed-by: default avatarRich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6638)
parent 2ce71b60
Loading
Loading
Loading
Loading
+46 −8
Original line number Diff line number Diff line
@@ -196,14 +196,23 @@ static int chacha20_poly1305_init_key(EVP_CIPHER_CTX *ctx,
}

#  if !defined(OPENSSL_SMALL_FOOTPRINT)

#   if defined(POLY1305_ASM) && (defined(__x86_64) || defined(__x86_64__) || \
                                 defined(_M_AMD64) || defined(_M_X64))
#    define XOR128_HELPERS
void *xor128_encrypt_n_pad(void *out, const void *inp, void *otp, size_t len);
void *xor128_decrypt_n_pad(void *out, const void *inp, void *otp, size_t len);
static const unsigned char zero[4 * CHACHA_BLK_SIZE] = { 0 };
#   else
static const unsigned char zero[2 * CHACHA_BLK_SIZE] = { 0 };
#   endif

static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
                                        const unsigned char *in, size_t len)
{
    EVP_CHACHA_AEAD_CTX *actx = aead_data(ctx);
    size_t i, tail, tohash_len, plen = actx->tls_payload_length;
    unsigned char *buf, *tohash, *ctr, storage[2 * CHACHA_BLK_SIZE + 32];
    size_t tail, tohash_len, buf_len, plen = actx->tls_payload_length;
    unsigned char *buf, *tohash, *ctr, storage[sizeof(zero) + 32];

    if (len != plen + POLY1305_BLOCK_SIZE)
        return -1;
@@ -212,9 +221,11 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
    ctr = buf + CHACHA_BLK_SIZE;
    tohash = buf + CHACHA_BLK_SIZE - POLY1305_BLOCK_SIZE;

    if (plen <= CHACHA_BLK_SIZE) {
#   ifdef XOR128_HELPERS
    if (plen <= 3 * CHACHA_BLK_SIZE) {
        actx->key.counter[0] = 0;
        ChaCha20_ctr32(buf, zero, 2 * CHACHA_BLK_SIZE, actx->key.key.d,
        buf_len = (plen + 2 * CHACHA_BLK_SIZE - 1) & (0 - CHACHA_BLK_SIZE);
        ChaCha20_ctr32(buf, zero, buf_len, actx->key.key.d,
                       actx->key.counter);
        Poly1305_Init(POLY1305_ctx(actx), buf);
        actx->key.partial_len = 0;
@@ -223,6 +234,31 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
        actx->len.aad = EVP_AEAD_TLS1_AAD_LEN;
        actx->len.text = plen;

        if (plen) {
            if (ctx->encrypt)
                ctr = xor128_encrypt_n_pad(out, in, ctr, plen);
            else
                ctr = xor128_decrypt_n_pad(out, in, ctr, plen);

            in += plen;
            out += plen;
            tohash_len = (size_t)(ctr - tohash);
        }
    }
#   else
    if (plen <= CHACHA_BLK_SIZE) {
        size_t i;

        actx->key.counter[0] = 0;
        ChaCha20_ctr32(buf, zero, (buf_len = 2 * CHACHA_BLK_SIZE),
                       actx->key.key.d, actx->key.counter);
        Poly1305_Init(POLY1305_ctx(actx), buf);
        actx->key.partial_len = 0;
        memcpy(tohash, actx->tls_aad, POLY1305_BLOCK_SIZE);
        tohash_len = POLY1305_BLOCK_SIZE;
        actx->len.aad = EVP_AEAD_TLS1_AAD_LEN;
        actx->len.text = plen;

        if (ctx->encrypt) {
            for (i = 0; i < plen; i++) {
                out[i] = ctr[i] ^= in[i];
@@ -242,10 +278,12 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
        memset(ctr + i, 0, tail);
        ctr += i + tail;
        tohash_len += i + tail;
    } else {
    }
#   endif
    else {
        actx->key.counter[0] = 0;
        ChaCha20_ctr32(buf, zero, CHACHA_BLK_SIZE, actx->key.key.d,
                       actx->key.counter);
        ChaCha20_ctr32(buf, zero, (buf_len = CHACHA_BLK_SIZE),
                       actx->key.key.d, actx->key.counter);
        Poly1305_Init(POLY1305_ctx(actx), buf);
        actx->key.counter[0] = 1;
        actx->key.partial_len = 0;
@@ -300,7 +338,7 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
    }

    Poly1305_Update(POLY1305_ctx(actx), tohash, tohash_len);
    OPENSSL_cleanse(buf, 2 * CHACHA_BLK_SIZE);
    OPENSSL_cleanse(buf, buf_len);
    Poly1305_Final(POLY1305_ctx(actx), ctx->encrypt ? actx->tag
                                                    : tohash);

+104 −0
Original line number Diff line number Diff line
@@ -3753,6 +3753,110 @@ poly1305_emit_base2_44:
.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
___
}	}	}

{	# chacha20-poly1305 helpers
my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
                                  ("%rdi","%rsi","%rdx","%rcx");  # Unix order
$code.=<<___;
.globl	xor128_encrypt_n_pad
.type	xor128_encrypt_n_pad,\@abi-omnipotent
.align	16
xor128_encrypt_n_pad:
	sub	$otp,$inp
	sub	$otp,$out
	mov	$len,%r10		# put len aside
	shr	\$4,$len		# len / 16
	jz	.Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu	($inp,$otp),%xmm0
	pxor	($otp),%xmm0
	movdqu	%xmm0,($out,$otp)
	movdqa	%xmm0,($otp)
	lea	16($otp),$otp
	dec	$len
	jnz	.Loop_enc_xmm

	and	\$15,%r10		# len % 16
	jz	.Ldone_enc

.Ltail_enc:
	mov	\$16,$len
	sub	%r10,$len
	xor	%eax,%eax
.Loop_enc_byte:
	mov	($inp,$otp),%al
	xor	($otp),%al
	mov	%al,($out,$otp)
	mov	%al,($otp)
	lea	1($otp),$otp
	dec	%r10
	jnz	.Loop_enc_byte

	xor	%eax,%eax
.Loop_enc_pad:
	mov	%al,($otp)
	lea	1($otp),$otp
	dec	$len
	jnz	.Loop_enc_pad

.Ldone_enc:
	mov	$otp,%rax
	ret
.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad

.globl	xor128_decrypt_n_pad
.type	xor128_decrypt_n_pad,\@abi-omnipotent
.align	16
xor128_decrypt_n_pad:
	sub	$otp,$inp
	sub	$otp,$out
	mov	$len,%r10		# put len aside
	shr	\$4,$len		# len / 16
	jz	.Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu	($inp,$otp),%xmm0
	movdqa	($otp),%xmm1
	pxor	%xmm0,%xmm1
	movdqu	%xmm1,($out,$otp)
	movdqa	%xmm0,($otp)
	lea	16($otp),$otp
	dec	$len
	jnz	.Loop_dec_xmm

	pxor	%xmm1,%xmm1
	and	\$15,%r10		# len % 16
	jz	.Ldone_dec

.Ltail_dec:
	mov	\$16,$len
	sub	%r10,$len
	xor	%eax,%eax
	xor	%r11,%r11
.Loop_dec_byte:
	mov	($inp,$otp),%r11b
	mov	($otp),%al
	xor	%r11b,%al
	mov	%al,($out,$otp)
	mov	%r11b,($otp)
	lea	1($otp),$otp
	dec	%r10
	jnz	.Loop_dec_byte

	xor	%eax,%eax
.Loop_dec_pad:
	mov	%al,($otp)
	lea	1($otp),$otp
	dec	$len
	jnz	.Loop_dec_pad

.Ldone_dec:
	mov	$otp,%rax
	ret
.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
___
}
$code.=<<___;
.align	64
.Lconst: