Commit 3c849bc9 authored by Andy Polyakov
Browse files

ec/curve25519.c: reorganize for better accessibility.



Move the base 2^64 code to its own #if section. It was nested in the base 2^51
section, which arguably might have been tricky to follow.

Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6699)
parent d3e32630
Loading
Loading
Loading
Loading
+150 −143
Original line number Diff line number Diff line
@@ -11,140 +11,8 @@
#include "ec_lcl.h"
#include <openssl/sha.h>

#if defined(X25519_ASM) \
    || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \
         && !defined(__sparc__) \
         && !(defined(__ANDROID__) && !defined(__clang__)) )
/*
 * Base 2^51 implementation.
 */
# define BASE_2_51_IMPLEMENTED

/* Field element stored as an array of five 64-bit limbs. */
typedef uint64_t fe51[5];
# if !defined(X25519_ASM)
/* 128-bit unsigned integer; only declared when X25519_ASM is not defined. */
typedef __uint128_t u128;
# endif

/* 2^51 - 1: keeps the low 51 bits of a limb. */
static const uint64_t MASK51 = 0x7ffffffffffff;

/*
 * Read the seven bytes in[0..6] as a little-endian integer.
 * in[0] becomes the least significant byte; the value fits in 56 bits.
 */
static uint64_t load_7(const uint8_t *in)
{
    uint64_t acc = 0;
    int i;

    /* Walk from the most significant byte down, shifting as we go. */
    for (i = 6; i >= 0; i--)
        acc = (acc << 8) | in[i];

    return acc;
}

/*
 * Read the six bytes in[0..5] as a little-endian integer.
 * in[0] becomes the least significant byte; the value fits in 48 bits.
 */
static uint64_t load_6(const uint8_t *in)
{
    uint64_t acc = 0;
    int i;

    /* Walk from the most significant byte down, shifting as we go. */
    for (i = 5; i >= 0; i--)
        acc = (acc << 8) | in[i];

    return acc;
}

/*
 * Unpack the 32-byte little-endian string |s| into the five 51-bit limbs
 * of |h|. The top bit of s[31] is masked off and does not reach |h|.
 */
static void fe51_frombytes(fe51 h, const uint8_t *s)
{
    uint64_t limb[5];
    int i;

    /*
     * Each window is pre-shifted so that its low bits are free to receive
     * the bits spilling over from the limb below it.
     */
    limb[0] = load_7(s);                                  /* 56 bits */
    limb[1] = load_6(s + 7) << 5;                         /* 53 bits */
    limb[2] = load_7(s + 13) << 2;                        /* 58 bits */
    limb[3] = load_6(s + 20) << 7;                        /* 55 bits */
    limb[4] = (load_6(s + 26) & 0x7fffffffffff) << 4;     /* 51 bits */

    /* Push everything above bit 50 into the next limb up. */
    for (i = 0; i < 4; i++) {
        limb[i + 1] |= limb[i] >> 51;
        limb[i] &= MASK51;
    }

    for (i = 0; i < 5; i++)
        h[i] = limb[i];
}

/*
 * Reduce the five-limb value |h| and write it to |s| as 32 little-endian
 * bytes. |h| itself is left untouched.
 */
static void fe51_tobytes(uint8_t *s, const fe51 h)
{
    uint64_t limb[5], word[4];
    uint64_t q = 19;
    int i, j;

    for (i = 0; i < 5; i++)
        limb[i] = h[i];

    /*
     * compare to modulus: run a trial addition of 19 through the limbs;
     * q ends up holding whatever carries out of the top limb.
     */
    for (i = 0; i < 5; i++)
        q = (limb[i] + q) >> 51;

    /*
     * full reduce: fold q back in as 19 * q, propagate carries and keep
     * 51 bits per limb. The carry out of the top limb is dropped.
     */
    limb[0] += 19 * q;
    for (i = 0; i < 4; i++) {
        limb[i + 1] += limb[i] >> 51;
        limb[i] &= MASK51;
    }
    limb[4] &= MASK51;

    /*
     * smash: every limb is now below 2^51, so the five limbs can be laid
     * end to end into four 64-bit words without overlapping bits.
     */
    word[0] = limb[0] | (limb[1] << 51);
    word[1] = (limb[1] >> 13) | (limb[2] << 38);
    word[2] = (limb[2] >> 26) | (limb[3] << 25);
    word[3] = (limb[3] >> 39) | (limb[4] << 12);

    /* Emit each word least significant byte first. */
    for (i = 0; i < 4; i++)
        for (j = 0; j < 8; j++)
            s[8 * i + j] = (uint8_t)(word[i] >> (8 * j));
}

# ifdef X25519_ASM
void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
void x25519_fe51_sqr(fe51 h, const fe51 f);
void x25519_fe51_mul121666(fe51 h, fe51 f);
#  define fe51_mul x25519_fe51_mul
#  define fe51_sq  x25519_fe51_sqr
#  define fe51_mul121666 x25519_fe51_mul121666

#  if defined(__x86_64) || defined(__x86_64__) || \
      defined(_M_AMD64) || defined(_M_X64)
#if defined(X25519_ASM) && (defined(__x86_64) || defined(__x86_64__) || \
                            defined(_M_AMD64) || defined(_M_X64))

# define BASE_2_64_IMPLEMENTED

@@ -153,7 +21,13 @@ typedef uint64_t fe64[4];
int x25519_fe64_eligible(void);

/*
 * There are no reference C implementations for this radix.
 * Following subroutines perform corresponding operations modulo
 * 2^256-38, i.e. double the curve modulus. However, inputs and
 * outputs are permitted to be partially reduced, i.e. to remain
 * in [0..2^256) range. It's all tied up in final fe64_tobytes
 * that performs full reduction modulo 2^255-19.
 *
 * There are no reference C implementations for these.
 */
void x25519_fe64_mul(fe64 h, const fe64 f, const fe64 g);
void x25519_fe64_sqr(fe64 h, const fe64 f);
@@ -377,8 +251,141 @@ static void x25519_scalar_mulx(uint8_t out[32], const uint8_t scalar[32],
}
#endif

#if defined(X25519_ASM) \
    || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \
         && !defined(__sparc__) \
         && !(defined(__ANDROID__) && !defined(__clang__)) )
/*
 * Base 2^51 implementation. It is virtually no different from the reference
 * base 2^25.5 implementation with respect to lax boundary conditions for
 * intermediate values and even individual limbs, so whatever you know
 * about the reference applies here as well.
 */
# define BASE_2_51_IMPLEMENTED

/* Field element stored as an array of five 64-bit limbs. */
typedef uint64_t fe51[5];

/* 2^51 - 1: keeps the low 51 bits of a limb. */
static const uint64_t MASK51 = 0x7ffffffffffff;

/*
 * Read the seven bytes in[0..6] as a little-endian integer.
 * in[0] becomes the least significant byte; the value fits in 56 bits.
 */
static uint64_t load_7(const uint8_t *in)
{
    uint64_t acc = 0;
    int i;

    /* Walk from the most significant byte down, shifting as we go. */
    for (i = 6; i >= 0; i--)
        acc = (acc << 8) | in[i];

    return acc;
}

/*
 * Read the six bytes in[0..5] as a little-endian integer.
 * in[0] becomes the least significant byte; the value fits in 48 bits.
 */
static uint64_t load_6(const uint8_t *in)
{
    uint64_t acc = 0;
    int i;

    /* Walk from the most significant byte down, shifting as we go. */
    for (i = 5; i >= 0; i--)
        acc = (acc << 8) | in[i];

    return acc;
}

/*
 * Unpack the 32-byte little-endian string |s| into the five 51-bit limbs
 * of |h|. The top bit of s[31] is masked off and does not reach |h|.
 */
static void fe51_frombytes(fe51 h, const uint8_t *s)
{
    uint64_t limb[5];
    int i;

    /*
     * Each window is pre-shifted so that its low bits are free to receive
     * the bits spilling over from the limb below it.
     */
    limb[0] = load_7(s);                                  /* 56 bits */
    limb[1] = load_6(s + 7) << 5;                         /* 53 bits */
    limb[2] = load_7(s + 13) << 2;                        /* 58 bits */
    limb[3] = load_6(s + 20) << 7;                        /* 55 bits */
    limb[4] = (load_6(s + 26) & 0x7fffffffffff) << 4;     /* 51 bits */

    /* Push everything above bit 50 into the next limb up. */
    for (i = 0; i < 4; i++) {
        limb[i + 1] |= limb[i] >> 51;
        limb[i] &= MASK51;
    }

    for (i = 0; i < 5; i++)
        h[i] = limb[i];
}

/*
 * Reduce the five-limb value |h| and write it to |s| as 32 little-endian
 * bytes. |h| itself is left untouched.
 */
static void fe51_tobytes(uint8_t *s, const fe51 h)
{
    uint64_t limb[5], word[4];
    uint64_t q = 19;
    int i, j;

    for (i = 0; i < 5; i++)
        limb[i] = h[i];

    /*
     * compare to modulus: run a trial addition of 19 through the limbs;
     * q ends up holding whatever carries out of the top limb.
     */
    for (i = 0; i < 5; i++)
        q = (limb[i] + q) >> 51;

    /*
     * full reduce: fold q back in as 19 * q, propagate carries and keep
     * 51 bits per limb. The carry out of the top limb is dropped.
     */
    limb[0] += 19 * q;
    for (i = 0; i < 4; i++) {
        limb[i + 1] += limb[i] >> 51;
        limb[i] &= MASK51;
    }
    limb[4] &= MASK51;

    /*
     * smash: every limb is now below 2^51, so the five limbs can be laid
     * end to end into four 64-bit words without overlapping bits.
     */
    word[0] = limb[0] | (limb[1] << 51);
    word[1] = (limb[1] >> 13) | (limb[2] << 38);
    word[2] = (limb[2] >> 26) | (limb[3] << 25);
    word[3] = (limb[3] >> 39) | (limb[4] << 12);

    /* Emit each word least significant byte first. */
    for (i = 0; i < 4; i++)
        for (j = 0; j < 8; j++)
            s[8 * i + j] = (uint8_t)(word[i] >> (8 * j));
}

# if defined(X25519_ASM)
void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
void x25519_fe51_sqr(fe51 h, const fe51 f);
void x25519_fe51_mul121666(fe51 h, fe51 f);
#  define fe51_mul x25519_fe51_mul
#  define fe51_sq  x25519_fe51_sqr
#  define fe51_mul121666 x25519_fe51_mul121666
# else

typedef __uint128_t u128;

static void fe51_mul(fe51 h, const fe51 f, const fe51 g)
{
    u128 h0, h1, h2, h3, h4;