Commit 205fd638 authored by Matt Caswell
Browse files

Run util/openssl-format-source on the Curve448 code

parent 1308e022
Loading
Loading
Loading
Loading
+10 −9
Original line number Diff line number Diff line
@@ -16,15 +16,16 @@
# define ARCH_WORD_BITS 32

/*
 * word_is_zero - zero test for a 32-bit word, written without a comparison.
 *
 * Returns 0xFFFFFFFF when a == 0 and 0 for every other value: a is widened
 * to 64 bits and decremented, so only a == 0 wraps around and leaves bits
 * set above bit 31; the shift by 32 then moves those bits into the result.
 *
 * NOTE(review): the signature appears twice below, presumably the old and
 * the reformatted layout of the same line as rendered by the diff view.
 */
static __inline__ __attribute((always_inline, unused))
uint32_t word_is_zero(uint32_t a) {
uint32_t word_is_zero(uint32_t a)
{
    /*
     * Let's hope the compiler isn't clever enough to optimize this.
     * NOTE(review): presumably the concern is keeping the test free of
     * data-dependent branches -- confirm.
     */
    return (((uint64_t)a) - 1) >> 32;
}

/*
 * widemul - multiply two 32-bit words and return the full 64-bit product.
 *
 * a is cast to uint64_t before the multiplication, so b is converted too
 * and the product is computed in 64 bits rather than truncated to 32.
 *
 * NOTE(review): the signature appears twice below, presumably the old and
 * the reformatted layout of the same line as rendered by the diff view.
 */
static __inline__ __attribute((always_inline, unused))
uint64_t widemul(uint32_t a, uint32_t b) {
uint64_t widemul(uint32_t a, uint32_t b)
{
    return ((uint64_t)a) * b;
}

#endif                          /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
+48 −52
Original line number Diff line number Diff line
@@ -20,7 +20,8 @@
# define FOR_LIMB(_i,_start,_end,_x) do { for (_i=_start; _i<_end; _i++) _x; } while (0)
#endif

void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { 
void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
{
    const uint32_t *a = as->limb, *b = bs->limb;
    uint32_t *c = cs->limb;

@@ -37,31 +38,27 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {

    FOR_LIMB(j, 0, 8, {
             accum2 = 0;
    
             FOR_LIMB(i, 0, j + 1, {
                      accum2 += widemul(a[j - i], b[i]);
                      accum1 += widemul(aa[j - i], bb[i]);
                      accum0 += widemul(a[8 + j - i], b[8 + i]);
        });
        
        accum1 -= accum2;
        accum0 += accum2;
                      }
             ); accum1 -= accum2; accum0 += accum2;
             accum2 = 0;
    
             FOR_LIMB(i, j + 1, 8, {
            accum0 -= widemul(a[8+j-i], b[i]);
            accum2 += widemul(aa[8+j-i], bb[i]);
                      accum0 -=
                      widemul(a[8 + j - i], b[i]);
                      accum2 +=
                      widemul(aa[8 + j - i],
                              bb[i]);
                      accum1 += widemul(a[16 + j - i], b[8 + i]);
        });

                      }
             );
             accum1 += accum2;
             accum0 += accum2;

             c[j] = ((uint32_t)(accum0)) & mask;
             c[j + 8] = ((uint32_t)(accum1)) & mask;

        accum0 >>= 28;
        accum1 >>= 28;
             accum0 >>= 28; accum1 >>= 28;
             });

    accum0 += accum1;
@@ -76,7 +73,8 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    c[1] += ((uint32_t)(accum1));
}

void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
{
    const uint32_t *a = as->limb;
    uint32_t *c = cs->limb;
    uint64_t accum0 = 0, accum8 = 0;
@@ -86,9 +84,7 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
    assert(b < 1 << 28);

    FOR_LIMB(i, 0, 8, {
        accum0 += widemul(b, a[i]);
        accum8 += widemul(b, a[i+8]);

             accum0 += widemul(b, a[i]); accum8 += widemul(b, a[i + 8]);
             c[i] = accum0 & mask; accum0 >>= 28;
             c[i + 8] = accum8 & mask; accum8 >>= 28;
             });
@@ -102,7 +98,7 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
    c[1] += accum8 >> 28;
}

/*
 * gf_sqr - square a field element.
 *
 * Passes as to gf_mul() as both operands; gf_mul() writes its result
 * through cs. The inline comment below presumably means that a dedicated
 * squaring routine would be faster than this generic multiply.
 *
 * NOTE(review): the signature appears twice below, presumably the old and
 * the reformatted layout of the same line as rendered by the diff view.
 */
void gf_sqr (gf_s *__restrict__ cs, const gf as) {
void gf_sqr(gf_s * __restrict__ cs, const gf as)
{
    gf_mul(cs, as, as);         /* Performs better with a dedicated square */
}
+17 −14
Original line number Diff line number Diff line
@@ -16,7 +16,8 @@

#define LIMB_PLACE_VALUE(i) 28

void gf_add_RAW (gf out, const gf a, const gf b) {
void gf_add_RAW(gf out, const gf a, const gf b)
{
    unsigned int i;

    for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) {
@@ -24,7 +25,8 @@ void gf_add_RAW (gf out, const gf a, const gf b) {
    }
}

void gf_sub_RAW (gf out, const gf a, const gf b) {
void gf_sub_RAW(gf out, const gf a, const gf b)
{
    unsigned int i;

    for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) {
@@ -32,7 +34,8 @@ void gf_sub_RAW (gf out, const gf a, const gf b) {
    }
}

void gf_bias (gf a, int amt) {
void gf_bias(gf a, int amt)
{
    unsigned int i;
    uint32_t co1 = ((1 << 28) - 1) * amt, co2 = co1 - amt;

@@ -41,7 +44,8 @@ void gf_bias (gf a, int amt) {
    }
}

void gf_weak_reduce (gf a) {
void gf_weak_reduce(gf a)
{
    uint32_t mask = (1 << 28) - 1;
    uint32_t tmp = a->limb[15] >> 28;
    unsigned int i;
@@ -52,4 +56,3 @@ void gf_weak_reduce (gf a) {
    }
    a->limb[0] = (a->limb[0] & mask) + tmp;
}
+15 −11
Original line number Diff line number Diff line
@@ -16,17 +16,21 @@
# define ARCH_WORD_BITS 32

/*
 * word_is_zero - zero test for a 32-bit word using ARM inline assembly.
 *
 * Intended to return 0xFFFFFFFF when a == 0 and 0 otherwise.
 *
 * NOTE(review): the signature appears twice below, presumably the old and
 * the reformatted layout of the same line as rendered by the diff view.
 */
static __inline__ __attribute((always_inline, unused))
uint32_t word_is_zero(uint32_t a) {
uint32_t word_is_zero(uint32_t a)
{
    uint32_t ret;
    /*
     * subs computes a - 1 into ret and sets the flags; sbc then computes
     * ret - ret - borrow, so ret ends up all-ones only when the subtraction
     * borrowed (a == 0) and zero otherwise. The "cc" clobber tells the
     * compiler that the condition flags are modified.
     * NOTE(review): relies on ARM's carry = NOT borrow convention for
     * SUBS/SBC -- confirm against the ARM reference.
     */
 asm("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
    return ret;
}

/*
 * widemul - multiply two 32-bit words and return the full 64-bit product.
 *
 * Written in plain C: a is cast to uint64_t before the multiplication, so
 * the product is computed in 64 bits rather than truncated to 32.
 *
 * NOTE(review): the signature and the UMULL comment each appear twice
 * below, presumably the old and the reformatted layout of the same lines
 * as rendered by the diff view.
 */
static __inline__ __attribute((always_inline, unused))
uint64_t widemul(uint32_t a, uint32_t b) {
    /* Could be UMULL, but it's hard to express to CC that the registers must be different */
uint64_t widemul(uint32_t a, uint32_t b)
{
    /*
     * Could be UMULL, but it's hard to express to CC that the registers must
     * be different
     */
    return ((uint64_t)a) * b;
}

#endif                          /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
+232 −214
Original line number Diff line number Diff line
@@ -13,17 +13,14 @@
#include "f_field.h"

static inline void __attribute__ ((gnu_inline, always_inline))
smlal (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
{

#ifdef  __ARMEL__
    uint32_t lo = *acc, hi = (*acc) >> 32;

    __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
        : [lo]"+&r"(lo), [hi]"+&r"(hi)
    __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo),
                         [hi] "+&r"(hi)
                         :[a] "r"(a),[b] "r"(b));

    *acc = lo + (((uint64_t)hi) << 32);
@@ -33,16 +30,13 @@ smlal (
}

static inline void __attribute__ ((gnu_inline, always_inline))
smlal2 (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
{
#ifdef __ARMEL__
    uint32_t lo = *acc, hi = (*acc) >> 32;

    __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
        : [lo]"+&r"(lo), [hi]"+&r"(hi)
    __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo),
                         [hi] "+&r"(hi)
                         :[a] "r"(a),[b] "r"(2 * b));

    *acc = lo + (((uint64_t)hi) << 32);
@@ -52,16 +46,13 @@ smlal2 (
}

static inline void __attribute__ ((gnu_inline, always_inline))
smull (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    smull(uint64_t *acc, const uint32_t a, const uint32_t b)
{
#ifdef __ARMEL__
    uint32_t lo, hi;

    __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]"
        : [lo]"=&r"(lo), [hi]"=&r"(hi)
    __asm__ __volatile__("smull %[lo], %[hi], %[a], %[b]":[lo] "=&r"(lo),
                         [hi] "=&r"(hi)
                         :[a] "r"(a),[b] "r"(b));

    *acc = lo + (((uint64_t)hi) << 32);
@@ -71,11 +62,8 @@ smull (
}

static inline void __attribute__ ((gnu_inline, always_inline))
smull2 (
    uint64_t *acc,
    const uint32_t a,
    const uint32_t b
) {
    smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
{
#ifdef __ARMEL__
    uint32_t lo, hi;

@@ -89,7 +77,8 @@ smull2 (
#endif
}

void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
{

    const uint32_t *a = as->limb, *b = bs->limb;
    uint32_t *c = cs->limb;
@@ -284,7 +273,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum0, ax, bx = b[8]);
        smlal(&accum2, ax = aa[5], bx);

        
        smlal(&accum0, ax = a[13], bx = b[7]);
        smlal(&accum2, ax = a[14], bx);
        smlal(&accum0, ax, bx = b[6]);
@@ -307,7 +295,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum1, ax, bx = b[0]);
        smlal(&accum3, ax = a[13], bx);

        
        smlal(&accum1, ax = a[5], bx = bm[7]);
        smlal(&accum3, ax = a[6], bx);
        smlal(&accum1, ax, bx = bm[6]);
@@ -365,7 +352,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum0, ax, bx = b[8]);
        smlal(&accum2, ax = aa[7], bx);

        
        smlal(&accum0, ax = a[15], bx = b[7]);

        /* t terms */
@@ -388,7 +374,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum1, ax, bx = b[0]);
        smlal(&accum3, ax = a[15], bx);

        
        smlal(&accum1, ax = a[7], bx = bm[7]);

        /* 1 terms */
@@ -435,7 +420,8 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    c[1] += ((uint32_t)(accum1));
}

void gf_sqr (gf_s *__restrict__ cs, const gf as) {
void gf_sqr(gf_s * __restrict__ cs, const gf as)
{
    const uint32_t *a = as->limb;
    uint32_t *c = cs->limb;

@@ -498,8 +484,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
        smlal2(&accum2, ax = bm[0], bx = bm[1]);
        smlal(&accum0, ax, ax);

        tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
        tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
        tmp = -accum3;
        accum3 = tmp - accum2;
        accum2 = tmp;
        tmp = -accum1;
        accum1 = tmp - accum0;
        accum0 = tmp;

        accum2 += accum0 >> 28;
        accum3 += accum1 >> 28;
@@ -560,9 +550,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
        smlal2(&accum2, ax = bm[1], bx);
        smlal(&accum0, ax, ax);

        
        tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
        tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
        tmp = -accum3;
        accum3 = tmp - accum2;
        accum2 = tmp;
        tmp = -accum1;
        accum1 = tmp - accum0;
        accum0 = tmp;

        accum0 += accumC0;
        accum1 += accumC1;
@@ -596,7 +589,6 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
        smlal2(&accum2, ax = a[10], bx);
        smlal(&accum0, ax, ax);

        
        smlal2(&accum0, ax = a[5], bx = a[7]);
        smlal2(&accum2, ax = a[6], bx);
        smlal(&accum0, ax, ax);
@@ -630,9 +622,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
        smlal2(&accum2, ax = bm[2], bx);
        smlal(&accum0, ax, ax);

        
        tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
        tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
        tmp = -accum3;
        accum3 = tmp - accum2;
        accum2 = tmp;
        tmp = -accum1;
        accum1 = tmp - accum0;
        accum0 = tmp;

        accum0 += accumC0;
        accum1 += accumC1;
@@ -664,7 +659,6 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
        smlal2(&accum2, ax = a[11], bx);
        smlal(&accum0, ax, ax);

        
        smlal(&accum0, ax = a[7], bx = a[7]);

        /* t terms */
@@ -699,9 +693,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
        smlal2(&accum2, ax = bm[3], bx);
        smlal(&accum0, ax, ax);

        tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
        tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
        
        tmp = -accum3;
        accum3 = tmp - accum2;
        accum2 = tmp;
        tmp = -accum1;
        accum1 = tmp - accum0;
        accum0 = tmp;

        accum0 += accumC0;
        accum1 += accumC1;
@@ -729,11 +726,8 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
    c[1] += ((uint32_t)(accum1));
}

void gf_mulw_unsigned (
    gf_s *__restrict__ cs,
    const gf as,
    uint32_t b
) {
void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
{
    uint32_t mask = (1ull << 28) - 1;
    assert(b <= mask);

@@ -745,75 +739,99 @@ void gf_mulw_unsigned (
    int i;

    uint32_t c0, c8, n0, n8;
    c0 = a[0]; c8 = a[8];
    c0 = a[0];
    c8 = a[8];
    accum0 = widemul(b, c0);
    accum8 = widemul(b, c8);

    c[0] = accum0 & mask; accum0 >>= 28;
    c[8] = accum8 & mask; accum8 >>= 28;
    c[0] = accum0 & mask;
    accum0 >>= 28;
    c[8] = accum8 & mask;
    accum8 >>= 28;

    i = 1;
    {
        n0 = a[i]; n8 = a[i+8];
        n0 = a[i];
        n8 = a[i + 8];
        smlal(&accum0, b, n0);
        smlal(&accum8, b, n8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        c[i] = accum0 & mask;
        accum0 >>= 28;
        c[i + 8] = accum8 & mask;
        accum8 >>= 28;
        i++;
    }
    {
        c0 = a[i]; c8 = a[i+8];
        c0 = a[i];
        c8 = a[i + 8];
        smlal(&accum0, b, c0);
        smlal(&accum8, b, c8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        c[i] = accum0 & mask;
        accum0 >>= 28;
        c[i + 8] = accum8 & mask;
        accum8 >>= 28;
        i++;
    }
    {
        n0 = a[i]; n8 = a[i+8];
        n0 = a[i];
        n8 = a[i + 8];
        smlal(&accum0, b, n0);
        smlal(&accum8, b, n8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        c[i] = accum0 & mask;
        accum0 >>= 28;
        c[i + 8] = accum8 & mask;
        accum8 >>= 28;
        i++;
    }
    {
        c0 = a[i]; c8 = a[i+8];
        c0 = a[i];
        c8 = a[i + 8];
        smlal(&accum0, b, c0);
        smlal(&accum8, b, c8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        c[i] = accum0 & mask;
        accum0 >>= 28;
        c[i + 8] = accum8 & mask;
        accum8 >>= 28;
        i++;
    }
    {
        n0 = a[i]; n8 = a[i+8];
        n0 = a[i];
        n8 = a[i + 8];
        smlal(&accum0, b, n0);
        smlal(&accum8, b, n8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        c[i] = accum0 & mask;
        accum0 >>= 28;
        c[i + 8] = accum8 & mask;
        accum8 >>= 28;
        i++;
    }
    {
        c0 = a[i]; c8 = a[i+8];
        c0 = a[i];
        c8 = a[i + 8];
        smlal(&accum0, b, c0);
        smlal(&accum8, b, c8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        c[i] = accum0 & mask;
        accum0 >>= 28;
        c[i + 8] = accum8 & mask;
        accum8 >>= 28;
        i++;
    }
    {
        n0 = a[i]; n8 = a[i+8];
        n0 = a[i];
        n8 = a[i + 8];
        smlal(&accum0, b, n0);
        smlal(&accum8, b, n8);

        c[i] = accum0 & mask; accum0 >>= 28;
        c[i+8] = accum8 & mask; accum8 >>= 28;
        c[i] = accum0 & mask;
        accum0 >>= 28;
        c[i + 8] = accum8 & mask;
        accum8 >>= 28;
        i++;
    }

Loading