Loading crypto/ec/curve448/arch_32/arch_intrinsics.h +10 −9 Original line number Diff line number Diff line Loading @@ -16,15 +16,16 @@ # define ARCH_WORD_BITS 32 static __inline__ __attribute((always_inline, unused)) uint32_t word_is_zero(uint32_t a) { uint32_t word_is_zero(uint32_t a) { /* let's hope the compiler isn't clever enough to optimize this. */ return (((uint64_t)a) - 1) >> 32; } static __inline__ __attribute((always_inline, unused)) uint64_t widemul(uint32_t a, uint32_t b) { uint64_t widemul(uint32_t a, uint32_t b) { return ((uint64_t)a) * b; } #endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */ crypto/ec/curve448/arch_32/f_impl.c +48 −52 Original line number Diff line number Diff line Loading @@ -20,7 +20,8 @@ # define FOR_LIMB(_i,_start,_end,_x) do { for (_i=_start; _i<_end; _i++) _x; } while (0) #endif void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs) { const uint32_t *a = as->limb, *b = bs->limb; uint32_t *c = cs->limb; Loading @@ -37,31 +38,27 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { FOR_LIMB(j, 0, 8, { accum2 = 0; FOR_LIMB(i, 0, j + 1, { accum2 += widemul(a[j - i], b[i]); accum1 += widemul(aa[j - i], bb[i]); accum0 += widemul(a[8 + j - i], b[8 + i]); }); accum1 -= accum2; accum0 += accum2; } ); accum1 -= accum2; accum0 += accum2; accum2 = 0; FOR_LIMB(i, j + 1, 8, { accum0 -= widemul(a[8+j-i], b[i]); accum2 += widemul(aa[8+j-i], bb[i]); accum0 -= widemul(a[8 + j - i], b[i]); accum2 += widemul(aa[8 + j - i], bb[i]); accum1 += widemul(a[16 + j - i], b[8 + i]); }); } ); accum1 += accum2; accum0 += accum2; c[j] = ((uint32_t)(accum0)) & mask; c[j + 8] = ((uint32_t)(accum1)) & mask; accum0 >>= 28; accum1 >>= 28; accum0 >>= 28; accum1 >>= 28; }); accum0 += accum1; Loading @@ -76,7 +73,8 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { c[1] += ((uint32_t)(accum1)); } void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) { void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b) { const uint32_t *a = as->limb; uint32_t *c = cs->limb; uint64_t accum0 = 0, accum8 = 0; Loading @@ -86,9 +84,7 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) { assert(b < 1 << 28); FOR_LIMB(i, 0, 8, { accum0 += widemul(b, a[i]); accum8 += widemul(b, a[i+8]); accum0 += widemul(b, a[i]); accum8 += widemul(b, a[i + 8]); c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; }); Loading @@ -102,7 +98,7 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) { c[1] += accum8 >> 28; } void gf_sqr (gf_s *__restrict__ cs, const gf as) { void gf_sqr(gf_s * __restrict__ cs, const gf as) { gf_mul(cs, as, as); /* Performs better with a dedicated square */ } crypto/ec/curve448/arch_32/f_impl.h +17 −14 Original line number Diff line number Diff line Loading @@ -16,7 +16,8 @@ #define LIMB_PLACE_VALUE(i) 28 void gf_add_RAW (gf out, const gf a, const gf b) { void gf_add_RAW(gf out, const gf a, const gf b) { unsigned int i; for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) { Loading @@ -24,7 +25,8 @@ void gf_add_RAW (gf out, const gf a, const gf b) { } } void gf_sub_RAW (gf out, const gf a, const gf b) { void gf_sub_RAW(gf out, const gf a, const gf b) { unsigned int i; for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) { Loading @@ -32,7 +34,8 @@ void gf_sub_RAW (gf out, const gf a, const gf b) { } } void gf_bias (gf a, int amt) { void gf_bias(gf a, int amt) { unsigned int i; uint32_t co1 = ((1 << 28) - 1) * amt, co2 = co1 - amt; Loading @@ -41,7 +44,8 @@ void gf_bias (gf a, int amt) { } } void gf_weak_reduce (gf a) { void gf_weak_reduce(gf a) { uint32_t mask = (1 << 28) - 1; uint32_t tmp = a->limb[15] >> 28; unsigned int i; Loading @@ -52,4 +56,3 @@ void gf_weak_reduce (gf a) { } a->limb[0] = (a->limb[0] & mask) + tmp; } crypto/ec/curve448/arch_arm_32/arch_intrinsics.h +15 −11 Original line number Diff line number Diff line Loading @@ -16,17 +16,21 @@ # define ARCH_WORD_BITS 32 static __inline__ __attribute((always_inline, unused)) uint32_t word_is_zero(uint32_t a) { uint32_t word_is_zero(uint32_t a) { uint32_t ret; asm("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc"); return ret; } static __inline__ __attribute((always_inline, unused)) uint64_t widemul(uint32_t a, uint32_t b) { /* Could be UMULL, but it's hard to express to CC that the registers must be different */ uint64_t widemul(uint32_t a, uint32_t b) { /* * Could be UMULL, but it's hard to express to CC that the registers must * be different */ return ((uint64_t)a) * b; } #endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */ crypto/ec/curve448/arch_arm_32/f_impl.c +232 −214 Original line number Diff line number Diff line Loading @@ -13,17 +13,14 @@ #include "f_field.h" static inline void __attribute__ ((gnu_inline, always_inline)) smlal ( uint64_t *acc, const uint32_t a, const uint32_t b ) { smlal(uint64_t *acc, const uint32_t a, const uint32_t b) { #ifdef __ARMEL__ uint32_t lo = *acc, hi = (*acc) >> 32; __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" : [lo]"+&r"(lo), [hi]"+&r"(hi) __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo), [hi] "+&r"(hi) :[a] "r"(a),[b] "r"(b)); *acc = lo + (((uint64_t)hi) << 32); Loading @@ -33,16 +30,13 @@ smlal ( } static inline void __attribute__ ((gnu_inline, always_inline)) smlal2 ( uint64_t *acc, const uint32_t a, const uint32_t b ) { smlal2(uint64_t *acc, const uint32_t a, const uint32_t b) { #ifdef __ARMEL__ uint32_t lo = *acc, hi = (*acc) >> 32; __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" : [lo]"+&r"(lo), [hi]"+&r"(hi) __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo), [hi] "+&r"(hi) :[a] "r"(a),[b] "r"(2 * b)); *acc = lo + (((uint64_t)hi) << 32); Loading @@ -52,16 +46,13 @@ smlal2 ( } static inline void __attribute__ ((gnu_inline, always_inline)) smull ( uint64_t *acc, const uint32_t a, const uint32_t b ) { smull(uint64_t *acc, const uint32_t a, const uint32_t b) { #ifdef __ARMEL__ uint32_t lo, hi; __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]" : [lo]"=&r"(lo), [hi]"=&r"(hi) __asm__ __volatile__("smull %[lo], %[hi], %[a], %[b]":[lo] "=&r"(lo), [hi] "=&r"(hi) :[a] "r"(a),[b] "r"(b)); *acc = lo + (((uint64_t)hi) << 32); Loading @@ -71,11 +62,8 @@ smull ( } static inline void __attribute__ ((gnu_inline, always_inline)) smull2 ( uint64_t *acc, const uint32_t a, const uint32_t b ) { smull2(uint64_t *acc, const uint32_t a, const uint32_t b) { #ifdef __ARMEL__ uint32_t lo, hi; Loading @@ -89,7 +77,8 @@ smull2 ( #endif } void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs) { const uint32_t *a = as->limb, *b = bs->limb; uint32_t *c = cs->limb; Loading Loading @@ -284,7 +273,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum0, ax, bx = b[8]); smlal(&accum2, ax = aa[5], bx); smlal(&accum0, ax = a[13], bx = b[7]); smlal(&accum2, ax = a[14], bx); smlal(&accum0, ax, bx = b[6]); Loading @@ -307,7 +295,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum1, ax, bx = b[0]); smlal(&accum3, ax = a[13], bx); smlal(&accum1, ax = a[5], bx = bm[7]); smlal(&accum3, ax = a[6], bx); smlal(&accum1, ax, bx = bm[6]); Loading Loading @@ -365,7 +352,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum0, ax, bx = b[8]); smlal(&accum2, ax = aa[7], bx); smlal(&accum0, ax = a[15], bx = b[7]); /* t terms */ Loading @@ -388,7 +374,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum1, ax, bx = b[0]); smlal(&accum3, ax = a[15], bx); smlal(&accum1, ax = a[7], bx = bm[7]); /* 1 terms */ Loading Loading @@ -435,7 +420,8 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { c[1] += ((uint32_t)(accum1)); } void gf_sqr (gf_s *__restrict__ cs, const gf as) { void gf_sqr(gf_s * __restrict__ cs, const gf as) { const uint32_t *a = as->limb; uint32_t *c = cs->limb; Loading Loading @@ -498,8 +484,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { smlal2(&accum2, ax = bm[0], bx = bm[1]); smlal(&accum0, ax, ax); tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; tmp = -accum3; accum3 = tmp - accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp - accum0; accum0 = tmp; accum2 += accum0 >> 28; accum3 += accum1 >> 28; Loading Loading @@ -560,9 +550,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { smlal2(&accum2, ax = bm[1], bx); smlal(&accum0, ax, ax); tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; tmp = -accum3; accum3 = tmp - accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp - accum0; accum0 = tmp; accum0 += accumC0; accum1 += accumC1; Loading Loading @@ -596,7 +589,6 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { smlal2(&accum2, ax = a[10], bx); smlal(&accum0, ax, ax); smlal2(&accum0, ax = a[5], bx = a[7]); smlal2(&accum2, ax = a[6], bx); smlal(&accum0, ax, ax); Loading Loading @@ -630,9 +622,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { smlal2(&accum2, ax = bm[2], bx); smlal(&accum0, ax, ax); tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; tmp = -accum3; accum3 = tmp - accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp - accum0; accum0 = tmp; accum0 += accumC0; accum1 += accumC1; Loading Loading @@ -664,7 +659,6 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { smlal2(&accum2, ax = a[11], bx); smlal(&accum0, ax, ax); smlal(&accum0, ax = a[7], bx = a[7]); /* t terms */ Loading Loading @@ -699,9 +693,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { smlal2(&accum2, ax = bm[3], bx); smlal(&accum0, ax, ax); tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; tmp = -accum3; accum3 = tmp - accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp - accum0; accum0 = tmp; accum0 += accumC0; accum1 += accumC1; Loading Loading @@ -729,11 +726,8 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { c[1] += ((uint32_t)(accum1)); } void gf_mulw_unsigned ( gf_s *__restrict__ cs, const gf as, uint32_t b ) { void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b) { uint32_t mask = (1ull << 28) - 1; assert(b <= mask); Loading @@ -745,75 +739,99 @@ void gf_mulw_unsigned ( int i; uint32_t c0, c8, n0, n8; c0 = a[0]; c8 = a[8]; c0 = a[0]; c8 = a[8]; accum0 = widemul(b, c0); accum8 = widemul(b, c8); c[0] = accum0 & mask; accum0 >>= 28; c[8] = accum8 & mask; accum8 >>= 28; c[0] = accum0 & mask; accum0 >>= 28; c[8] = accum8 & mask; accum8 >>= 28; i = 1; { n0 = a[i]; n8 = a[i+8]; n0 = a[i]; n8 = a[i + 8]; smlal(&accum0, b, n0); smlal(&accum8, b, n8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; i++; } { c0 = a[i]; c8 = a[i+8]; c0 = a[i]; c8 = a[i + 8]; smlal(&accum0, b, c0); smlal(&accum8, b, c8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; i++; } { n0 = a[i]; n8 = a[i+8]; n0 = a[i]; n8 = a[i + 8]; smlal(&accum0, b, n0); smlal(&accum8, b, n8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; i++; } { c0 = a[i]; c8 = a[i+8]; c0 = a[i]; c8 = a[i + 8]; smlal(&accum0, b, c0); smlal(&accum8, b, c8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; i++; } { n0 = a[i]; n8 = a[i+8]; n0 = a[i]; n8 = a[i + 8]; smlal(&accum0, b, n0); smlal(&accum8, b, n8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; i++; } { c0 = a[i]; c8 = a[i+8]; c0 = a[i]; c8 = a[i + 8]; smlal(&accum0, b, c0); smlal(&accum8, b, c8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; i++; } { n0 = a[i]; n8 = a[i+8]; n0 = a[i]; n8 = a[i + 8]; smlal(&accum0, b, n0); smlal(&accum8, b, n8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; i++; } Loading Loading
crypto/ec/curve448/arch_32/arch_intrinsics.h +10 −9 Original line number Diff line number Diff line Loading @@ -16,15 +16,16 @@ # define ARCH_WORD_BITS 32 static __inline__ __attribute((always_inline, unused)) uint32_t word_is_zero(uint32_t a) { uint32_t word_is_zero(uint32_t a) { /* let's hope the compiler isn't clever enough to optimize this. */ return (((uint64_t)a) - 1) >> 32; } static __inline__ __attribute((always_inline, unused)) uint64_t widemul(uint32_t a, uint32_t b) { uint64_t widemul(uint32_t a, uint32_t b) { return ((uint64_t)a) * b; } #endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
crypto/ec/curve448/arch_32/f_impl.c +48 −52 Original line number Diff line number Diff line Loading @@ -20,7 +20,8 @@ # define FOR_LIMB(_i,_start,_end,_x) do { for (_i=_start; _i<_end; _i++) _x; } while (0) #endif void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs) { const uint32_t *a = as->limb, *b = bs->limb; uint32_t *c = cs->limb; Loading @@ -37,31 +38,27 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { FOR_LIMB(j, 0, 8, { accum2 = 0; FOR_LIMB(i, 0, j + 1, { accum2 += widemul(a[j - i], b[i]); accum1 += widemul(aa[j - i], bb[i]); accum0 += widemul(a[8 + j - i], b[8 + i]); }); accum1 -= accum2; accum0 += accum2; } ); accum1 -= accum2; accum0 += accum2; accum2 = 0; FOR_LIMB(i, j + 1, 8, { accum0 -= widemul(a[8+j-i], b[i]); accum2 += widemul(aa[8+j-i], bb[i]); accum0 -= widemul(a[8 + j - i], b[i]); accum2 += widemul(aa[8 + j - i], bb[i]); accum1 += widemul(a[16 + j - i], b[8 + i]); }); } ); accum1 += accum2; accum0 += accum2; c[j] = ((uint32_t)(accum0)) & mask; c[j + 8] = ((uint32_t)(accum1)) & mask; accum0 >>= 28; accum1 >>= 28; accum0 >>= 28; accum1 >>= 28; }); accum0 += accum1; Loading @@ -76,7 +73,8 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { c[1] += ((uint32_t)(accum1)); } void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) { void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b) { const uint32_t *a = as->limb; uint32_t *c = cs->limb; uint64_t accum0 = 0, accum8 = 0; Loading @@ -86,9 +84,7 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) { assert(b < 1 << 28); FOR_LIMB(i, 0, 8, { accum0 += widemul(b, a[i]); accum8 += widemul(b, a[i+8]); accum0 += widemul(b, a[i]); accum8 += widemul(b, a[i + 8]); c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; }); Loading @@ -102,7 +98,7 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) { c[1] += accum8 >> 28; } void gf_sqr (gf_s *__restrict__ cs, const gf as) { void gf_sqr(gf_s * __restrict__ cs, const gf as) { gf_mul(cs, as, as); /* Performs better with a dedicated square */ }
crypto/ec/curve448/arch_32/f_impl.h +17 −14 Original line number Diff line number Diff line Loading @@ -16,7 +16,8 @@ #define LIMB_PLACE_VALUE(i) 28 void gf_add_RAW (gf out, const gf a, const gf b) { void gf_add_RAW(gf out, const gf a, const gf b) { unsigned int i; for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) { Loading @@ -24,7 +25,8 @@ void gf_add_RAW (gf out, const gf a, const gf b) { } } void gf_sub_RAW (gf out, const gf a, const gf b) { void gf_sub_RAW(gf out, const gf a, const gf b) { unsigned int i; for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) { Loading @@ -32,7 +34,8 @@ void gf_sub_RAW (gf out, const gf a, const gf b) { } } void gf_bias (gf a, int amt) { void gf_bias(gf a, int amt) { unsigned int i; uint32_t co1 = ((1 << 28) - 1) * amt, co2 = co1 - amt; Loading @@ -41,7 +44,8 @@ void gf_bias (gf a, int amt) { } } void gf_weak_reduce (gf a) { void gf_weak_reduce(gf a) { uint32_t mask = (1 << 28) - 1; uint32_t tmp = a->limb[15] >> 28; unsigned int i; Loading @@ -52,4 +56,3 @@ void gf_weak_reduce (gf a) { } a->limb[0] = (a->limb[0] & mask) + tmp; }
crypto/ec/curve448/arch_arm_32/arch_intrinsics.h +15 −11 Original line number Diff line number Diff line Loading @@ -16,17 +16,21 @@ # define ARCH_WORD_BITS 32 static __inline__ __attribute((always_inline, unused)) uint32_t word_is_zero(uint32_t a) { uint32_t word_is_zero(uint32_t a) { uint32_t ret; asm("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc"); return ret; } static __inline__ __attribute((always_inline, unused)) uint64_t widemul(uint32_t a, uint32_t b) { /* Could be UMULL, but it's hard to express to CC that the registers must be different */ uint64_t widemul(uint32_t a, uint32_t b) { /* * Could be UMULL, but it's hard to express to CC that the registers must * be different */ return ((uint64_t)a) * b; } #endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
crypto/ec/curve448/arch_arm_32/f_impl.c +232 −214 Original line number Diff line number Diff line Loading @@ -13,17 +13,14 @@ #include "f_field.h" static inline void __attribute__ ((gnu_inline, always_inline)) smlal ( uint64_t *acc, const uint32_t a, const uint32_t b ) { smlal(uint64_t *acc, const uint32_t a, const uint32_t b) { #ifdef __ARMEL__ uint32_t lo = *acc, hi = (*acc) >> 32; __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" : [lo]"+&r"(lo), [hi]"+&r"(hi) __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo), [hi] "+&r"(hi) :[a] "r"(a),[b] "r"(b)); *acc = lo + (((uint64_t)hi) << 32); Loading @@ -33,16 +30,13 @@ smlal ( } static inline void __attribute__ ((gnu_inline, always_inline)) smlal2 ( uint64_t *acc, const uint32_t a, const uint32_t b ) { smlal2(uint64_t *acc, const uint32_t a, const uint32_t b) { #ifdef __ARMEL__ uint32_t lo = *acc, hi = (*acc) >> 32; __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" : [lo]"+&r"(lo), [hi]"+&r"(hi) __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo), [hi] "+&r"(hi) :[a] "r"(a),[b] "r"(2 * b)); *acc = lo + (((uint64_t)hi) << 32); Loading @@ -52,16 +46,13 @@ smlal2 ( } static inline void __attribute__ ((gnu_inline, always_inline)) smull ( uint64_t *acc, const uint32_t a, const uint32_t b ) { smull(uint64_t *acc, const uint32_t a, const uint32_t b) { #ifdef __ARMEL__ uint32_t lo, hi; __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]" : [lo]"=&r"(lo), [hi]"=&r"(hi) __asm__ __volatile__("smull %[lo], %[hi], %[a], %[b]":[lo] "=&r"(lo), [hi] "=&r"(hi) :[a] "r"(a),[b] "r"(b)); *acc = lo + (((uint64_t)hi) << 32); Loading @@ -71,11 +62,8 @@ smull ( } static inline void __attribute__ ((gnu_inline, always_inline)) smull2 ( uint64_t *acc, const uint32_t a, const uint32_t b ) { smull2(uint64_t *acc, const uint32_t a, const uint32_t b) { #ifdef __ARMEL__ uint32_t lo, hi; Loading @@ -89,7 +77,8 @@ smull2 ( #endif } void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs) { const uint32_t *a = as->limb, *b = bs->limb; uint32_t *c = cs->limb; Loading Loading @@ -284,7 +273,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum0, ax, bx = b[8]); smlal(&accum2, ax = aa[5], bx); smlal(&accum0, ax = a[13], bx = b[7]); smlal(&accum2, ax = a[14], bx); smlal(&accum0, ax, bx = b[6]); Loading @@ -307,7 +295,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum1, ax, bx = b[0]); smlal(&accum3, ax = a[13], bx); smlal(&accum1, ax = a[5], bx = bm[7]); smlal(&accum3, ax = a[6], bx); smlal(&accum1, ax, bx = bm[6]); Loading Loading @@ -365,7 +352,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum0, ax, bx = b[8]); smlal(&accum2, ax = aa[7], bx); smlal(&accum0, ax = a[15], bx = b[7]); /* t terms */ Loading @@ -388,7 +374,6 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { smlal(&accum1, ax, bx = b[0]); smlal(&accum3, ax = a[15], bx); smlal(&accum1, ax = a[7], bx = bm[7]); /* 1 terms */ Loading Loading @@ -435,7 +420,8 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { c[1] += ((uint32_t)(accum1)); } void gf_sqr (gf_s *__restrict__ cs, const gf as) { void gf_sqr(gf_s * __restrict__ cs, const gf as) { const uint32_t *a = as->limb; uint32_t *c = cs->limb; Loading Loading @@ -498,8 +484,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { smlal2(&accum2, ax = bm[0], bx = bm[1]); smlal(&accum0, ax, ax); tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; tmp = -accum3; accum3 = tmp - accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp - accum0; accum0 = tmp; accum2 += accum0 >> 28; accum3 += accum1 >> 28; Loading Loading @@ -560,9 +550,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { smlal2(&accum2, ax = bm[1], bx); smlal(&accum0, ax, ax); tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; tmp = -accum3; accum3 = tmp - accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp - accum0; accum0 = tmp; accum0 += accumC0; accum1 += accumC1; Loading Loading @@ -596,7 +589,6 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { smlal2(&accum2, ax = a[10], bx); smlal(&accum0, ax, ax); smlal2(&accum0, ax = a[5], bx = a[7]); smlal2(&accum2, ax = a[6], bx); smlal(&accum0, ax, ax); Loading Loading @@ -630,9 +622,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { smlal2(&accum2, ax = bm[2], bx); smlal(&accum0, ax, ax); tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; tmp = -accum3; accum3 = tmp - accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp - accum0; accum0 = tmp; accum0 += accumC0; accum1 += accumC1; Loading Loading @@ -664,7 +659,6 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { smlal2(&accum2, ax = a[11], bx); smlal(&accum0, ax, ax); smlal(&accum0, ax = a[7], bx = a[7]); /* t terms */ Loading Loading @@ -699,9 +693,12 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { smlal2(&accum2, ax = bm[3], bx); smlal(&accum0, ax, ax); tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp; tmp = -accum3; accum3 = tmp - accum2; accum2 = tmp; tmp = -accum1; accum1 = tmp - accum0; accum0 = tmp; accum0 += accumC0; accum1 += accumC1; Loading Loading @@ -729,11 +726,8 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) { c[1] += ((uint32_t)(accum1)); } void gf_mulw_unsigned ( gf_s *__restrict__ cs, const gf as, uint32_t b ) { void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b) { uint32_t mask = (1ull << 28) - 1; assert(b <= mask); Loading @@ -745,75 +739,99 @@ void gf_mulw_unsigned ( int i; uint32_t c0, c8, n0, n8; c0 = a[0]; c8 = a[8]; c0 = a[0]; c8 = a[8]; accum0 = widemul(b, c0); accum8 = widemul(b, c8); c[0] = accum0 & mask; accum0 >>= 28; c[8] = accum8 & mask; accum8 >>= 28; c[0] = accum0 & mask; accum0 >>= 28; c[8] = accum8 & mask; accum8 >>= 28; i = 1; { n0 = a[i]; n8 = a[i+8]; n0 = a[i]; n8 = a[i + 8]; smlal(&accum0, b, n0); smlal(&accum8, b, n8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; i++; } { c0 = a[i]; c8 = a[i+8]; c0 = a[i]; c8 = a[i + 8]; smlal(&accum0, b, c0); smlal(&accum8, b, c8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; i++; } { n0 = a[i]; n8 = a[i+8]; n0 = a[i]; n8 = a[i + 8]; smlal(&accum0, b, n0); smlal(&accum8, b, n8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; i++; } { c0 = a[i]; c8 = a[i+8]; c0 = a[i]; c8 = a[i + 8]; smlal(&accum0, b, c0); smlal(&accum8, b, c8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; i++; } { n0 = a[i]; n8 = a[i+8]; n0 = a[i]; n8 = a[i + 8]; smlal(&accum0, b, n0); smlal(&accum8, b, n8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; i++; } { c0 = a[i]; c8 = a[i+8]; c0 = a[i]; c8 = a[i + 8]; smlal(&accum0, b, c0); smlal(&accum8, b, c8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; i++; } { n0 = a[i]; n8 = a[i+8]; n0 = a[i]; n8 = a[i + 8]; smlal(&accum0, b, n0); smlal(&accum8, b, n8); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; c[i] = accum0 & mask; accum0 >>= 28; c[i + 8] = accum8 & mask; accum8 >>= 28; i++; } Loading