tls: P256: add 64-bit montgomery reduce (disabled), small optimization in 32-bit code

function                                             old     new   delta
sp_512to256_mont_reduce_8                            191     185      -6

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
commit 90b0d33044
parent 832626227e

@@ -705,36 +705,174 @@ static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a /*, const sp_digit*
     }
 }
 
-/* Shift the result in the high 256 bits down to the bottom.
- */
+/* Shift the result in the high 256 bits down to the bottom. */
 static void sp_512to256_mont_shift_8(sp_digit* r, sp_digit* a)
 {
     memcpy(r, a + 8, sizeof(*r) * 8);
 }
 
+// Disabled for now. Seems to work, but ugly and 40 bytes larger on x86-64.
+#if 0 //UNALIGNED_LE_64BIT
+/* 64-bit little-endian optimized version.
+ * See generic 32-bit version below for explanation.
+ * The benefit of this version is: even though r[3] calculation is atrocious,
+ * we call sp_256_mul_add_4() four times, not 8.
+ */
+static int sp_256_mul_add_4(uint64_t *r /*, const uint64_t* a, uint64_t b*/)
+{
+    uint64_t b = r[0];
+
+# if 0
+    const uint64_t* a = (const void*)p256_mod;
+//a[3..0] = ffffffff00000001 0000000000000000 00000000ffffffff ffffffffffffffff
+    uint128_t t;
+    int i;
+    t = 0;
+    for (i = 0; i < 4; i++) {
+        uint32_t t_hi;
+        uint128_t m = ((uint128_t)b * a[i]) + r[i];
+        t += m;
+        t_hi = (t < m);
+        r[i] = (uint64_t)t;
+        t = (t >> 64) | ((uint128_t)t_hi << 64);
+    }
+    r[4] += (uint64_t)t;
+    return (r[4] < (uint64_t)t); /* 1 if addition overflowed */
+# else
+    // Unroll, then optimize the above loop:
+    //uint32_t t_hi;
+    //uint128_t m;
+    uint64_t t64, t64u;
+
+    //m = ((uint128_t)b * a[0]) + r[0];
+    // Since b is r[0] and a[0] is ffffffffffffffff, the above optimizes to:
+    // m = r[0] * ffffffffffffffff + r[0] = (r[0] << 64 - r[0]) + r[0] = r[0] << 64;
+    //t += m;
+    // t = r[0] << 64 = b << 64;
+    //t_hi = (t < m);
+    // t_hi = 0;
+    //r[0] = (uint64_t)t;
+    // r[0] = 0;
+    //the store can be eliminated since caller won't look at lower 256 bits of the result
+    //t = (t >> 64) | ((uint128_t)t_hi << 64);
+    // t = b;
+
+    //m = ((uint128_t)b * a[1]) + r[1];
+    // Since a[1] is 00000000ffffffff, the above optimizes to:
+    // m = b * ffffffff + r[1] = (b * 100000000 - b) + r[1] = (b << 32) - b + r[1];
+    //t += m;
+    // t = b + (b << 32) - b + r[1] = (b << 32) + r[1];
+    //t_hi = (t < m);
+    // t_hi = 0;
+    //r[1] = (uint64_t)t;
+    r[1] += (b << 32);
+    //t = (t >> 64) | ((uint128_t)t_hi << 64);
+    t64 = (r[1] < (b << 32));
+    t64 += (b >> 32);
+
+    //m = ((uint128_t)b * a[2]) + r[2];
+    // Since a[2] is 0000000000000000, the above optimizes to:
+    // m = b * 0 + r[2] = r[2];
+    //t += m;
+    // t = t64 + r[2];
+    //t_hi = (t < m);
+    // t_hi = 0;
+    //r[2] = (uint64_t)t;
+    r[2] += t64;
+    //t = (t >> 64) | ((uint128_t)t_hi << 64);
+    t64 = (r[2] < t64);
+
+    //m = ((uint128_t)b * a[3]) + r[3];
+    // Since a[3] is ffffffff00000001, the above optimizes to:
+    // m = b * ffffffff00000001 + r[3];
+    // m = b + b*ffffffff00000000 + r[3]
+    // m = b + (b*ffffffff << 32) + r[3]
+    // m = b + (((b<<32) - b) << 32) + r[3]
+    //t += m;
+    // t = t64 + (uint128_t)b + ((((uint128_t)b << 32) - b) << 32) + r[3];
+    t64 += b;
+    t64u = (t64 < b);
+    t64 += r[3];
+    t64u += (t64 < r[3]);
+    {
+        uint64_t lo,hi;
+        //lo = (((b << 32) - b) << 32
+        //hi = (((uint128_t)b << 32) - b) >> 32
+        //but without uint128_t:
+        hi = (b << 32) - b; /* form lower 32 bits of "hi" part 1 */
+        b = (b >> 32) - (/*borrowed above?*/(b << 32) < b); /* upper 32 bits of "hi" are in b */
+        lo = hi << 32;      /* (use "hi" value to calculate "lo",... */
+        t64 += lo;          /* ...consume... */
+        t64u += (t64 < lo); /* ..."lo") */
+        hi >>= 32;          /* form lower 32 bits of "hi" part 2 */
+        hi |= (b << 32);    /* combine lower and upper */
+        t64u += hi;         /* consume "hi" */
+    }
+    //t_hi = (t < m);
+    // t_hi = 0;
+    //r[3] = (uint64_t)t;
+    r[3] = t64;
+    //t = (t >> 64) | ((uint128_t)t_hi << 64);
+    // t = t64u;
+
+    r[4] += t64u;
+    return (r[4] < t64u); /* 1 if addition overflowed */
+# endif
+}
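The unrolled arithmetic above leans on a few 128-bit identities about the modulus words (a[0] = ffffffffffffffff with b == r[0], a[1] = 00000000ffffffff, a[3] = ffffffff00000001). A minimal standalone spot-check of those identities (hypothetical test code, not part of this commit; assumes a compiler with unsigned __int128, e.g. gcc or clang):

#include <assert.h>
#include <stdint.h>

typedef unsigned __int128 u128;

int main(void)
{
    uint64_t b  = 0x1122334455667788ULL; /* arbitrary r[0] */
    uint64_t r1 = 0xfedcba9876543210ULL; /* arbitrary r[1] */

    /* a[0] step: b*ffffffffffffffff + r[0] with r[0] == b collapses to b << 64,
     * so the low word is 0 and the carry into the next word is b. */
    assert((u128)b * 0xffffffffffffffffULL + b == (u128)b << 64);

    /* a[1] step: b*ffffffff + r[1] == ((b << 32) - b) + r[1]. */
    assert((u128)b * 0xffffffffULL + r1 == (((u128)b << 32) - b) + r1);

    /* a[3] step: b*ffffffff00000001 == b + ((((u128)b << 32) - b) << 32). */
    assert((u128)b * 0xffffffff00000001ULL
           == (u128)b + ((((u128)b << 32) - b) << 32));

    return 0;
}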
+
+static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* aa/*, const sp_digit* m, sp_digit mp*/)
+{
+//  const sp_digit* m = p256_mod;
+    int i;
+    uint64_t *a = (void*)aa;
+
+    sp_digit carry = 0;
+    for (i = 0; i < 4; i++) {
+//      mu = a[i];
+        if (sp_256_mul_add_4(a+i /*, m, mu*/)) {
+            int j = i + 4;
+ inc_next_word:
+            if (++j > 7) { /* a[8] array has no more words? */
+                carry++;
+                continue;
+            }
+            if (++a[j] == 0) /* did this overflow too? */
+                goto inc_next_word;
+        }
+    }
+    sp_512to256_mont_shift_8(r, aa);
+    if (carry != 0)
+        sp_256_sub_8_p256_mod(r);
+    sp_256_norm_8(r);
+}
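When sp_256_mul_add_4() reports a carry out of word i+4, the goto loop above ripples it into the higher words of a[], bumping carry if it falls off the top. The same logic as a standalone helper, shown only for illustration (hypothetical, not part of this commit):

#include <stdint.h>

/* Add 1 to a[j+1], rippling upward while words wrap around to 0.
 * Returns 1 if the carry propagated past a[top]. */
static int ripple_carry(uint64_t *a, int j, int top)
{
    while (++j <= top) {
        if (++a[j] != 0) /* carry absorbed, no wrap */
            return 0;
    }
    return 1;
}

With a[] holding 8 uint64_t words, ripple_carry(a, i + 4, 7) behaves like the inc_next_word loop.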
+
+#else /* Generic 32-bit version */
+
 /* Mul a by scalar b and add into r. (r += a * b)
  * a = p256_mod
  * b = r[0]
  */
 static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 {
-//  const sp_digit* a = p256_mod;
-//a[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
     sp_digit b = r[0];
 
     uint64_t t;
 
-//  t = 0;
-//  for (i = 0; i < 8; i++) {
-//      uint32_t t_hi;
-//      uint64_t m = ((uint64_t)b * a[i]) + r[i];
-//      t += m;
-//      t_hi = (t < m);
-//      r[i] = (sp_digit)t;
-//      t = (t >> 32) | ((uint64_t)t_hi << 32);
-//  }
-//  r[8] += (sp_digit)t;
+# if 0
+    const sp_digit* a = p256_mod;
+//a[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
+    int i;
+    t = 0;
+    for (i = 0; i < 8; i++) {
+        uint32_t t_hi;
+        uint64_t m = ((uint64_t)b * a[i]) + r[i];
+        t += m;
+        t_hi = (t < m);
+        r[i] = (sp_digit)t;
+        t = (t >> 32) | ((uint64_t)t_hi << 32);
+    }
+    r[8] += (sp_digit)t;
+    return (r[8] < (sp_digit)t); /* 1 if addition overflowed */
+# else
     // Unroll, then optimize the above loop:
     //uint32_t t_hi;
     uint64_t m;
@@ -748,7 +886,8 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
     //t_hi = (t < m);
     // t_hi = 0;
     //r[0] = (sp_digit)t;
-    r[0] = 0;
+    // r[0] = 0;
+    //the store can be eliminated since caller won't look at lower 256 bits of the result
     //t = (t >> 32) | ((uint64_t)t_hi << 32);
     // t = b;
 
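The r[0] store dropped above is dead for two reasons visible in the comments: the computed low word is always 0 (since b == r[0] and a[0] == ffffffff), and the caller never reads the bottom 256 bits anyway. A quick standalone check of the first point (hypothetical, not from the commit):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t b = 0xdeadbeef; /* arbitrary r[0] */
    /* b*ffffffff + b == b << 32: the low 32 bits are 0, the carry word is b */
    assert((uint64_t)b * 0xffffffffu + b == (uint64_t)b << 32);
    assert((uint32_t)((uint64_t)b * 0xffffffffu + b) == 0);
    return 0;
}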
@@ -840,6 +979,7 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
 
     r[8] += (sp_digit)t;
     return (r[8] < (sp_digit)t); /* 1 if addition overflowed */
+# endif
 }
 
 /* Reduce the number back to 256 bits using Montgomery reduction.
@@ -861,7 +1001,7 @@ static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
  * Then a multiple of modulus is added to make T divisible by B^2.
  * [In our case, it is (p256_mp_mod * a[1]) << 32.]
  * And so on. Eventually T is divisible by R, and after division by R
- * the algorithm is in the same place as the usual Montgomery reduction was.
+ * the algorithm is in the same place as the usual Montgomery reduction.
  *
  * TODO: Can conditionally use 64-bit (if bit-little-endian arch) logic?
  */
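The reduction described in the comment above can be demonstrated at toy scale: for each digit, add a multiple of m shifted into place so that the digit becomes zero; once all digits are cleared, the value is divisible by R, and dividing by R plus one conditional subtract finishes the job. A minimal sketch (hypothetical, not part of busybox) with 8-bit digits and R = 2^16, where m is chosen with low digit ff so that, as with p256_mod, the per-digit multiplier is just the digit itself:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    const uint32_t B = 1u << 8; /* digit base */
    const uint32_t m = 0x1ff;   /* toy modulus; m mod B == B-1, so -m^-1 mod B == 1 */
    uint32_t T0 = 0x123456;     /* input, must be < m * R, with R = B^2 = 2^16 */
    uint32_t T = T0;
    int i;

    for (i = 0; i < 2; i++) {
        uint32_t mu = (T >> (8 * i)) % B;     /* mp == 1, so mu is just digit i */
        T += (mu * m) << (8 * i);             /* add a multiple of m: digit i becomes 0 */
        assert(((T >> (8 * i)) % B) == 0);
    }
    /* T is now divisible by R; divide, then do the final conditional subtract. */
    uint32_t res = T >> 16;
    if (res >= m)
        res -= m;
    /* res == T0 * R^-1 mod m, i.e. res * R is congruent to T0 modulo m. */
    assert(((res << 16) % m) == (T0 % m));
    return 0;
}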
@@ -914,6 +1054,7 @@ static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* a/*, const sp_digit
         sp_256_norm_8(r);
     }
 }
+#endif
 
 /* Multiply two Montogmery form numbers mod the modulus (prime).
  * (r = a * b mod m)