busybox/networking/tls_pstm_montgomery_reduce.c
Denys Vlasenko 37bdd8f8cb tls: pstm code shrink
Optimize ABI calling convention and "dead code" cases where return value
is known to be always "success".

function                                             old     new   delta
pstm_mod                                             113    1227   +1114
pstm_exptmod                                        1463    1532     +69
pstm_montgomery_reduce                               381     393     +12
pstm_sqr_comba                                       478     487      +9
pstm_mul_comba                                       447     452      +5
der_binary_to_pstm                                    42      45      +3
pstm_count_bits                                       48      46      -2
pstm_clear                                            72      70      -2
pstm_clamp                                            57      55      -2
pstm_zero                                             38      34      -4
pstm_init_size                                        46      42      -4
pstm_init_for_read_unsigned_bin                       24      20      -4
pstm_grow                                             72      68      -4
pstm_unsigned_bin_size                                37      32      -5
pstm_cmp_mag                                          78      72      -6
pstm_copy                                             92      84      -8
pstm_mul_d                                           224     215      -9
pstm_rshd                                            104      94     -10
pstm_mul_2                                           156     146     -10
tls_handshake                                       2085    2072     -13
psRsaEncryptPub                                      421     408     -13
pstm_lshd                                            109      95     -14
pstm_cmp                                              54      39     -15
s_pstm_sub                                           228     212     -16
pstm_init_copy                                        72      52     -20
pstm_read_unsigned_bin                               109      88     -21
pstm_mulmod                                          120      99     -21
s_pstm_add                                           337     314     -23
pstm_add                                             108      84     -24
pstm_mul_2d                                          186     161     -25
pstm_sub                                             102      74     -28
pstm_to_unsigned_bin                                 151     120     -31
pstm_set                                              34       -     -34
pstm_div_2d                                          409     373     -36
pstm_init                                             42       -     -42
pstm_exch                                             50       -     -50
pstm_montgomery_setup                                 89       -     -89
pstm_2expt                                            96       -     -96
pstm_montgomery_calc_normalization                   140       -    -140
pstm_div                                            1522       -   -1522
------------------------------------------------------------------------------
(add/remove: 0/7 grow/shrink: 6/27 up/down: 1212/-2343)     Total: -1131 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
2019-01-01 15:40:43 +01:00

428 lines
12 KiB
C

/*
* Copyright (C) 2017 Denys Vlasenko
*
* Licensed under GPLv2, see file LICENSE in this source tree.
*/
#include "tls.h"
/* The file is taken almost verbatim from matrixssl-3-7-2b-open/crypto/math/.
* Changes are flagged with //bbox
*/
/**
* @file pstm_montgomery_reduce.c
* @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
*
* Multiprecision Montgomery Reduction.
*/
/*
* Copyright (c) 2013-2015 INSIDE Secure Corporation
* Copyright (c) PeerSec Networks, 2002-2011
* All Rights Reserved
*
* The latest version of this code is available at http://www.matrixssl.org
*
* This software is open source; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This General Public License does NOT permit incorporating this software
* into proprietary programs. If you are unable to comply with the GPL, a
* commercial license for this software may be purchased from INSIDE at
* http://www.insidesecure.com/eng/Company/Locations
*
* This program is distributed in WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
* http://www.gnu.org/copyleft/gpl.html
*/
/******************************************************************************/
//bbox
//#include "../cryptoApi.h"
#ifndef DISABLE_PSTM
/******************************************************************************/
#if defined(PSTM_X86)
/* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
#if !defined(__GNUC__) || !defined(__i386__) || !defined(PSTM_32BIT)
#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
#endif
//#pragma message ("Using 32 bit x86 Assembly Optimizations")
/*
 * Montgomery inner-loop primitives for 32-bit x86 (GCC inline assembly).
 * They are expanded inside pstm_montgomery_reduce() and refer to its locals
 * by name: c, x, mp, mu, _c, tmpm and the per-round carry cy.
 * MONT_START, MONT_FINI and LOOP_END expand to nothing on this target.
 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
/* Per-round multiplier: mu = c[x] * mp (the result is stored into digit-sized mu). */
#define LOOP_START \
mu = c[x] * mp
/*
 * One multiply-accumulate step on the digit at _c[LO] (LO is defined as 0
 * further down in this file):
 *   edx:eax = mu * *tmpm         (mull)
 *   edx:eax += cy                (addl/adcl)
 *   _c[LO]  += eax, carry -> edx (addl/adcl)
 *   cy       = edx
 * The "*tmpm++" input operand also advances tmpm by one digit per expansion.
 */
#define INNERMUL \
asm( \
"movl %5,%%eax \n\t" \
"mull %4 \n\t" \
"addl %1,%%eax \n\t" \
"adcl $0,%%edx \n\t" \
"addl %%eax,%0 \n\t" \
"adcl $0,%%edx \n\t" \
"movl %%edx,%1 \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \
: "%eax", "%edx", "%cc")
/*
 * Carry propagation: _c[LO] += cy, then cy is set to the carry-out of that
 * addition (0 or 1) via setb/movzbl.
 */
#define PROPCARRY \
asm( \
"addl %1,%0 \n\t" \
"setb %%al \n\t" \
"movzbl %%al,%1 \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy) \
: "%eax", "%cc")
/******************************************************************************/
#elif defined(PSTM_X86_64)
/* x86-64 optimized */
#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
#endif
//#pragma message ("Using 64 bit x86_64 Assembly Optimizations")
/*
 * Montgomery inner-loop primitives for x86-64 (GCC inline assembly).
 * They are expanded inside pstm_montgomery_reduce() and refer to its locals
 * by name: c, x, mp, mu, _c, tmpm and the per-round carry cy.
 * MONT_START, MONT_FINI and LOOP_END expand to nothing on this target.
 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
/* Per-round multiplier: mu = c[x] * mp (the result is stored into digit-sized mu). */
#define LOOP_START \
mu = c[x] * mp
/*
 * One multiply-accumulate step on the digit at _c[LO] (LO is defined as 0
 * further down in this file):
 *   rdx:rax = mu * *tmpm         (mulq)
 *   rdx:rax += cy                (addq/adcq)
 *   _c[LO]  += rax, carry -> rdx (addq/adcq)
 *   cy       = rdx
 * The "*tmpm++" input operand also advances tmpm by one digit per expansion.
 */
#define INNERMUL \
asm( \
"movq %5,%%rax \n\t" \
"mulq %4 \n\t" \
"addq %1,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"addq %%rax,%0 \n\t" \
"adcq $0,%%rdx \n\t" \
"movq %%rdx,%1 \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
: "%rax", "%rdx", "cc")
/*
 * Eight unrolled multiply-accumulate steps (x86-64 only).
 *
 * For i = 0..7 the asm performs, in order:
 *   rdx:rax = mu * tmpm[i]       (mulq %4)
 *   rdx:rax += _c[i]             (value pre-loaded into r10)
 *   rdx:rax += cy
 *   _c[i]    = rax
 *   cy       = rdx
 * r10/r11 are used to load the next _c[] / tmpm[] digit ahead of time.
 *
 * Operands: %0/%2 = _c (pointer, not advanced here), %1/%3 = cy,
 * %4 = mu, %5 = tmpm (pointer, not advanced here). The caller is expected
 * to advance _c and tmpm by 8 after each expansion.
 *
 * The asm reads and writes memory through the _c pointer and reads memory
 * through tmpm, none of which is expressed as an "m" operand. The "memory"
 * clobber is therefore required so the compiler neither caches c[] values in
 * registers across the asm nor moves stores to c[] past it.
 */
#define INNERMUL8 \
asm( \
"movq 0(%5),%%rax \n\t" \
"movq 0(%2),%%r10 \n\t" \
"movq 0x8(%5),%%r11 \n\t" \
"mulq %4 \n\t" \
"addq %%r10,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"movq 0x8(%2),%%r10 \n\t" \
"addq %3,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"movq %%rax,0(%0) \n\t" \
"movq %%rdx,%1 \n\t" \
\
"movq %%r11,%%rax \n\t" \
"movq 0x10(%5),%%r11 \n\t" \
"mulq %4 \n\t" \
"addq %%r10,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"movq 0x10(%2),%%r10 \n\t" \
"addq %3,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"movq %%rax,0x8(%0) \n\t" \
"movq %%rdx,%1 \n\t" \
\
"movq %%r11,%%rax \n\t" \
"movq 0x18(%5),%%r11 \n\t" \
"mulq %4 \n\t" \
"addq %%r10,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"movq 0x18(%2),%%r10 \n\t" \
"addq %3,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"movq %%rax,0x10(%0) \n\t" \
"movq %%rdx,%1 \n\t" \
\
"movq %%r11,%%rax \n\t" \
"movq 0x20(%5),%%r11 \n\t" \
"mulq %4 \n\t" \
"addq %%r10,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"movq 0x20(%2),%%r10 \n\t" \
"addq %3,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"movq %%rax,0x18(%0) \n\t" \
"movq %%rdx,%1 \n\t" \
\
"movq %%r11,%%rax \n\t" \
"movq 0x28(%5),%%r11 \n\t" \
"mulq %4 \n\t" \
"addq %%r10,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"movq 0x28(%2),%%r10 \n\t" \
"addq %3,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"movq %%rax,0x20(%0) \n\t" \
"movq %%rdx,%1 \n\t" \
\
"movq %%r11,%%rax \n\t" \
"movq 0x30(%5),%%r11 \n\t" \
"mulq %4 \n\t" \
"addq %%r10,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"movq 0x30(%2),%%r10 \n\t" \
"addq %3,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"movq %%rax,0x28(%0) \n\t" \
"movq %%rdx,%1 \n\t" \
\
"movq %%r11,%%rax \n\t" \
"movq 0x38(%5),%%r11 \n\t" \
"mulq %4 \n\t" \
"addq %%r10,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"movq 0x38(%2),%%r10 \n\t" \
"addq %3,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"movq %%rax,0x30(%0) \n\t" \
"movq %%rdx,%1 \n\t" \
\
"movq %%r11,%%rax \n\t" \
"mulq %4 \n\t" \
"addq %%r10,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"addq %3,%%rax \n\t" \
"adcq $0,%%rdx \n\t" \
"movq %%rax,0x38(%0) \n\t" \
"movq %%rdx,%1 \n\t" \
\
:"=r"(_c), "=r"(cy) \
: "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
: "%rax", "%rdx", "%r10", "%r11", "cc", "memory")
/*
 * Carry propagation (x86-64): _c[LO] += cy, then cy is set to the carry-out
 * of that addition (0 or 1) via setb/movzbq. LO is defined as 0 further down
 * in this file.
 */
#define PROPCARRY \
asm( \
"addq %1,%0 \n\t" \
"setb %%al \n\t" \
"movzbq %%al,%1 \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy) \
: "%rax", "cc")
/******************************************************************************/
#elif defined(PSTM_ARM)
/*
 * Hooks shared by both ARM variants below. MONT_START, MONT_FINI and
 * LOOP_END expand to nothing on this target.
 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
/* Per-round multiplier: mu = c[x] * mp (the result is stored into digit-sized mu). */
#define LOOP_START \
mu = c[x] * mp
#ifdef __thumb2__
//#pragma message ("Using 32 bit ARM Thumb2 Assembly Optimizations")
/*
 * Thumb2 multiply-accumulate step on the digit at _c[0]:
 *   r0 = _c[0] + cy              (LDR/ADDS)
 *   cy = carry-out of that add   (ITE CS; MOVCS #1 / MOVCC #0)
 *   cy:r0 += mu * *tmpm          (UMLAL, 64-bit accumulate)
 *   _c[0] = r0                   (STR)
 * The "*tmpm++" input operand also advances tmpm by one digit per expansion.
 * Same instruction sequence as the non-Thumb2 variant in this file, with an
 * extra "ITE CS" in front of the two conditional MOVs.
 * Note: the macro body already ends in ';'.
 */
#define INNERMUL \
asm( \
" LDR r0,%1 \n\t" \
" ADDS r0,r0,%0 \n\t" \
" ITE CS \n\t" \
" MOVCS %0,#1 \n\t" \
" MOVCC %0,#0 \n\t" \
" UMLAL r0,%0,%3,%4 \n\t" \
" STR r0,%1 \n\t" \
:"=r"(cy),"=m"(_c[0])\
:"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
:"r0","%cc");
/*
 * Thumb2 carry propagation: _c[0] += cy, then cy is set to 1 if the addition
 * carried out and to 0 otherwise.
 */
#define PROPCARRY \
asm( \
" LDR r0,%1 \n\t" \
" ADDS r0,r0,%0 \n\t" \
" STR r0,%1 \n\t" \
" ITE CS \n\t" \
" MOVCS %0,#1 \n\t" \
" MOVCC %0,#0 \n\t" \
:"=r"(cy),"=m"(_c[0])\
:"0"(cy),"m"(_c[0])\
:"r0","%cc");
#else /* Non-Thumb2 code */
//#pragma message ("Using 32 bit ARM Assembly Optimizations")
/*
 * ARM-mode multiply-accumulate step on the digit at _c[0]:
 *   r0 = _c[0] + cy              (LDR/ADDS)
 *   cy = carry-out of that add   (MOVCS #1 / MOVCC #0)
 *   cy:r0 += mu * *tmpm          (UMLAL, 64-bit accumulate)
 *   _c[0] = r0                   (STR)
 * The "*tmpm++" input operand also advances tmpm by one digit per expansion.
 * Note: the macro body already ends in ';'.
 */
#define INNERMUL \
asm( \
" LDR r0,%1 \n\t" \
" ADDS r0,r0,%0 \n\t" \
" MOVCS %0,#1 \n\t" \
" MOVCC %0,#0 \n\t" \
" UMLAL r0,%0,%3,%4 \n\t" \
" STR r0,%1 \n\t" \
:"=r"(cy),"=m"(_c[0])\
:"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
:"r0","%cc");
/*
 * ARM-mode carry propagation: _c[0] += cy, then cy is set to 1 if the
 * addition carried out and to 0 otherwise.
 */
#define PROPCARRY \
asm( \
" LDR r0,%1 \n\t" \
" ADDS r0,r0,%0 \n\t" \
" STR r0,%1 \n\t" \
" MOVCS %0,#1 \n\t" \
" MOVCC %0,#0 \n\t" \
:"=r"(cy),"=m"(_c[0])\
:"0"(cy),"m"(_c[0])\
:"r0","%cc");
#endif /* __thumb2__ */
/******************************************************************************/
#elif defined(PSTM_MIPS)
/* MIPS32 */
//#pragma message ("Using 32 bit MIPS Assembly Optimizations")
/*
 * Montgomery inner-loop primitives for MIPS32 (GCC inline assembly).
 * They are expanded inside pstm_montgomery_reduce() and refer to its locals
 * by name: c, x, mp, mu, _c, tmpm and the per-round carry cy.
 * MONT_START, MONT_FINI and LOOP_END expand to nothing on this target.
 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
/* Per-round multiplier: mu = c[x] * mp (the result is stored into digit-sized mu). */
#define LOOP_START \
mu = c[x] * mp
/*
 * One multiply-accumulate step on the digit at _c[0]:
 *   $13:$12 = mu * tmpm[0]       (multu, mflo, mfhi)
 *   $12 += cy, carry -> $13      (addu/sltu/addu)
 *   $12 += _c[0], carry -> $10   (lw/addu/sltu)
 *   cy    = $13 + $10
 *   _c[0] = $12                  (sw)
 * followed by ++tmpm (the macro body already ends in ';').
 *
 * The asm reads %0 as the incoming carry, so the cy input is tied to output
 * operand 0 with the "0" matching constraint; a plain "=r" output is
 * write-only and is not guaranteed to share a register with a separate "r"
 * input. Operand numbering (%3 = mu, %4 = tmpm[0]) is unchanged.
 *
 * NOTE(review): multu overwrites the HI/LO registers, which are not in the
 * clobber list - confirm whether "hi"/"lo" clobbers are needed for the
 * toolchains that enable PSTM_MIPS.
 */
#define INNERMUL \
asm( \
" multu %3,%4 \n\t" \
" mflo $12 \n\t" \
" mfhi $13 \n\t" \
" addu $12,$12,%0 \n\t" \
" sltu $10,$12,%0 \n\t" \
" addu $13,$13,$10 \n\t" \
" lw $10,%1 \n\t" \
" addu $12,$12,$10 \n\t" \
" sltu $10,$12,$10 \n\t" \
" addu %0,$13,$10 \n\t" \
" sw $12,%1 \n\t" \
:"=r"(cy),"=m"(_c[0])\
:"0"(cy),"r"(mu),"r"(tmpm[0]),"r"(_c[0])\
:"$10","$12","$13")\
; ++tmpm;
/*
 * Carry propagation: _c[0] += cy, then cy = (new _c[0] < old cy), i.e. the
 * carry-out of the addition. As in INNERMUL, the cy input is tied to output
 * operand 0 because the asm reads %0 before overwriting it.
 */
#define PROPCARRY \
asm( \
" lw $10,%1 \n\t" \
" addu $10,$10,%0 \n\t" \
" sw $10,%1 \n\t" \
" sltu %0,$10,%0 \n\t" \
:"=r"(cy),"=m"(_c[0])\
:"0"(cy),"r"(_c[0])\
:"$10");
/******************************************************************************/
#else
/*
 * Portable ISO C fallback for the Montgomery inner-loop primitives.
 * The macros are expanded inside pstm_montgomery_reduce() and refer to its
 * locals by name: c, x, mp, mu, _c, tmpm and the per-round carry cy.
 */
/* Nothing to set up, tear down or finish per round in the C version. */
#define MONT_START
#define MONT_FINI
#define LOOP_END
/* Per-round multiplier, kept to a single digit by the assignment to mu. */
#define LOOP_START \
	mu = mp * c[x]
/*
 * One multiply-accumulate step in double-width arithmetic:
 *   acc   = mu * *tmpm + _c[0] + cy
 *   _c[0] = low digit of acc
 *   cy    = high digit of acc
 * tmpm is advanced by one digit (exactly once) per expansion.
 */
#define INNERMUL \
	do { \
		pstm_word mac_acc_ = (pstm_word)mu * (pstm_word)*tmpm++; \
		mac_acc_ += (pstm_word)_c[0]; \
		mac_acc_ += (pstm_word)cy; \
		_c[0] = (pstm_digit)mac_acc_; \
		cy = (pstm_digit)(mac_acc_ >> DIGIT_BIT); \
	} while (0)
/*
 * Carry propagation: add cy into _c[0]; the new cy is 1 when the digit
 * wrapped around (sum smaller than the carry that was added), else 0.
 */
#define PROPCARRY \
	do { \
		pstm_digit carry_sum_ = (pstm_digit)(_c[0] + cy); \
		_c[0] = carry_sum_; \
		cy = (carry_sum_ < cy); \
	} while (0)
#endif
/******************************************************************************/
/* Index of the digit the x86 INNERMUL/PROPCARRY macros operate on. */
#define LO 0
/**
 * Montgomery reduction: replaces a with a/R mod m, where R is b^(m->used)
 * and b is the digit base (the result is taken from the scratch buffer
 * shifted down by m->used digits).
 *
 * @param pool    Memory pool handle, passed to psFree() for the fallback
 *                scratch buffer.
 * @param a       Value to reduce, overwritten in place with the result.
 *                Must have room for m->used + 1 digits and must not use
 *                more than 2 * m->used + 1 digits.
 * @param m       Modulus.
 * @param mp      Per-modulus Montgomery constant, multiplied with c[x] each
 *                round to obtain mu (presumably -1/m mod b from the
 *                Montgomery setup step - not checked here).
 * @param paD     Optional caller-provided scratch buffer, may be NULL.
 * @param paDlen  Size of paD in BYTES (it is passed to memset as such).
 *                paD is used only if it can hold 2 * m->used + 1 digits;
 *                otherwise a temporary buffer is allocated and freed here.
 *
 * @return PSTM_OKAY on success, PS_LIMIT_FAIL if a is too small or too
 *         large for this modulus, PS_MEM_FAIL if the final subtraction
 *         reports an error.
 */
int32 FAST_FUNC pstm_montgomery_reduce(psPool_t *pool, pstm_int *a, pstm_int *m,
		pstm_digit mp, pstm_digit *paD, uint32 paDlen)
{
	pstm_digit *c, *_c, *tmpm, mu;
	int32 oldused, x, y;
	int pa; //bbox: was int16
	uint32 need;

	pa = m->used;
	oldused = a->used;
	/*
	 * Sanity tests for bad numbers, to rule out buffer overruns:
	 * - pa + 1 digits are written back into a->dp below;
	 * - all a->used input digits are copied into a 2*pa+1 digit scratch.
	 */
	if (pa >= a->alloc || oldused > 2*pa + 1) {
		return PS_LIMIT_FAIL;
	}

	/* Scratch size in bytes; paDlen is a byte count as well. */
	need = ((uint32)2*pa + 1) * (uint32)sizeof(pstm_digit);
	if (paD && paDlen >= need) {
		c = paD;
		memset(c, 0x0, paDlen);
	} else {
		c = xzalloc(need);//bbox
	}

	/* copy the input; digits above oldused stay zero */
	for (x = 0; x < oldused; x++) {
		c[x] = a->dp[x];
	}

	MONT_START;
	for (x = 0; x < pa; x++) {
		pstm_digit cy = 0;
		/* get Mu for this round */
		LOOP_START;
		_c = c + x;
		tmpm = m->dp;
		y = 0;
#ifdef PSTM_X86_64
		/* eight digits at a time; INNERMUL8 does not advance the pointers */
		for (; y < (pa & ~7); y += 8) {
			INNERMUL8;
			_c += 8;
			tmpm += 8;
		}
#endif /* PSTM_X86_64 */
		/* remaining digits; INNERMUL advances tmpm itself */
		for (; y < pa; y++) {
			INNERMUL;
			++_c;
		}
		LOOP_END;
		/* ripple the leftover carry into the higher digits */
		while (cy) {
			PROPCARRY;
			++_c;
		}
	}

	/* now copy out: the result is the scratch shifted down by pa digits */
	_c = c + pa;
	tmpm = a->dp;
	for (x = 0; x < pa+1; x++) {
		*tmpm++ = *_c++;
	}
	/* clear whatever was left of the old, longer value */
	for (; x < oldused; x++) {
		*tmpm++ = 0;
	}
	MONT_FINI;

	a->used = pa+1;
	pstm_clamp(a);

	/* reuse x as return code */
	x = PSTM_OKAY;
	/* if A >= m then A = A - m */
	if (pstm_cmp_mag (a, m) != PSTM_LT) {
		if (s_pstm_sub (a, m, a) != PSTM_OKAY) {
			x = PS_MEM_FAIL;
		}
	}

	/* free the scratch only if it was allocated here */
	if (c != paD) {
		psFree(c, pool);
	}
	return x;
}
#endif /* !DISABLE_PSTM */
/******************************************************************************/