Optimize ABI calling convention and "dead code" cases where return value is known to be always "success". function old new delta pstm_mod 113 1227 +1114 pstm_exptmod 1463 1532 +69 pstm_montgomery_reduce 381 393 +12 pstm_sqr_comba 478 487 +9 pstm_mul_comba 447 452 +5 der_binary_to_pstm 42 45 +3 pstm_count_bits 48 46 -2 pstm_clear 72 70 -2 pstm_clamp 57 55 -2 pstm_zero 38 34 -4 pstm_init_size 46 42 -4 pstm_init_for_read_unsigned_bin 24 20 -4 pstm_grow 72 68 -4 pstm_unsigned_bin_size 37 32 -5 pstm_cmp_mag 78 72 -6 pstm_copy 92 84 -8 pstm_mul_d 224 215 -9 pstm_rshd 104 94 -10 pstm_mul_2 156 146 -10 tls_handshake 2085 2072 -13 psRsaEncryptPub 421 408 -13 pstm_lshd 109 95 -14 pstm_cmp 54 39 -15 s_pstm_sub 228 212 -16 pstm_init_copy 72 52 -20 pstm_read_unsigned_bin 109 88 -21 pstm_mulmod 120 99 -21 s_pstm_add 337 314 -23 pstm_add 108 84 -24 pstm_mul_2d 186 161 -25 pstm_sub 102 74 -28 pstm_to_unsigned_bin 151 120 -31 pstm_set 34 - -34 pstm_div_2d 409 373 -36 pstm_init 42 - -42 pstm_exch 50 - -50 pstm_montgomery_setup 89 - -89 pstm_2expt 96 - -96 pstm_montgomery_calc_normalization 140 - -140 pstm_div 1522 - -1522 ------------------------------------------------------------------------------ (add/remove: 0/7 grow/shrink: 6/27 up/down: 1212/-2343) Total: -1131 bytes Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
		
			
				
	
	
		
			1114 lines
		
	
	
		
			43 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			1114 lines
		
	
	
		
			43 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/*
 | 
						|
 * Copyright (C) 2017 Denys Vlasenko
 | 
						|
 *
 | 
						|
 * Licensed under GPLv2, see file LICENSE in this source tree.
 | 
						|
 */
 | 
						|
#include "tls.h"
 | 
						|
 | 
						|
/* The file is taken almost verbatim from matrixssl-3-7-2b-open/crypto/math/.
 | 
						|
 * Changes are flagged with //bbox
 | 
						|
 */
 | 
						|
 | 
						|
/**
 | 
						|
 *	@file    pstm_sqr_comba.c
 | 
						|
 *	@version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
 | 
						|
 *
 | 
						|
 *	Multiprecision Squaring with Comba technique.
 | 
						|
 */
 | 
						|
/*
 | 
						|
 *	Copyright (c) 2013-2015 INSIDE Secure Corporation
 | 
						|
 *	Copyright (c) PeerSec Networks, 2002-2011
 | 
						|
 *	All Rights Reserved
 | 
						|
 *
 | 
						|
 *	The latest version of this code is available at http://www.matrixssl.org
 | 
						|
 *
 | 
						|
 *	This software is open source; you can redistribute it and/or modify
 | 
						|
 *	it under the terms of the GNU General Public License as published by
 | 
						|
 *	the Free Software Foundation; either version 2 of the License, or
 | 
						|
 *	(at your option) any later version.
 | 
						|
 *
 | 
						|
 *	This General Public License does NOT permit incorporating this software
 | 
						|
 *	into proprietary programs.  If you are unable to comply with the GPL, a
 | 
						|
 *	commercial license for this software may be purchased from INSIDE at
 | 
						|
 *	http://www.insidesecure.com/eng/Company/Locations
 | 
						|
 *
 | 
						|
 *	This program is distributed in WITHOUT ANY WARRANTY; without even the
 | 
						|
 *	implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 | 
						|
 *	See the GNU General Public License for more details.
 | 
						|
 *
 | 
						|
 *	You should have received a copy of the GNU General Public License
 | 
						|
 *	along with this program; if not, write to the Free Software
 | 
						|
 *	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 | 
						|
 *	http://www.gnu.org/copyleft/gpl.html
 | 
						|
 */
 | 
						|
/******************************************************************************/
 | 
						|
 | 
						|
//bbox
 | 
						|
//#include "../cryptoApi.h"
 | 
						|
#ifndef DISABLE_PSTM
 | 
						|
 | 
						|
/******************************************************************************/
 | 
						|
#if defined(PSTM_X86)
 | 
						|
/* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
 | 
						|
#if !defined(__GNUC__) || !defined(__i386__)
 | 
						|
#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
 | 
						|
#endif
 | 
						|
//#pragma message ("Using 32 bit x86 Assembly Optimizations")
 | 
						|
 | 
						|
/*
 * Column-accumulator helpers for Comba squaring, 32-bit x86 inline asm.
 *
 * The macros operate on variables that must exist in the expanding scope:
 *   c0, c1, c2    - three-digit running column sum (c0 is the low digit)
 *   sc0, sc1, sc2 - secondary three-digit accumulator used by
 *                   SQRADDSC / SQRADDAC / SQRADDDB
 * The statement macros carry their own trailing ';'.
 */
#define COMBA_START

/* Zero the running column sum. */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* Store the low digit of the column sum into x. */
#define COMBA_STORE(x) \
   x = c0;

/* Store the middle digit of the column sum into x. */
#define COMBA_STORE2(x) \
   x = c1;

/* Shift the column sum down one digit before starting the next column. */
#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/*
 * c2:c1:c0 += i * i.
 * Only i is used: it is loaded into %eax and multiplied by itself; j is
 * ignored. i must be a memory operand ("m").
 */
#define SQRADD(i, j)                                      \
asm(                                            \
	 "movl  %6,%%eax     \n\t"                            \
	 "mull  %%eax        \n\t"                            \
	 "addl  %%eax,%0     \n\t"                            \
	 "adcl  %%edx,%1     \n\t"                            \
	 "adcl  $0,%2        \n\t"                            \
	 :"=rm"(c0), "=rm"(c1), "=rm"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
	//bbox: ^^^ replaced "=r" with "=rm": %ebx is not available on shared build

/*
 * c2:c1:c0 += 2 * i * j.
 * The product is formed once in %edx:%eax and added to the accumulator twice.
 */
#define SQRADD2(i, j)                                     \
asm(                                            \
	 "movl  %6,%%eax     \n\t"                            \
	 "mull  %7           \n\t"                            \
	 "addl  %%eax,%0     \n\t"                            \
	 "adcl  %%edx,%1     \n\t"                            \
	 "adcl  $0,%2        \n\t"                            \
	 "addl  %%eax,%0     \n\t"                            \
	 "adcl  %%edx,%1     \n\t"                            \
	 "adcl  $0,%2        \n\t"                            \
	 :"=rm"(c0), "=rm"(c1), "=rm"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","%cc");
	//bbox: ^^^ replaced "=r" with "=rm": %ebx is not available on shared build

/* sc2:sc1:sc0 = i * j (sc2 is cleared). */
#define SQRADDSC(i, j)                                    \
asm(                                                     \
	 "movl  %6,%%eax     \n\t"                            \
	 "mull  %7           \n\t"                            \
	 "movl  %%eax,%0     \n\t"                            \
	 "movl  %%edx,%1     \n\t"                            \
	 "xorl  %2,%2        \n\t"                            \
	 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");

/* sc2:sc1:sc0 += i * j. */
#define SQRADDAC(i, j)                                    \
asm(                                                     \
	 "movl  %6,%%eax     \n\t"                            \
	 "mull  %7           \n\t"                            \
	 "addl  %%eax,%0     \n\t"                            \
	 "adcl  %%edx,%1     \n\t"                            \
	 "adcl  $0,%2        \n\t"                            \
	 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");

/* c2:c1:c0 += 2 * sc2:sc1:sc0 (the secondary sum is added twice). */
#define SQRADDDB                                          \
asm(                                                     \
	 "addl %6,%0         \n\t"                            \
	 "adcl %7,%1         \n\t"                            \
	 "adcl %8,%2         \n\t"                            \
	 "addl %6,%0         \n\t"                            \
	 "adcl %7,%1         \n\t"                            \
	 "adcl %8,%2         \n\t"                            \
	 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "%cc");
 | 
						|
/******************************************************************************/
 | 
						|
#elif defined(PSTM_X86_64)
 | 
						|
/* x86-64 optimized */
 | 
						|
#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
 | 
						|
#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
 | 
						|
#endif
 | 
						|
//#pragma message ("Using 64 bit x86_64 Assembly Optimizations")
 | 
						|
 | 
						|
/*
 * Column-accumulator helpers for Comba squaring, x86-64 inline asm.
 *
 * The macros operate on variables that must exist in the expanding scope:
 *   c0, c1, c2    - three-digit running column sum (c0 is the low digit)
 *   sc0, sc1, sc2 - secondary three-digit accumulator used by
 *                   SQRADDSC / SQRADDAC / SQRADDDB
 * The statement macros carry their own trailing ';'.
 */
#define COMBA_START

/* Zero the running column sum. */
#define CLEAR_CARRY \
c0 = c1 = c2 = 0;

/* Store the low digit of the column sum into x. */
#define COMBA_STORE(x) \
x = c0;

/* Store the middle digit of the column sum into x. */
#define COMBA_STORE2(x) \
x = c1;

/* Shift the column sum down one digit before starting the next column. */
#define CARRY_FORWARD \
do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/*
 * c2:c1:c0 += i * i.
 * Only i is used: it is loaded into %rax and multiplied by itself; j is
 * ignored.
 */
#define SQRADD(i, j)                                     \
asm(                                                     \
	"movq  %6,%%rax     \n\t"                            \
	"mulq  %%rax        \n\t"                            \
	"addq  %%rax,%0     \n\t"                            \
	"adcq  %%rdx,%1     \n\t"                            \
	"adcq  $0,%2        \n\t"                            \
	:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc");

/*
 * c2:c1:c0 += 2 * i * j.
 * The product is formed once in %rdx:%rax and added to the accumulator twice.
 */
#define SQRADD2(i, j)                                    \
asm(                                                     \
	"movq  %6,%%rax     \n\t"                            \
	"mulq  %7           \n\t"                            \
	"addq  %%rax,%0     \n\t"                            \
	"adcq  %%rdx,%1     \n\t"                            \
	"adcq  $0,%2        \n\t"                            \
	"addq  %%rax,%0     \n\t"                            \
	"adcq  %%rdx,%1     \n\t"                            \
	"adcq  $0,%2        \n\t"                            \
	:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");

/* sc2:sc1:sc0 = i * j (sc2 is cleared). */
#define SQRADDSC(i, j)                                   \
asm(                                                     \
	"movq  %6,%%rax     \n\t"                            \
	"mulq  %7           \n\t"                            \
	"movq  %%rax,%0     \n\t"                            \
	"movq  %%rdx,%1     \n\t"                            \
	"xorq  %2,%2        \n\t"                            \
	:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");

/* sc2:sc1:sc0 += i * j. */
#define SQRADDAC(i, j)                                   \
asm(                                                     \
	"movq  %6,%%rax     \n\t"                            \
	"mulq  %7           \n\t"                            \
	"addq  %%rax,%0     \n\t"                            \
	"adcq  %%rdx,%1     \n\t"                            \
	"adcq  $0,%2        \n\t"                            \
	:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");

/* c2:c1:c0 += 2 * sc2:sc1:sc0 (the secondary sum is added twice). */
#define SQRADDDB                                         \
asm(                                                     \
	"addq %6,%0         \n\t"                            \
	"adcq %7,%1         \n\t"                            \
	"adcq %8,%2         \n\t"                            \
	"addq %6,%0         \n\t"                            \
	"adcq %7,%1         \n\t"                            \
	"adcq %8,%2         \n\t"                            \
	:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
 | 
						|
/******************************************************************************/
 | 
						|
#elif defined(PSTM_ARM)
 | 
						|
/* ARM code */
 | 
						|
//#pragma message ("Using 32 bit ARM Assembly Optimizations")
 | 
						|
 | 
						|
/*
 * Column-accumulator helpers for Comba squaring, 32-bit ARM inline asm.
 *
 * The macros operate on variables that must exist in the expanding scope:
 *   c0, c1, c2    - three-digit running column sum (c0 is the low digit)
 *   sc0, sc1, sc2 - secondary three-digit accumulator used by
 *                   SQRADDSC / SQRADDAC / SQRADDDB
 * r0 and r1 are used as scratch where listed in the clobbers.
 * The statement macros carry their own trailing ';'.
 */
#define COMBA_START

/* Zero the running column sum. */
#define CLEAR_CARRY \
c0 = c1 = c2 = 0;

/* Store the low digit of the column sum into x. */
#define COMBA_STORE(x) \
x = c0;

/* Store the middle digit of the column sum into x. */
#define COMBA_STORE2(x) \
x = c1;

/* Shift the column sum down one digit before starting the next column. */
#define CARRY_FORWARD \
do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
/* As written only i is used: c2:c1:c0 += i * i (UMULL takes %6 twice). */
#define SQRADD(i, j)                                             \
asm(                                                             \
"  UMULL  r0,r1,%6,%6              \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "%cc");

/* for squaring some of the terms are doubled... */
/* c2:c1:c0 += 2 * i * j: the product in r1:r0 is added twice. */
#define SQRADD2(i, j)                                            \
asm(                                                             \
"  UMULL  r0,r1,%6,%7              \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");

/* sc1:sc0 = i * j written directly by UMULL; SUB clears sc2. */
#define SQRADDSC(i, j)                                           \
asm(                                                             \
"  UMULL  %0,%1,%6,%7              \n\t"                         \
"  SUB    %2,%2,%2                 \n\t"                         \
:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "%cc");

/* sc2:sc1:sc0 += i * j. */
#define SQRADDAC(i, j)                                           \
asm(                                                             \
"  UMULL  r0,r1,%6,%7              \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "%cc");

/*
 * c2:c1:c0 += 2 * sc2:sc1:sc0.
 * Note the operand order: sc0..sc2 are %3..%5 here, the tied c0..c2
 * inputs come last.
 */
#define SQRADDDB                                                 \
asm(                                                             \
"  ADDS  %0,%0,%3                     \n\t"                      \
"  ADCS  %1,%1,%4                     \n\t"                      \
"  ADC   %2,%2,%5                     \n\t"                      \
"  ADDS  %0,%0,%3                     \n\t"                      \
"  ADCS  %1,%1,%4                     \n\t"                      \
"  ADC   %2,%2,%5                     \n\t"                      \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
 | 
						|
/******************************************************************************/
 | 
						|
#elif defined(PSTM_MIPS)
 | 
						|
/* MIPS32 */
 | 
						|
//#pragma message ("Using 32 bit MIPS Assembly Optimizations")
 | 
						|
 | 
						|
/*
 * Column-accumulator helpers for Comba squaring, MIPS32 inline asm.
 *
 * The macros operate on variables that must exist in the expanding scope:
 *   c0, c1, c2    - three-digit running column sum (c0 is the low digit)
 *   sc0, sc1, sc2 - secondary three-digit accumulator used by
 *                   SQRADDSC / SQRADDAC / SQRADDDB
 * Carries are derived with sltu (sum < addend) because there is no carry
 * flag; $10-$15 serve as scratch where listed in the clobbers.
 * The statement macros carry their own trailing ';'.
 *
 * NOTE(review): multu writes HI/LO, yet no clobber list below names them -
 * confirm whether the target toolchain requires "hi"/"lo" clobbers.
 */
#define COMBA_START

/* Zero the running column sum. */
#define CLEAR_CARRY \
c0 = c1 = c2 = 0;

/* Store the low digit of the column sum into x. */
#define COMBA_STORE(x) \
x = c0;

/* Store the middle digit of the column sum into x. */
#define COMBA_STORE2(x) \
x = c1;

/* Shift the column sum down one digit before starting the next column. */
#define CARRY_FORWARD \
do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
/* As written only i is used: c2:c1:c0 += i * i (multu takes %6 twice). */
#define SQRADD(i, j)               \
asm(                               \
	" multu  %6,%6          \n\t"  \
	" mflo   $12            \n\t"  \
	" mfhi   $13            \n\t"  \
	" addu    %0,%0,$12     \n\t"  \
	" sltu   $12,%0,$12     \n\t"  \
	" addu    %1,%1,$13     \n\t"  \
	" sltu   $13,%1,$13     \n\t"  \
	" addu    %1,%1,$12     \n\t"  \
	" sltu   $12,%1,$12     \n\t"  \
	" addu    %2,%2,$13     \n\t"  \
	" addu    %2,%2,$12     \n\t"  \
	:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13");

/* for squaring some of the terms are doubled... */
/* c2:c1:c0 += 2 * i * j: the product in $13:$12 is added twice. */
#define SQRADD2(i, j)             \
asm(                              \
	" multu  %6,%7          \n\t" \
	" mflo   $12            \n\t" \
	" mfhi   $13            \n\t" \
	\
	" addu    %0,%0,$12     \n\t" \
	" sltu   $14,%0,$12     \n\t" \
	" addu    %1,%1,$13     \n\t" \
	" sltu   $15,%1,$13     \n\t" \
	" addu    %1,%1,$14     \n\t" \
	" sltu   $14,%1,$14     \n\t" \
	" addu    %2,%2,$15     \n\t" \
	" addu    %2,%2,$14     \n\t" \
	\
	" addu    %0,%0,$12     \n\t" \
	" sltu   $14,%0,$12     \n\t" \
	" addu    %1,%1,$13     \n\t" \
	" sltu   $15,%1,$13     \n\t" \
	" addu    %1,%1,$14     \n\t" \
	" sltu   $14,%1,$14     \n\t" \
	" addu    %2,%2,$15     \n\t" \
	" addu    %2,%2,$14     \n\t" \
	:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15");

/* sc1:sc0 = i * j read straight from HI/LO; xor clears sc2. */
#define SQRADDSC(i, j)             \
asm(                               \
	" multu  %6,%7          \n\t"  \
	" mflo   %0             \n\t"  \
	" mfhi   %1             \n\t"  \
	" xor    %2,%2,%2       \n\t"  \
	:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "%cc");

/* sc2:sc1:sc0 += i * j. */
#define SQRADDAC(i, j)            \
asm(                              \
	" multu  %6,%7          \n\t" \
	" mflo   $12            \n\t" \
	" mfhi   $13            \n\t" \
	" addu    %0,%0,$12     \n\t" \
	" sltu   $12,%0,$12     \n\t" \
	" addu    %1,%1,$13     \n\t" \
	" sltu   $13,%1,$13     \n\t" \
	" addu    %1,%1,$12     \n\t" \
	" sltu   $12,%1,$12     \n\t" \
	" addu    %2,%2,$13     \n\t" \
	" addu    %2,%2,$12     \n\t" \
	:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14");

/*
 * c2:c1:c0 += 2 * sc2:sc1:sc0 (the add sequence is repeated twice).
 * Note the operand order: sc0..sc2 are %3..%5 here, the tied c0..c2
 * inputs come last.
 */
#define SQRADDDB                   \
asm(                               \
	" addu    %0,%0,%3       \n\t" \
	" sltu   $10,%0,%3       \n\t" \
	" addu    %1,%1,$10      \n\t" \
	" sltu   $10,%1,$10      \n\t" \
	" addu    %1,%1,%4       \n\t" \
	" sltu   $11,%1,%4       \n\t" \
	" addu    %2,%2,$10      \n\t" \
	" addu    %2,%2,$11      \n\t" \
	" addu    %2,%2,%5       \n\t" \
	\
	" addu    %0,%0,%3       \n\t" \
	" sltu   $10,%0,%3       \n\t" \
	" addu    %1,%1,$10      \n\t" \
	" sltu   $10,%1,$10      \n\t" \
	" addu    %1,%1,%4       \n\t" \
	" sltu   $11,%1,%4       \n\t" \
	" addu    %2,%2,$10      \n\t" \
	" addu    %2,%2,$11      \n\t" \
	" addu    %2,%2,%5       \n\t" \
	:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11");
 | 
						|
#else
 | 
						|
/******************************************************************************/
 | 
						|
#define PSTM_ISO
/* ISO C portable code */

/*
 * Column-accumulator helpers for Comba squaring, portable C version.
 *
 * The macros operate on variables that must exist in the expanding scope:
 *   c0, c1, c2    - three-digit running column sum (c0 is the low digit)
 *   sc0, sc1, sc2 - secondary three-digit accumulator used by
 *                   SQRADDSC / SQRADDAC / SQRADDDB
 *   tt            - a pstm_word scratch variable, needed by SQRADD2 only
 * pstm_word is used as the double-width type: products and sums are formed
 * in it and split with DIGIT_BIT shifts.
 * Macro arguments are parenthesized before the pstm_word cast so that an
 * expression argument is evaluated as a whole, not just its first token.
 * The statement macros carry their own trailing ';'.
 */
#define COMBA_START

/* Zero the running column sum. */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* Store the low digit of the column sum into x. */
#define COMBA_STORE(x) \
   (x) = c0;

/* Store the middle digit of the column sum into x. */
#define COMBA_STORE2(x) \
   (x) = c1;

/* Shift the column sum down one digit before starting the next column. */
#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
/* c2:c1:c0 += i * j */
#define SQRADD(i, j)													\
   do { pstm_word t;													\
   t = c0 + ((pstm_word)(i)) * ((pstm_word)(j));  c0 = (pstm_digit)t;	\
   t = c1 + (t >> DIGIT_BIT);											\
   c1 = (pstm_digit)t; c2 += (pstm_digit)(t >> DIGIT_BIT);				\
   } while (0);


/* for squaring some of the terms are doubled... */
/* c2:c1:c0 += 2 * i * j: the product t is added twice via the outer tt */
#define SQRADD2(i, j)											\
   do { pstm_word t;											\
   t  = ((pstm_word)(i)) * ((pstm_word)(j));					\
   tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt;					\
   tt = (pstm_word)c1 + (tt >> DIGIT_BIT);						\
   c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT);	\
   tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt;					\
   tt = (pstm_word)c1 + (tt >> DIGIT_BIT);						\
   c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT);	\
   } while (0);

/* sc2:sc1:sc0 = i * j (sc2 is cleared) */
#define SQRADDSC(i, j)										\
   do { pstm_word t;										\
	  t =  ((pstm_word)(i)) * ((pstm_word)(j));				\
	  sc0 = (pstm_digit)t; sc1 = (pstm_digit)(t >> DIGIT_BIT); sc2 = 0;	\
   } while (0);

/* sc2:sc1:sc0 += i * j */
#define SQRADDAC(i, j)														\
   do { pstm_word t;														\
   t = ((pstm_word)sc0) + ((pstm_word)(i)) * ((pstm_word)(j));				\
   sc0 = (pstm_digit)t;														\
   t = ((pstm_word)sc1) + (t >> DIGIT_BIT); sc1 = (pstm_digit)t;			\
   sc2 += (pstm_digit)(t >> DIGIT_BIT);										\
   } while (0);

/* c2:c1:c0 += 2 * sc2:sc1:sc0 */
#define SQRADDDB															\
   do { pstm_word t;														\
   t = ((pstm_word)sc0) + ((pstm_word)sc0) + ((pstm_word)c0);				\
   c0 = (pstm_digit)t;														\
   t = ((pstm_word)sc1) + ((pstm_word)sc1) + c1 + (t >> DIGIT_BIT);			\
   c1 = (pstm_digit)t;														\
   c2 = c2 + sc2 + sc2 + (pstm_digit)(t >> DIGIT_BIT);						\
   } while (0);
 | 
						|
#endif /* ISO_C */
 | 
						|
 | 
						|
/******************************************************************************/
 | 
						|
/*
 | 
						|
	Non-unrolled comba squarer
 | 
						|
 */
 | 
						|
//bbox: pool unused
 | 
						|
#define pstm_sqr_comba_gen(pool, A, B, paD, paDlen) \
 | 
						|
        pstm_sqr_comba_gen(      A, B, paD, paDlen)
 | 
						|
static int32 pstm_sqr_comba_gen(psPool_t *pool, pstm_int *A, pstm_int *B,
 | 
						|
			pstm_digit *paD, uint32 paDlen)
 | 
						|
{
 | 
						|
	int		paDfail, pa; //bbox: was int16
 | 
						|
	int32       ix, iz;
 | 
						|
	pstm_digit  c0, c1, c2, *dst;
 | 
						|
#ifdef PSTM_ISO
 | 
						|
	pstm_word   tt;
 | 
						|
#endif
 | 
						|
 | 
						|
	paDfail = 0;
 | 
						|
	/* get size of output and trim */
 | 
						|
	pa = A->used + A->used;
 | 
						|
 | 
						|
	/* number of output digits to produce */
 | 
						|
	COMBA_START;
 | 
						|
	CLEAR_CARRY;
 | 
						|
/*
 | 
						|
	If b is not large enough grow it and continue
 | 
						|
*/
 | 
						|
	if (B->alloc < pa) {
 | 
						|
		if (pstm_grow(B, pa) != PSTM_OKAY) {
 | 
						|
			return PS_MEM_FAIL;
 | 
						|
		}
 | 
						|
	}
 | 
						|
	if (paD != NULL) {
 | 
						|
		if (paDlen < (sizeof(pstm_digit) * pa)) {
 | 
						|
			paDfail = 1; /* have a paD, but it's not big enough */
 | 
						|
			dst = xzalloc(sizeof(pstm_digit) * pa);//bbox
 | 
						|
		} else {
 | 
						|
			dst = paD;
 | 
						|
			memset(dst, 0x0, paDlen);
 | 
						|
		}
 | 
						|
	} else {
 | 
						|
		dst = xzalloc(sizeof(pstm_digit) * pa);//bbox
 | 
						|
	}
 | 
						|
 | 
						|
	for (ix = 0; ix < pa; ix++) {
 | 
						|
		int32      tx, ty, iy;
 | 
						|
		pstm_digit *tmpy, *tmpx;
 | 
						|
 | 
						|
		/* get offsets into the two bignums */
 | 
						|
		ty = min(A->used-1, ix);
 | 
						|
		tx = ix - ty;
 | 
						|
 | 
						|
		/* setup temp aliases */
 | 
						|
		tmpx = A->dp + tx;
 | 
						|
		tmpy = A->dp + ty;
 | 
						|
 | 
						|
/*
 | 
						|
			This is the number of times the loop will iterate,
 | 
						|
				while (tx++ < a->used && ty-- >= 0) { ... }
 | 
						|
*/
 | 
						|
		iy = min(A->used-tx, ty+1);
 | 
						|
 | 
						|
/*
 | 
						|
		now for squaring tx can never equal ty. We halve the distance since
 | 
						|
		they approach at a rate of 2x and we have to round because odd cases
 | 
						|
		need to be executed
 | 
						|
*/
 | 
						|
		iy = min(iy, (ty-tx+1)>>1);
 | 
						|
 | 
						|
		/* forward carries */
 | 
						|
		CARRY_FORWARD;
 | 
						|
 | 
						|
		/* execute loop */
 | 
						|
		for (iz = 0; iz < iy; iz++) {
 | 
						|
			SQRADD2(*tmpx++, *tmpy--);
 | 
						|
		}
 | 
						|
 | 
						|
		/* even columns have the square term in them */
 | 
						|
		if ((ix&1) == 0) {
 | 
						|
			SQRADD(A->dp[ix>>1], A->dp[ix>>1]);
 | 
						|
		}
 | 
						|
 | 
						|
		/* store it */
 | 
						|
		COMBA_STORE(dst[ix]);
 | 
						|
	}
 | 
						|
 | 
						|
	COMBA_FINI;
 | 
						|
/*
 | 
						|
	setup dest
 | 
						|
 */
 | 
						|
	iz  = B->used;
 | 
						|
	B->used = pa;
 | 
						|
	{
 | 
						|
		pstm_digit *tmpc;
 | 
						|
		tmpc = B->dp;
 | 
						|
		for (ix = 0; ix < pa; ix++) {
 | 
						|
			*tmpc++ = dst[ix];
 | 
						|
		}
 | 
						|
		/*	clear unused digits (that existed in the old copy of c) */
 | 
						|
		for (; ix < iz; ix++) {
 | 
						|
			*tmpc++ = 0;
 | 
						|
		}
 | 
						|
	}
 | 
						|
	pstm_clamp(B);
 | 
						|
 | 
						|
	if ((paD == NULL) || paDfail == 1) {
 | 
						|
		psFree(dst, pool);
 | 
						|
	}
 | 
						|
	return PS_SUCCESS;
 | 
						|
}
 | 
						|
 | 
						|
/******************************************************************************/
 | 
						|
/*
 | 
						|
	Unrolled Comba loop for 1024 bit keys
 | 
						|
 */
 | 
						|
#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
 | 
						|
/*
	Fully unrolled Comba squaring of a 16-digit value: B = A * A.

	Reads a[0]..a[15] from A->dp unconditionally, so the caller must make
	sure A holds at least 16 digits (not checked here). The 32 result
	digits are assembled in the local array b[] and then copied into B, so
	A is fully read before B's digits are written.

	Output column k is the sum of all a[i]*a[j] with i + j == k. Off-diagonal
	products are counted twice, either directly (SQRADD2) or by summing
	them once in sc0..sc2 (SQRADDSC/SQRADDAC) and adding that sum twice
	(SQRADDDB). Even columns also get the diagonal term a[k/2]^2 (SQRADD).

	Returns PS_MEM_FAIL if B cannot be grown to 32 digits, PSTM_OKAY
	otherwise.
 */
static int32 pstm_sqr_comba16(pstm_int *A, pstm_int *B)
{
	pstm_digit *a, b[32], c0, c1, c2, sc0, sc1, sc2;
#ifdef PSTM_ISO
	pstm_word   tt;
#endif

	if (B->alloc < 32) {
		if (pstm_grow(B, 32) != PSTM_OKAY) {
			return PS_MEM_FAIL;
		}
	}
	a = A->dp;
	sc0 = sc1 = sc2 = 0;

	COMBA_START;

	/* clear carries */
	CLEAR_CARRY;

	/* output 0 */
	SQRADD(a[0],a[0]);
	COMBA_STORE(b[0]);

	/* output 1 */
	CARRY_FORWARD;
	SQRADD2(a[0], a[1]);
	COMBA_STORE(b[1]);

	/* output 2 */
	CARRY_FORWARD;
	SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
	COMBA_STORE(b[2]);

	/* output 3 */
	CARRY_FORWARD;
	SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
	COMBA_STORE(b[3]);

	/* output 4 */
	CARRY_FORWARD;
	SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
	COMBA_STORE(b[4]);

	/* output 5 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
	COMBA_STORE(b[5]);

	/* output 6 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
	COMBA_STORE(b[6]);

	/* output 7 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
	COMBA_STORE(b[7]);

	/* output 8 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
	COMBA_STORE(b[8]);

	/* output 9 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
	COMBA_STORE(b[9]);

	/* output 10 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
	COMBA_STORE(b[10]);

	/* output 11 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
	COMBA_STORE(b[11]);

	/* output 12 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
	COMBA_STORE(b[12]);

	/* output 13 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
	COMBA_STORE(b[13]);

	/* output 14 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
	COMBA_STORE(b[14]);

	/* output 15 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
	COMBA_STORE(b[15]);

	/* output 16 */
	CARRY_FORWARD;
	SQRADDSC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
	COMBA_STORE(b[16]);

	/* output 17 */
	CARRY_FORWARD;
	SQRADDSC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
	COMBA_STORE(b[17]);

	/* output 18 */
	CARRY_FORWARD;
	SQRADDSC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
	COMBA_STORE(b[18]);

	/* output 19 */
	CARRY_FORWARD;
	SQRADDSC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
	COMBA_STORE(b[19]);

	/* output 20 */
	CARRY_FORWARD;
	SQRADDSC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
	COMBA_STORE(b[20]);

	/* output 21 */
	CARRY_FORWARD;
	SQRADDSC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
	COMBA_STORE(b[21]);

	/* output 22 */
	CARRY_FORWARD;
	SQRADDSC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
	COMBA_STORE(b[22]);

	/* output 23 */
	CARRY_FORWARD;
	SQRADDSC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
	COMBA_STORE(b[23]);

	/* output 24 */
	CARRY_FORWARD;
	SQRADDSC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
	COMBA_STORE(b[24]);

	/* output 25 */
	CARRY_FORWARD;
	SQRADDSC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
	COMBA_STORE(b[25]);

	/* output 26 */
	CARRY_FORWARD;
	SQRADD2(a[11], a[15]); SQRADD2(a[12], a[14]); SQRADD(a[13], a[13]);
	COMBA_STORE(b[26]);

	/* output 27 */
	CARRY_FORWARD;
	SQRADD2(a[12], a[15]); SQRADD2(a[13], a[14]);
	COMBA_STORE(b[27]);

	/* output 28 */
	CARRY_FORWARD;
	SQRADD2(a[13], a[15]); SQRADD(a[14], a[14]);
	COMBA_STORE(b[28]);

	/* output 29 */
	CARRY_FORWARD;
	SQRADD2(a[14], a[15]);
	COMBA_STORE(b[29]);

	/* output 30; the carry left in c1 becomes the top digit, output 31 */
	CARRY_FORWARD;
	SQRADD(a[15], a[15]);
	COMBA_STORE(b[30]);
	COMBA_STORE2(b[31]);
	COMBA_FINI;

	/* publish the result: a square is never negative */
	B->used = 32;
	B->sign = PSTM_ZPOS;
	memcpy(B->dp, b, 32 * sizeof(pstm_digit));
	pstm_clamp(B);
	return PSTM_OKAY;
}
#endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */
 | 
						|
 | 
						|
 | 
						|
#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
 | 
						|
/*
 * Fully unrolled comba squaring for a 32-digit operand.
 *
 * Reads digits a[0]..a[31] of A, so the caller must only pass an A that
 * holds at least 32 digits (the dispatcher calls this when A->used == 32).
 * For every output column k (0..62) the terms a[i], a[j] with i + j == k
 * are fed to the SQRADD* accumulate macros (defined earlier in this file),
 * and the column result is stored in b[k]; b[63] receives the final carry.
 *
 * The result is built in the local array b[] and copied into B only at
 * the end. B is grown to 64 digits first if its allocation is smaller.
 *
 * Returns PS_MEM_FAIL if growing B fails, PSTM_OKAY otherwise.
 */
static int32 pstm_sqr_comba32(pstm_int *A, pstm_int *B)
{
	pstm_digit *a;
	pstm_digit b[64];
	/* c0..c2 and sc0..sc2 keep their names: the accumulate macros are
	 * expected to refer to them directly. */
	pstm_digit c0, c1, c2;
	pstm_digit sc0, sc1, sc2;
#ifdef PSTM_ISO
	pstm_word tt;
#endif

	/* Make room for the 64-digit result before touching any operand. */
	if (B->alloc < 64 && pstm_grow(B, 64) != PSTM_OKAY) {
		return PS_MEM_FAIL;
	}

	/* Fetch the digit pointer only after the grow above. */
	a = A->dp;
	sc0 = sc1 = sc2 = 0;
	COMBA_START;

	/* clear carries */
	CLEAR_CARRY;

	/* output 0 */
	SQRADD(a[0],a[0]);
	COMBA_STORE(b[0]);

	/* output 1 */
	CARRY_FORWARD;
	SQRADD2(a[0], a[1]);
	COMBA_STORE(b[1]);

	/* output 2 */
	CARRY_FORWARD;
	SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
	COMBA_STORE(b[2]);

	/* output 3 */
	CARRY_FORWARD;
	SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
	COMBA_STORE(b[3]);

	/* output 4 */
	CARRY_FORWARD;
	SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
	COMBA_STORE(b[4]);

	/* output 5 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
	COMBA_STORE(b[5]);

	/* output 6 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
	COMBA_STORE(b[6]);

	/* output 7 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
	COMBA_STORE(b[7]);

	/* output 8 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
	COMBA_STORE(b[8]);

	/* output 9 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
	COMBA_STORE(b[9]);

	/* output 10 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
	COMBA_STORE(b[10]);

	/* output 11 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
	COMBA_STORE(b[11]);

	/* output 12 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
	COMBA_STORE(b[12]);

	/* output 13 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
	COMBA_STORE(b[13]);

	/* output 14 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
	COMBA_STORE(b[14]);

	/* output 15 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
	COMBA_STORE(b[15]);

	/* output 16 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
	COMBA_STORE(b[16]);

	/* output 17 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
	COMBA_STORE(b[17]);

	/* output 18 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
	COMBA_STORE(b[18]);

	/* output 19 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
	COMBA_STORE(b[19]);

	/* output 20 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[20]); SQRADDAC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
	COMBA_STORE(b[20]);

	/* output 21 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[21]); SQRADDAC(a[1], a[20]); SQRADDAC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
	COMBA_STORE(b[21]);

	/* output 22 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[22]); SQRADDAC(a[1], a[21]); SQRADDAC(a[2], a[20]); SQRADDAC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
	COMBA_STORE(b[22]);

	/* output 23 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[23]); SQRADDAC(a[1], a[22]); SQRADDAC(a[2], a[21]); SQRADDAC(a[3], a[20]); SQRADDAC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
	COMBA_STORE(b[23]);

	/* output 24 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[24]); SQRADDAC(a[1], a[23]); SQRADDAC(a[2], a[22]); SQRADDAC(a[3], a[21]); SQRADDAC(a[4], a[20]); SQRADDAC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
	COMBA_STORE(b[24]);

	/* output 25 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[25]); SQRADDAC(a[1], a[24]); SQRADDAC(a[2], a[23]); SQRADDAC(a[3], a[22]); SQRADDAC(a[4], a[21]); SQRADDAC(a[5], a[20]); SQRADDAC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
	COMBA_STORE(b[25]);

	/* output 26 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[26]); SQRADDAC(a[1], a[25]); SQRADDAC(a[2], a[24]); SQRADDAC(a[3], a[23]); SQRADDAC(a[4], a[22]); SQRADDAC(a[5], a[21]); SQRADDAC(a[6], a[20]); SQRADDAC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]);
	COMBA_STORE(b[26]);

	/* output 27 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[27]); SQRADDAC(a[1], a[26]); SQRADDAC(a[2], a[25]); SQRADDAC(a[3], a[24]); SQRADDAC(a[4], a[23]); SQRADDAC(a[5], a[22]); SQRADDAC(a[6], a[21]); SQRADDAC(a[7], a[20]); SQRADDAC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB;
	COMBA_STORE(b[27]);

	/* output 28 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[28]); SQRADDAC(a[1], a[27]); SQRADDAC(a[2], a[26]); SQRADDAC(a[3], a[25]); SQRADDAC(a[4], a[24]); SQRADDAC(a[5], a[23]); SQRADDAC(a[6], a[22]); SQRADDAC(a[7], a[21]); SQRADDAC(a[8], a[20]); SQRADDAC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]);
	COMBA_STORE(b[28]);

	/* output 29 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[29]); SQRADDAC(a[1], a[28]); SQRADDAC(a[2], a[27]); SQRADDAC(a[3], a[26]); SQRADDAC(a[4], a[25]); SQRADDAC(a[5], a[24]); SQRADDAC(a[6], a[23]); SQRADDAC(a[7], a[22]); SQRADDAC(a[8], a[21]); SQRADDAC(a[9], a[20]); SQRADDAC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB;
	COMBA_STORE(b[29]);

	/* output 30 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[30]); SQRADDAC(a[1], a[29]); SQRADDAC(a[2], a[28]); SQRADDAC(a[3], a[27]); SQRADDAC(a[4], a[26]); SQRADDAC(a[5], a[25]); SQRADDAC(a[6], a[24]); SQRADDAC(a[7], a[23]); SQRADDAC(a[8], a[22]); SQRADDAC(a[9], a[21]); SQRADDAC(a[10], a[20]); SQRADDAC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]);
	COMBA_STORE(b[30]);

	/* output 31 */
	CARRY_FORWARD;
	SQRADDSC(a[0], a[31]); SQRADDAC(a[1], a[30]); SQRADDAC(a[2], a[29]); SQRADDAC(a[3], a[28]); SQRADDAC(a[4], a[27]); SQRADDAC(a[5], a[26]); SQRADDAC(a[6], a[25]); SQRADDAC(a[7], a[24]); SQRADDAC(a[8], a[23]); SQRADDAC(a[9], a[22]); SQRADDAC(a[10], a[21]); SQRADDAC(a[11], a[20]); SQRADDAC(a[12], a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB;
	COMBA_STORE(b[31]);

	/* output 32 */
	CARRY_FORWARD;
	SQRADDSC(a[1], a[31]); SQRADDAC(a[2], a[30]); SQRADDAC(a[3], a[29]); SQRADDAC(a[4], a[28]); SQRADDAC(a[5], a[27]); SQRADDAC(a[6], a[26]); SQRADDAC(a[7], a[25]); SQRADDAC(a[8], a[24]); SQRADDAC(a[9], a[23]); SQRADDAC(a[10], a[22]); SQRADDAC(a[11], a[21]); SQRADDAC(a[12], a[20]); SQRADDAC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]);
	COMBA_STORE(b[32]);

	/* output 33 */
	CARRY_FORWARD;
	SQRADDSC(a[2], a[31]); SQRADDAC(a[3], a[30]); SQRADDAC(a[4], a[29]); SQRADDAC(a[5], a[28]); SQRADDAC(a[6], a[27]); SQRADDAC(a[7], a[26]); SQRADDAC(a[8], a[25]); SQRADDAC(a[9], a[24]); SQRADDAC(a[10], a[23]); SQRADDAC(a[11], a[22]); SQRADDAC(a[12], a[21]); SQRADDAC(a[13], a[20]); SQRADDAC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB;
	COMBA_STORE(b[33]);

	/* output 34 */
	CARRY_FORWARD;
	SQRADDSC(a[3], a[31]); SQRADDAC(a[4], a[30]); SQRADDAC(a[5], a[29]); SQRADDAC(a[6], a[28]); SQRADDAC(a[7], a[27]); SQRADDAC(a[8], a[26]); SQRADDAC(a[9], a[25]); SQRADDAC(a[10], a[24]); SQRADDAC(a[11], a[23]); SQRADDAC(a[12], a[22]); SQRADDAC(a[13], a[21]); SQRADDAC(a[14], a[20]); SQRADDAC(a[15], a[19]); SQRADDAC(a[16], a[18]); SQRADDDB; SQRADD(a[17], a[17]);
	COMBA_STORE(b[34]);

	/* output 35 */
	CARRY_FORWARD;
	SQRADDSC(a[4], a[31]); SQRADDAC(a[5], a[30]); SQRADDAC(a[6], a[29]); SQRADDAC(a[7], a[28]); SQRADDAC(a[8], a[27]); SQRADDAC(a[9], a[26]); SQRADDAC(a[10], a[25]); SQRADDAC(a[11], a[24]); SQRADDAC(a[12], a[23]); SQRADDAC(a[13], a[22]); SQRADDAC(a[14], a[21]); SQRADDAC(a[15], a[20]); SQRADDAC(a[16], a[19]); SQRADDAC(a[17], a[18]); SQRADDDB;
	COMBA_STORE(b[35]);

	/* output 36 */
	CARRY_FORWARD;
	SQRADDSC(a[5], a[31]); SQRADDAC(a[6], a[30]); SQRADDAC(a[7], a[29]); SQRADDAC(a[8], a[28]); SQRADDAC(a[9], a[27]); SQRADDAC(a[10], a[26]); SQRADDAC(a[11], a[25]); SQRADDAC(a[12], a[24]); SQRADDAC(a[13], a[23]); SQRADDAC(a[14], a[22]); SQRADDAC(a[15], a[21]); SQRADDAC(a[16], a[20]); SQRADDAC(a[17], a[19]); SQRADDDB; SQRADD(a[18], a[18]);
	COMBA_STORE(b[36]);

	/* output 37 */
	CARRY_FORWARD;
	SQRADDSC(a[6], a[31]); SQRADDAC(a[7], a[30]); SQRADDAC(a[8], a[29]); SQRADDAC(a[9], a[28]); SQRADDAC(a[10], a[27]); SQRADDAC(a[11], a[26]); SQRADDAC(a[12], a[25]); SQRADDAC(a[13], a[24]); SQRADDAC(a[14], a[23]); SQRADDAC(a[15], a[22]); SQRADDAC(a[16], a[21]); SQRADDAC(a[17], a[20]); SQRADDAC(a[18], a[19]); SQRADDDB;
	COMBA_STORE(b[37]);

	/* output 38 */
	CARRY_FORWARD;
	SQRADDSC(a[7], a[31]); SQRADDAC(a[8], a[30]); SQRADDAC(a[9], a[29]); SQRADDAC(a[10], a[28]); SQRADDAC(a[11], a[27]); SQRADDAC(a[12], a[26]); SQRADDAC(a[13], a[25]); SQRADDAC(a[14], a[24]); SQRADDAC(a[15], a[23]); SQRADDAC(a[16], a[22]); SQRADDAC(a[17], a[21]); SQRADDAC(a[18], a[20]); SQRADDDB; SQRADD(a[19], a[19]);
	COMBA_STORE(b[38]);

	/* output 39 */
	CARRY_FORWARD;
	SQRADDSC(a[8], a[31]); SQRADDAC(a[9], a[30]); SQRADDAC(a[10], a[29]); SQRADDAC(a[11], a[28]); SQRADDAC(a[12], a[27]); SQRADDAC(a[13], a[26]); SQRADDAC(a[14], a[25]); SQRADDAC(a[15], a[24]); SQRADDAC(a[16], a[23]); SQRADDAC(a[17], a[22]); SQRADDAC(a[18], a[21]); SQRADDAC(a[19], a[20]); SQRADDDB;
	COMBA_STORE(b[39]);

	/* output 40 */
	CARRY_FORWARD;
	SQRADDSC(a[9], a[31]); SQRADDAC(a[10], a[30]); SQRADDAC(a[11], a[29]); SQRADDAC(a[12], a[28]); SQRADDAC(a[13], a[27]); SQRADDAC(a[14], a[26]); SQRADDAC(a[15], a[25]); SQRADDAC(a[16], a[24]); SQRADDAC(a[17], a[23]); SQRADDAC(a[18], a[22]); SQRADDAC(a[19], a[21]); SQRADDDB; SQRADD(a[20], a[20]);
	COMBA_STORE(b[40]);

	/* output 41 */
	CARRY_FORWARD;
	SQRADDSC(a[10], a[31]); SQRADDAC(a[11], a[30]); SQRADDAC(a[12], a[29]); SQRADDAC(a[13], a[28]); SQRADDAC(a[14], a[27]); SQRADDAC(a[15], a[26]); SQRADDAC(a[16], a[25]); SQRADDAC(a[17], a[24]); SQRADDAC(a[18], a[23]); SQRADDAC(a[19], a[22]); SQRADDAC(a[20], a[21]); SQRADDDB;
	COMBA_STORE(b[41]);

	/* output 42 */
	CARRY_FORWARD;
	SQRADDSC(a[11], a[31]); SQRADDAC(a[12], a[30]); SQRADDAC(a[13], a[29]); SQRADDAC(a[14], a[28]); SQRADDAC(a[15], a[27]); SQRADDAC(a[16], a[26]); SQRADDAC(a[17], a[25]); SQRADDAC(a[18], a[24]); SQRADDAC(a[19], a[23]); SQRADDAC(a[20], a[22]); SQRADDDB; SQRADD(a[21], a[21]);
	COMBA_STORE(b[42]);

	/* output 43 */
	CARRY_FORWARD;
	SQRADDSC(a[12], a[31]); SQRADDAC(a[13], a[30]); SQRADDAC(a[14], a[29]); SQRADDAC(a[15], a[28]); SQRADDAC(a[16], a[27]); SQRADDAC(a[17], a[26]); SQRADDAC(a[18], a[25]); SQRADDAC(a[19], a[24]); SQRADDAC(a[20], a[23]); SQRADDAC(a[21], a[22]); SQRADDDB;
	COMBA_STORE(b[43]);

	/* output 44 */
	CARRY_FORWARD;
	SQRADDSC(a[13], a[31]); SQRADDAC(a[14], a[30]); SQRADDAC(a[15], a[29]); SQRADDAC(a[16], a[28]); SQRADDAC(a[17], a[27]); SQRADDAC(a[18], a[26]); SQRADDAC(a[19], a[25]); SQRADDAC(a[20], a[24]); SQRADDAC(a[21], a[23]); SQRADDDB; SQRADD(a[22], a[22]);
	COMBA_STORE(b[44]);

	/* output 45 */
	CARRY_FORWARD;
	SQRADDSC(a[14], a[31]); SQRADDAC(a[15], a[30]); SQRADDAC(a[16], a[29]); SQRADDAC(a[17], a[28]); SQRADDAC(a[18], a[27]); SQRADDAC(a[19], a[26]); SQRADDAC(a[20], a[25]); SQRADDAC(a[21], a[24]); SQRADDAC(a[22], a[23]); SQRADDDB;
	COMBA_STORE(b[45]);

	/* output 46 */
	CARRY_FORWARD;
	SQRADDSC(a[15], a[31]); SQRADDAC(a[16], a[30]); SQRADDAC(a[17], a[29]); SQRADDAC(a[18], a[28]); SQRADDAC(a[19], a[27]); SQRADDAC(a[20], a[26]); SQRADDAC(a[21], a[25]); SQRADDAC(a[22], a[24]); SQRADDDB; SQRADD(a[23], a[23]);
	COMBA_STORE(b[46]);

	/* output 47 */
	CARRY_FORWARD;
	SQRADDSC(a[16], a[31]); SQRADDAC(a[17], a[30]); SQRADDAC(a[18], a[29]); SQRADDAC(a[19], a[28]); SQRADDAC(a[20], a[27]); SQRADDAC(a[21], a[26]); SQRADDAC(a[22], a[25]); SQRADDAC(a[23], a[24]); SQRADDDB;
	COMBA_STORE(b[47]);

	/* output 48 */
	CARRY_FORWARD;
	SQRADDSC(a[17], a[31]); SQRADDAC(a[18], a[30]); SQRADDAC(a[19], a[29]); SQRADDAC(a[20], a[28]); SQRADDAC(a[21], a[27]); SQRADDAC(a[22], a[26]); SQRADDAC(a[23], a[25]); SQRADDDB; SQRADD(a[24], a[24]);
	COMBA_STORE(b[48]);

	/* output 49 */
	CARRY_FORWARD;
	SQRADDSC(a[18], a[31]); SQRADDAC(a[19], a[30]); SQRADDAC(a[20], a[29]); SQRADDAC(a[21], a[28]); SQRADDAC(a[22], a[27]); SQRADDAC(a[23], a[26]); SQRADDAC(a[24], a[25]); SQRADDDB;
	COMBA_STORE(b[49]);

	/* output 50 */
	CARRY_FORWARD;
	SQRADDSC(a[19], a[31]); SQRADDAC(a[20], a[30]); SQRADDAC(a[21], a[29]); SQRADDAC(a[22], a[28]); SQRADDAC(a[23], a[27]); SQRADDAC(a[24], a[26]); SQRADDDB; SQRADD(a[25], a[25]);
	COMBA_STORE(b[50]);

	/* output 51 */
	CARRY_FORWARD;
	SQRADDSC(a[20], a[31]); SQRADDAC(a[21], a[30]); SQRADDAC(a[22], a[29]); SQRADDAC(a[23], a[28]); SQRADDAC(a[24], a[27]); SQRADDAC(a[25], a[26]); SQRADDDB;
	COMBA_STORE(b[51]);

	/* output 52 */
	CARRY_FORWARD;
	SQRADDSC(a[21], a[31]); SQRADDAC(a[22], a[30]); SQRADDAC(a[23], a[29]); SQRADDAC(a[24], a[28]); SQRADDAC(a[25], a[27]); SQRADDDB; SQRADD(a[26], a[26]);
	COMBA_STORE(b[52]);

	/* output 53 */
	CARRY_FORWARD;
	SQRADDSC(a[22], a[31]); SQRADDAC(a[23], a[30]); SQRADDAC(a[24], a[29]); SQRADDAC(a[25], a[28]); SQRADDAC(a[26], a[27]); SQRADDDB;
	COMBA_STORE(b[53]);

	/* output 54 */
	CARRY_FORWARD;
	SQRADDSC(a[23], a[31]); SQRADDAC(a[24], a[30]); SQRADDAC(a[25], a[29]); SQRADDAC(a[26], a[28]); SQRADDDB; SQRADD(a[27], a[27]);
	COMBA_STORE(b[54]);

	/* output 55 */
	CARRY_FORWARD;
	SQRADDSC(a[24], a[31]); SQRADDAC(a[25], a[30]); SQRADDAC(a[26], a[29]); SQRADDAC(a[27], a[28]); SQRADDDB;
	COMBA_STORE(b[55]);

	/* output 56 */
	CARRY_FORWARD;
	SQRADDSC(a[25], a[31]); SQRADDAC(a[26], a[30]); SQRADDAC(a[27], a[29]); SQRADDDB; SQRADD(a[28], a[28]);
	COMBA_STORE(b[56]);

	/* output 57 */
	CARRY_FORWARD;
	SQRADDSC(a[26], a[31]); SQRADDAC(a[27], a[30]); SQRADDAC(a[28], a[29]); SQRADDDB;
	COMBA_STORE(b[57]);

	/* output 58 */
	CARRY_FORWARD;
	SQRADD2(a[27], a[31]); SQRADD2(a[28], a[30]); SQRADD(a[29], a[29]);
	COMBA_STORE(b[58]);

	/* output 59 */
	CARRY_FORWARD;
	SQRADD2(a[28], a[31]); SQRADD2(a[29], a[30]);
	COMBA_STORE(b[59]);

	/* output 60 */
	CARRY_FORWARD;
	SQRADD2(a[29], a[31]); SQRADD(a[30], a[30]);
	COMBA_STORE(b[60]);

	/* output 61 */
	CARRY_FORWARD;
	SQRADD2(a[30], a[31]);
	COMBA_STORE(b[61]);

	/* output 62 */
	CARRY_FORWARD;
	SQRADD(a[31], a[31]);
	COMBA_STORE(b[62]);
	/* the carry left after the last column becomes the top digit */
	COMBA_STORE2(b[63]);
	COMBA_FINI;

	/* Publish the staged result, then let pstm_clamp() adjust B. */
	memcpy(B->dp, b, sizeof(b));
	B->sign = PSTM_ZPOS;
	B->used = 64;
	pstm_clamp(B);
	return PSTM_OKAY;
}
 | 
						|
#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
 | 
						|
 | 
						|
/******************************************************************************/
 | 
						|
/*
 | 
						|
 */
 | 
						|
int32 FAST_FUNC pstm_sqr_comba(psPool_t *pool, pstm_int *A, pstm_int *B, pstm_digit *paD,
 | 
						|
		uint32 paDlen)
 | 
						|
{
 | 
						|
#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
 | 
						|
	if (A->used == 16) {
 | 
						|
		return pstm_sqr_comba16(A, B);
 | 
						|
	} else {
 | 
						|
#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
 | 
						|
		if (A->used == 32) {
 | 
						|
			return pstm_sqr_comba32(A, B);
 | 
						|
		}
 | 
						|
#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
 | 
						|
		return pstm_sqr_comba_gen(pool, A, B, paD, paDlen);
 | 
						|
	}
 | 
						|
#else
 | 
						|
#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
 | 
						|
	if (A->used == 32) {
 | 
						|
		return pstm_sqr_comba32(A, B);
 | 
						|
	}
 | 
						|
#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
 | 
						|
	return pstm_sqr_comba_gen(pool, A, B, paD, paDlen);
 | 
						|
#endif
 | 
						|
}
 | 
						|
 | 
						|
#endif /* DISABLE_PSTM */
 | 
						|
/******************************************************************************/
 |