tls: speed up xor'ing of aligned 16-byte buffers

function                                             old     new   delta
xorbuf_aligned_AES_BLOCK_SIZE                          -      23     +23
xwrite_encrypted                                     585     580      -5
aesgcm_GHASH                                         233     228      -5
GMULT                                                192     187      -5
------------------------------------------------------------------------------
(add/remove: 1/0 grow/shrink: 0/3 up/down: 23/-15)              Total: 8 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Author: Denys Vlasenko <vda.linux@googlemail.com>
Date:   2018-11-24 14:08:29 +01:00
Commit: 03569bc50f (parent: 941440cf16)

3 changed files with 36 additions and 17 deletions
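
For readers skimming the numbers: the win comes from XOR'ing a 16-byte AES block as two long-wide operations (LP64) or four (ILP32) instead of sixteen byte loads and stores. A minimal standalone sketch of the technique, checked against a byte-wise loop (xor16_wordwise, xor16_bytewise and the driver are illustrative names, not busybox code):

#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define AES_BLOCK_SIZE 16

/* Same shape as the new helper: both buffers must be long-aligned,
 * and the block is accessed as unsigned long words */
static void xor16_wordwise(void *dst, const void *src)
{
	unsigned long *d = dst;
	const unsigned long *s = src;
	d[0] ^= s[0];
#if ULONG_MAX <= 0xffffffffffffffff
	d[1] ^= s[1];	/* 64-bit long: two words cover 16 bytes */
#if ULONG_MAX == 0xffffffff
	d[2] ^= s[2];	/* 32-bit long: four words cover 16 bytes */
	d[3] ^= s[3];
#endif
#endif
}

/* Reference: what a generic byte-wise xorbuf()-style loop does */
static void xor16_bytewise(uint8_t *dst, const uint8_t *src)
{
	int i;
	for (i = 0; i < AES_BLOCK_SIZE; i++)
		dst[i] ^= src[i];
}

int main(void)
{
	/* the unsigned long member forces the alignment the fast path needs,
	 * much like ALIGNED_long does for the stack buffers in the patch */
	union { unsigned long align; uint8_t b[AES_BLOCK_SIZE]; } a, ref, mask;
	int i;

	for (i = 0; i < AES_BLOCK_SIZE; i++) {
		a.b[i] = (uint8_t)(i * 7 + 1);
		mask.b[i] = (uint8_t)(0xA5 ^ i);
	}
	memcpy(ref.b, a.b, sizeof(ref.b));

	xor16_wordwise(a.b, mask.b);
	xor16_bytewise(ref.b, mask.b);

	printf("word-wise vs byte-wise: %s\n",
		memcmp(a.b, ref.b, AES_BLOCK_SIZE) == 0 ? "match" : "DIFFER");
	return 0;
}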

--- a/networking/tls.c
+++ b/networking/tls.c
@@ -357,6 +357,20 @@ void FAST_FUNC xorbuf(void *dst, const void *src, unsigned count)
 	xorbuf3(dst, dst, src, count);
 }
 
+void FAST_FUNC xorbuf_aligned_AES_BLOCK_SIZE(void *dst, const void *src)
+{
+	unsigned long *d = dst;
+	const unsigned long *s = src;
+	d[0] ^= s[0];
+#if ULONG_MAX <= 0xffffffffffffffff
+	d[1] ^= s[1];
+#if ULONG_MAX == 0xffffffff
+	d[2] ^= s[2];
+	d[3] ^= s[3];
+#endif
+#endif
+}
+
 /* Nondestructively see the current hash value */
 static unsigned sha_peek(md5sha_ctx_t *ctx, void *buffer)
 {
@@ -802,10 +816,10 @@ static void xwrite_encrypted_aesgcm(tls_state_t *tls, unsigned size, unsigned ty
 {
 #define COUNTER(v) (*(uint32_t*)(v + 12))
 
-	uint8_t aad[13 + 3] ALIGNED(4); /* +3 creates [16] buffer, simplifying GHASH() */
-	uint8_t nonce[12 + 4] ALIGNED(4); /* +4 creates space for AES block counter */
-	uint8_t scratch[AES_BLOCK_SIZE] ALIGNED(4); //[16]
-	uint8_t authtag[AES_BLOCK_SIZE] ALIGNED(4); //[16]
+	uint8_t aad[13 + 3] ALIGNED_long; /* +3 creates [16] buffer, simplifying GHASH() */
+	uint8_t nonce[12 + 4] ALIGNED_long; /* +4 creates space for AES block counter */
+	uint8_t scratch[AES_BLOCK_SIZE] ALIGNED_long; //[16]
+	uint8_t authtag[AES_BLOCK_SIZE] ALIGNED_long; //[16]
 	uint8_t *buf;
 	struct record_hdr *xhdr;
 	unsigned remaining;
@@ -850,7 +864,7 @@ static void xwrite_encrypted_aesgcm(tls_state_t *tls, unsigned size, unsigned ty
 	aesgcm_GHASH(tls->H, aad, /*sizeof(aad),*/ tls->outbuf + OUTBUF_PFX, size, authtag /*, sizeof(authtag)*/);
 	COUNTER(nonce) = htonl(1);
 	aes_encrypt_one_block(&tls->aes_encrypt, nonce, scratch);
-	xorbuf(authtag, scratch, sizeof(authtag));
+	xorbuf_aligned_AES_BLOCK_SIZE(authtag, scratch);
 	memcpy(buf, authtag, sizeof(authtag));
 
 #undef COUNTER
@@ -938,10 +952,10 @@ static void tls_aesgcm_decrypt(tls_state_t *tls, uint8_t *buf, int size)
 {
 #define COUNTER(v) (*(uint32_t*)(v + 12))
 
-	//uint8_t aad[13 + 3] ALIGNED(4); /* +3 creates [16] buffer, simplifying GHASH() */
-	uint8_t nonce[12 + 4] ALIGNED(4); /* +4 creates space for AES block counter */
-	uint8_t scratch[AES_BLOCK_SIZE] ALIGNED(4); //[16]
-	//uint8_t authtag[AES_BLOCK_SIZE] ALIGNED(4); //[16]
+	//uint8_t aad[13 + 3] ALIGNED_long; /* +3 creates [16] buffer, simplifying GHASH() */
+	uint8_t nonce[12 + 4] ALIGNED_long; /* +4 creates space for AES block counter */
+	uint8_t scratch[AES_BLOCK_SIZE] ALIGNED_long; //[16]
+	//uint8_t authtag[AES_BLOCK_SIZE] ALIGNED_long; //[16]
 	unsigned remaining;
 	unsigned cnt;
@@ -973,7 +987,7 @@ static void tls_aesgcm_decrypt(tls_state_t *tls, uint8_t *buf, int size)
 	//aesgcm_GHASH(tls->H, aad, tls->inbuf + RECHDR_LEN, size, authtag);
 	//COUNTER(nonce) = htonl(1);
 	//aes_encrypt_one_block(&tls->aes_encrypt, nonce, scratch);
-	//xorbuf(authtag, scratch, sizeof(authtag));
+	//xorbuf_aligned_AES_BLOCK_SIZE(authtag, scratch);
 	//memcmp(buf, authtag, sizeof(authtag)) || DIE("HASH DOES NOT MATCH!");
 
 #undef COUNTER
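
The #if ladder in the new helper picks how many long-wide XORs cover the block: one on a hypothetical 128-bit-long target, two on LP64, four on ILP32. A quick compile-time sanity check of that arithmetic (C11 _Static_assert; illustrative only, and assuming unsigned long has no padding bits so ULONG_MAX tracks its size):

#include <limits.h>

#if ULONG_MAX > 0xffffffffffffffff
# define XOR_WORDS 1	/* hypothetical 128-bit long */
#elif ULONG_MAX > 0xffffffff
# define XOR_WORDS 2	/* 64-bit long (LP64) */
#else
# define XOR_WORDS 4	/* 32-bit long (ILP32) */
#endif

_Static_assert(XOR_WORDS * sizeof(unsigned long) == 16,
	"XOR word count must cover exactly one AES block");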

--- a/networking/tls.h
+++ b/networking/tls.h
@@ -81,8 +81,12 @@ typedef int16_t int16;
 
 #define AES_BLOCK_SIZE 16
 
 void tls_get_random(void *buf, unsigned len) FAST_FUNC;
 void xorbuf(void* buf, const void* mask, unsigned count) FAST_FUNC;
+
+#define ALIGNED_long ALIGNED(sizeof(long))
+void xorbuf_aligned_AES_BLOCK_SIZE(void* buf, const void* mask) FAST_FUNC;
+
 #define matrixCryptoGetPrngData(buf, len, userPtr) (tls_get_random(buf, len), PS_SUCCESS)
 
 #define psFree(p, pool) free(p)
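
ALIGNED_long leans on busybox's existing ALIGNED() macro (a wrapper around the GCC/Clang __attribute__((aligned(m))) extension in platform.h) so every stack buffer handed to the new helper is guaranteed long-aligned. A compiler-neutral sketch of the same guarantee using C11 alignas (illustrative, not the busybox macro):

#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	alignas(sizeof(long)) uint8_t authtag[16]; /* plays the role of ALIGNED_long */

	printf("long-aligned: %s\n",
		((uintptr_t)authtag % sizeof(long)) == 0 ? "yes" : "no");
	return 0;
}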

--- a/networking/tls_aesgcm.c
+++ b/networking/tls_aesgcm.c
@@ -50,8 +50,8 @@ static void RIGHTSHIFTX(byte* x)
 
 static void GMULT(byte* X, byte* Y)
 {
-	byte Z[AES_BLOCK_SIZE];
-	byte V[AES_BLOCK_SIZE];
+	byte Z[AES_BLOCK_SIZE] ALIGNED_long;
+	byte V[AES_BLOCK_SIZE] ALIGNED_long;
 	int i, j;
 
 	XMEMSET(Z, 0, AES_BLOCK_SIZE);
@@ -62,7 +62,7 @@ static void GMULT(byte* X, byte* Y)
 		for (j = 0; j < 8; j++)
 		{
 			if (y & 0x80) {
-				xorbuf(Z, V, AES_BLOCK_SIZE);
+				xorbuf_aligned_AES_BLOCK_SIZE(Z, V);
 			}
 
 			RIGHTSHIFTX(V);
@@ -86,8 +86,8 @@ void FAST_FUNC aesgcm_GHASH(byte* h,
 	byte* s //, unsigned sSz
 )
 {
-	byte x[AES_BLOCK_SIZE] ALIGNED(4);
-	byte scratch[AES_BLOCK_SIZE] ALIGNED(4);
+	byte x[AES_BLOCK_SIZE] ALIGNED_long;
+	byte scratch[AES_BLOCK_SIZE] ALIGNED_long;
 	word32 blocks, partial;
 	//was: byte* h = aes->H;
 
@@ -116,6 +116,7 @@ void FAST_FUNC aesgcm_GHASH(byte* h,
 		blocks = cSz / AES_BLOCK_SIZE;
 		partial = cSz % AES_BLOCK_SIZE;
 		while (blocks--) {
+			//xorbuf_aligned_AES_BLOCK_SIZE(x, c); - c is not guaranteed to be aligned
 			xorbuf(x, c, AES_BLOCK_SIZE);
 			GMULT(x, h);
 			c += AES_BLOCK_SIZE;
@@ -124,7 +125,7 @@ void FAST_FUNC aesgcm_GHASH(byte* h,
 			//XMEMSET(scratch, 0, AES_BLOCK_SIZE);
 			//XMEMCPY(scratch, c, partial);
 			//xorbuf(x, scratch, AES_BLOCK_SIZE);
-			xorbuf(x, c, partial);
+			xorbuf(x, c, partial); //same result as above
 			GMULT(x, h);
 		}
 	}
@@ -132,7 +133,7 @@ void FAST_FUNC aesgcm_GHASH(byte* h,
 	/* Hash in the lengths of A and C in bits */
 	FlattenSzInBits(&scratch[0], aSz);
 	FlattenSzInBits(&scratch[8], cSz);
-	xorbuf(x, scratch, AES_BLOCK_SIZE);
+	xorbuf_aligned_AES_BLOCK_SIZE(x, scratch);
 	GMULT(x, h);
 
 	/* Copy the result into s. */
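
Note the comment the patch adds inside the data loop: the ciphertext pointer c comes straight out of the record buffer and is not guaranteed to be long-aligned, so that one XOR stays byte-wise while the 16-byte state buffers (all declared ALIGNED_long) take the fast path. A caller that cannot prove alignment at compile time would need a run-time dispatch along these lines (sketch with assumed names, not part of the patch):

#include <stdint.h>

/* Generic fallback, same behavior as busybox's byte-wise xorbuf() */
static void xor_bytes(uint8_t *d, const uint8_t *s, unsigned n)
{
	while (n--)
		*d++ ^= *s++;
}

/* XOR one 16-byte block, using long-wide accesses only when both
 * pointers are verifiably long-aligned at run time */
void xor16_dispatch(void *dst, const void *src)
{
	if ((((uintptr_t)dst | (uintptr_t)src) % sizeof(long)) == 0) {
		unsigned long *d = dst;
		const unsigned long *s = src;
		unsigned i;
		for (i = 0; i < 16 / sizeof(long); i++)
			d[i] ^= s[i];
	} else {
		xor_bytes(dst, src, 16);
	}
}

The patch avoids the branch entirely: the aligned helper is only ever called on buffers whose alignment is already known when the code is compiled.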