From 03569bc50f0d731aa3af94ab600adc59eaac3162 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Sat, 24 Nov 2018 14:08:29 +0100
Subject: [PATCH] tls: speed up xor'ing of aligned 16-byte buffers

function                                             old     new   delta
xorbuf_aligned_AES_BLOCK_SIZE                          -      23     +23
xwrite_encrypted                                     585     580      -5
aesgcm_GHASH                                         233     228      -5
GMULT                                                192     187      -5
------------------------------------------------------------------------------
(add/remove: 1/0 grow/shrink: 0/3 up/down: 23/-15)              Total: 8 bytes

Signed-off-by: Denys Vlasenko
---
Reviewer notes (below the "---" marker, so not part of the commit message):
 * xorbuf_aligned_AES_BLOCK_SIZE() reads and writes both 16-byte buffers
   through "unsigned long *": four words when ULONG_MAX == 0xffffffff,
   two words when ULONG_MAX fits in 64 bits, one word otherwise.
   Every buffer passed to it in this patch is declared ALIGNED_long,
   i.e. ALIGNED(sizeof(long)).
 * The per-block loop in aesgcm_GHASH() keeps plain xorbuf() because
   "c" is not guaranteed to be aligned.
 * Line breaks were restored from the hunk headers; intra-line whitespace
   may differ from the tree, so "git apply --ignore-whitespace" may be
   needed - TODO confirm against the target tree.

 networking/tls.c        | 34 ++++++++++++++++++++++++----------
 networking/tls.h        |  4 ++++
 networking/tls_aesgcm.c | 15 ++++++++-------
 3 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/networking/tls.c b/networking/tls.c
index 1f8c21f8b..b774340ae 100644
--- a/networking/tls.c
+++ b/networking/tls.c
@@ -357,6 +357,20 @@ void FAST_FUNC xorbuf(void *dst, const void *src, unsigned count)
 	xorbuf3(dst, dst, src, count);
 }
 
+void FAST_FUNC xorbuf_aligned_AES_BLOCK_SIZE(void *dst, const void *src)
+{
+	unsigned long *d = dst;
+	const unsigned long *s = src;
+	d[0] ^= s[0];
+#if ULONG_MAX <= 0xffffffffffffffff
+	d[1] ^= s[1];
+ #if ULONG_MAX == 0xffffffff
+	d[2] ^= s[2];
+	d[3] ^= s[3];
+ #endif
+#endif
+}
+
 /* Nondestructively see the current hash value */
 static unsigned sha_peek(md5sha_ctx_t *ctx, void *buffer)
 {
@@ -802,10 +816,10 @@ static void xwrite_encrypted_aesgcm(tls_state_t *tls, unsigned size, unsigned ty
 {
 #define COUNTER(v) (*(uint32_t*)(v + 12))
 
-	uint8_t aad[13 + 3] ALIGNED(4); /* +3 creates [16] buffer, simplifying GHASH() */
-	uint8_t nonce[12 + 4] ALIGNED(4); /* +4 creates space for AES block counter */
-	uint8_t scratch[AES_BLOCK_SIZE] ALIGNED(4); //[16]
-	uint8_t authtag[AES_BLOCK_SIZE] ALIGNED(4); //[16]
+	uint8_t aad[13 + 3] ALIGNED_long; /* +3 creates [16] buffer, simplifying GHASH() */
+	uint8_t nonce[12 + 4] ALIGNED_long; /* +4 creates space for AES block counter */
+	uint8_t scratch[AES_BLOCK_SIZE] ALIGNED_long; //[16]
+	uint8_t authtag[AES_BLOCK_SIZE] ALIGNED_long; //[16]
 	uint8_t *buf;
 	struct record_hdr *xhdr;
 	unsigned remaining;
@@ -850,7 +864,7 @@ static void xwrite_encrypted_aesgcm(tls_state_t *tls, unsigned size, unsigned ty
 	aesgcm_GHASH(tls->H, aad, /*sizeof(aad),*/ tls->outbuf + OUTBUF_PFX, size, authtag /*, sizeof(authtag)*/);
 	COUNTER(nonce) = htonl(1);
 	aes_encrypt_one_block(&tls->aes_encrypt, nonce, scratch);
-	xorbuf(authtag, scratch, sizeof(authtag));
+	xorbuf_aligned_AES_BLOCK_SIZE(authtag, scratch);
 
 	memcpy(buf, authtag, sizeof(authtag));
 #undef COUNTER
@@ -938,10 +952,10 @@ static void tls_aesgcm_decrypt(tls_state_t *tls, uint8_t *buf, int size)
 {
 #define COUNTER(v) (*(uint32_t*)(v + 12))
 
-	//uint8_t aad[13 + 3] ALIGNED(4); /* +3 creates [16] buffer, simplifying GHASH() */
-	uint8_t nonce[12 + 4] ALIGNED(4); /* +4 creates space for AES block counter */
-	uint8_t scratch[AES_BLOCK_SIZE] ALIGNED(4); //[16]
-	//uint8_t authtag[AES_BLOCK_SIZE] ALIGNED(4); //[16]
+	//uint8_t aad[13 + 3] ALIGNED_long; /* +3 creates [16] buffer, simplifying GHASH() */
+	uint8_t nonce[12 + 4] ALIGNED_long; /* +4 creates space for AES block counter */
+	uint8_t scratch[AES_BLOCK_SIZE] ALIGNED_long; //[16]
+	//uint8_t authtag[AES_BLOCK_SIZE] ALIGNED_long; //[16]
 	unsigned remaining;
 	unsigned cnt;
 
@@ -973,7 +987,7 @@ static void tls_aesgcm_decrypt(tls_state_t *tls, uint8_t *buf, int size)
 	//aesgcm_GHASH(tls->H, aad, tls->inbuf + RECHDR_LEN, size, authtag);
 	//COUNTER(nonce) = htonl(1);
 	//aes_encrypt_one_block(&tls->aes_encrypt, nonce, scratch);
-	//xorbuf(authtag, scratch, sizeof(authtag));
+	//xorbuf_aligned_AES_BLOCK_SIZE(authtag, scratch);
 
 	//memcmp(buf, authtag, sizeof(authtag)) || DIE("HASH DOES NOT MATCH!");
 #undef COUNTER
diff --git a/networking/tls.h b/networking/tls.h
index 4b0dc7459..494ed78c4 100644
--- a/networking/tls.h
+++ b/networking/tls.h
@@ -81,8 +81,12 @@ typedef int16_t int16;
 #define AES_BLOCK_SIZE 16
 
 void tls_get_random(void *buf, unsigned len) FAST_FUNC;
+
 void xorbuf(void* buf, const void* mask, unsigned count) FAST_FUNC;
 
+#define ALIGNED_long ALIGNED(sizeof(long))
+void xorbuf_aligned_AES_BLOCK_SIZE(void* buf, const void* mask) FAST_FUNC;
+
 #define matrixCryptoGetPrngData(buf, len, userPtr) (tls_get_random(buf, len), PS_SUCCESS)
 
 #define psFree(p, pool) free(p)
diff --git a/networking/tls_aesgcm.c b/networking/tls_aesgcm.c
index db720e5f6..fd72540c4 100644
--- a/networking/tls_aesgcm.c
+++ b/networking/tls_aesgcm.c
@@ -50,8 +50,8 @@ static void RIGHTSHIFTX(byte* x)
 
 static void GMULT(byte* X, byte* Y)
 {
-    byte Z[AES_BLOCK_SIZE];
-    byte V[AES_BLOCK_SIZE];
+    byte Z[AES_BLOCK_SIZE] ALIGNED_long;
+    byte V[AES_BLOCK_SIZE] ALIGNED_long;
     int i, j;
 
     XMEMSET(Z, 0, AES_BLOCK_SIZE);
@@ -62,7 +62,7 @@ static void GMULT(byte* X, byte* Y)
         for (j = 0; j < 8; j++)
         {
             if (y & 0x80) {
-                xorbuf(Z, V, AES_BLOCK_SIZE);
+                xorbuf_aligned_AES_BLOCK_SIZE(Z, V);
             }
 
             RIGHTSHIFTX(V);
@@ -86,8 +86,8 @@ void FAST_FUNC aesgcm_GHASH(byte* h,
     byte* s //, unsigned sSz
 )
 {
-    byte x[AES_BLOCK_SIZE] ALIGNED(4);
-    byte scratch[AES_BLOCK_SIZE] ALIGNED(4);
+    byte x[AES_BLOCK_SIZE] ALIGNED_long;
+    byte scratch[AES_BLOCK_SIZE] ALIGNED_long;
     word32 blocks, partial;
     //was: byte* h = aes->H;
 
@@ -116,6 +116,7 @@ void FAST_FUNC aesgcm_GHASH(byte* h,
         blocks = cSz / AES_BLOCK_SIZE;
         partial = cSz % AES_BLOCK_SIZE;
         while (blocks--) {
+            //xorbuf_aligned_AES_BLOCK_SIZE(x, c); - c is not guaranteed to be aligned
             xorbuf(x, c, AES_BLOCK_SIZE);
             GMULT(x, h);
             c += AES_BLOCK_SIZE;
@@ -124,7 +125,7 @@ void FAST_FUNC aesgcm_GHASH(byte* h,
             //XMEMSET(scratch, 0, AES_BLOCK_SIZE);
             //XMEMCPY(scratch, c, partial);
             //xorbuf(x, scratch, AES_BLOCK_SIZE);
-            xorbuf(x, c, partial);
+            xorbuf(x, c, partial);//same result as above
             GMULT(x, h);
         }
     }
@@ -132,7 +133,7 @@ void FAST_FUNC aesgcm_GHASH(byte* h,
     /* Hash in the lengths of A and C in bits */
     FlattenSzInBits(&scratch[0], aSz);
     FlattenSzInBits(&scratch[8], cSz);
-    xorbuf(x, scratch, AES_BLOCK_SIZE);
+    xorbuf_aligned_AES_BLOCK_SIZE(x, scratch);
     GMULT(x, h);
 
     /* Copy the result into s. */