diff --git a/networking/tls_aesgcm.c b/networking/tls_aesgcm.c
index 688df85fb..008dc9b5d 100644
--- a/networking/tls_aesgcm.c
+++ b/networking/tls_aesgcm.c
@@ -35,17 +35,66 @@ static ALWAYS_INLINE void FlattenSzInBits(byte* buf, word32 sz)
 
 static void RIGHTSHIFTX(byte* x)
 {
-    int i;
-    int carryOut = 0;
-    int carryIn = 0;
-    int borrow = x[15] & 0x01;
+#define l ((unsigned long*)x)
+#if 0
 
+    // Generic byte-at-a-time algorithm
+    int i;
+    byte carryIn = (x[15] & 0x01) ? 0xE1 : 0;
     for (i = 0; i < AES_BLOCK_SIZE; i++) {
-        carryOut = x[i] & 0x01;
-        x[i] = (x[i] >> 1) | (carryIn ? 0x80 : 0);
+        byte carryOut = (x[i] << 7); // zero, or 0x80
+        x[i] = (x[i] >> 1) ^ carryIn;
         carryIn = carryOut;
     }
-    if (borrow) x[0] ^= 0xE1;
+
+#elif BB_BIG_ENDIAN
+
+    // Big-endian can shift-right in larger than byte chunks
+    // (we use the fact that 'x' is long-aligned)
+    unsigned long carryIn = (x[15] & 0x01)
+        ? ((unsigned long)0xE1 << (LONG_BIT-8))
+        : 0;
+# if ULONG_MAX <= 0xffffffff
+    int i;
+    for (i = 0; i < AES_BLOCK_SIZE/sizeof(long); i++) {
+        unsigned long carryOut = l[i] << (LONG_BIT-1); // zero, or 0x800..00
+        l[i] = (l[i] >> 1) ^ carryIn;
+        carryIn = carryOut;
+    }
+# else
+    // 64-bit code: need to process only 2 words
+    unsigned long carryOut = l[0] << (LONG_BIT-1); // zero, or 0x800..00
+    l[0] = (l[0] >> 1) ^ carryIn;
+    l[1] = (l[1] >> 1) ^ carryOut;
+# endif
+
+#else /* LITTLE_ENDIAN */
+
+    // In order to use word-sized ops, little-endian needs to byteswap.
+    // On x86, code size increase is ~10 bytes compared to byte-by-byte.
+    unsigned long carryIn = (x[15] & 0x01)
+        ? ((unsigned long)0xE1 << (LONG_BIT-8))
+        : 0;
+# if ULONG_MAX <= 0xffffffff
+    int i;
+    for (i = 0; i < AES_BLOCK_SIZE/sizeof(long); i++) {
+        unsigned long ti = SWAP_BE32(l[i]);
+        unsigned long carryOut = ti << (LONG_BIT-1); // zero, or 0x800..00
+        ti = (ti >> 1) ^ carryIn;
+        l[i] = SWAP_BE32(ti);
+        carryIn = carryOut;
+    }
+# else
+    // 64-bit code: need to process only 2 words
+    unsigned long tt = SWAP_BE64(l[0]);
+    unsigned long carryOut = tt << (LONG_BIT-1); // zero, or 0x800..00
+    tt = (tt >> 1) ^ carryIn; l[0] = SWAP_BE64(tt);
+    tt = SWAP_BE64(l[1]);
+    tt = (tt >> 1) ^ carryOut; l[1] = SWAP_BE64(tt);
+# endif
+
+#endif /* LITTLE_ENDIAN */
+#undef l
 }
 
 static void GMULT(byte* X, byte* Y)