From c2a51b0cf16918482c993c4998a2a920e499a43f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 5 Feb 2018 00:34:08 +0100 Subject: [PATCH] bzip2: work around bad compiler optimization gc-6.1.1 x86_64: function old new delta generateMTFValues 380 367 -13 gcc-4.3.1 386: function old new delta inner_loop - 41 +41 generateMTFValues 357 294 -63 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 0/1 up/down: 41/-63) Total: -22 bytes gcc-6.3.0 386: function old new delta inner_loop - 36 +36 generateMTFValues 363 250 -113 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 0/1 up/down: 36/-113) Total: -77 bytes The last case, gcc-6.3.0, runs almost 3 times faster after this change. Signed-off-by: Denys Vlasenko --- archival/libarchive/bz/compress.c | 75 ++++++++++++++++++------------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/archival/libarchive/bz/compress.c b/archival/libarchive/bz/compress.c index f65076758..462740b6c 100644 --- a/archival/libarchive/bz/compress.c +++ b/archival/libarchive/bz/compress.c @@ -158,6 +158,38 @@ void makeMaps_e(EState* s) /*---------------------------------------------------*/ +/* + * This bit of code is performance-critical. + * On 32bit x86, gcc-6.3.0 was observed to spill ryy_j to stack, + * resulting in abysmal performance (x3 slowdown). + * Forcing it into a separate function alleviates register pressure, + * and spillage no longer happens. + * Other versions of gcc do not exhibit this problem, but out-of-line code + * seems to be helping them too (code is both smaller and faster). + * Therefore NOINLINE is enabled for the entire 32bit x86 arch for now, + * without a check for gcc version. + */ +static +#if defined __i386__ +NOINLINE +#endif +int inner_loop(uint8_t *yy, uint8_t ll_i) +{ + register uint8_t rtmp; + register uint8_t* ryy_j; + rtmp = yy[1]; + yy[1] = yy[0]; + ryy_j = &(yy[1]); + while (ll_i != rtmp) { + register uint8_t rtmp2; + ryy_j++; + rtmp2 = rtmp; + rtmp = *ryy_j; + *ryy_j = rtmp2; + } + yy[0] = rtmp; + return ryy_j - &(yy[0]); +} static NOINLINE void generateMTFValues(EState* s) { @@ -165,7 +197,6 @@ void generateMTFValues(EState* s) int i; int zPend; int32_t wr; - int32_t EOB; /* * After sorting (eg, here), @@ -189,15 +220,12 @@ void generateMTFValues(EState* s) * compressBlock(). */ uint32_t* ptr = s->ptr; - uint8_t* block = s->block; - uint16_t* mtfv = s->mtfv; makeMaps_e(s); - EOB = s->nInUse+1; wr = 0; zPend = 0; - for (i = 0; i <= EOB; i++) + for (i = 0; i <= s->nInUse+1; i++) s->mtfFreq[i] = 0; for (i = 0; i < s->nInUse; i++) @@ -211,7 +239,7 @@ void generateMTFValues(EState* s) j = ptr[i] - 1; if (j < 0) j += s->nblock; - ll_i = s->unseqToSeq[block[j]]; + ll_i = s->unseqToSeq[s->block[j]]; AssertD(ll_i < s->nInUse, "generateMTFValues(2a)"); if (yy[0] == ll_i) { @@ -225,15 +253,15 @@ void generateMTFValues(EState* s) while (1) { #if 0 if (zPend & 1) { - mtfv[wr] = BZ_RUNB; wr++; + s->mtfv[wr] = BZ_RUNB; wr++; s->mtfFreq[BZ_RUNB]++; } else { - mtfv[wr] = BZ_RUNA; wr++; + s->mtfv[wr] = BZ_RUNA; wr++; s->mtfFreq[BZ_RUNA]++; } #else /* same as above, since BZ_RUNA is 0 and BZ_RUNB is 1 */ unsigned run = zPend & 1; - mtfv[wr] = run; + s->mtfv[wr] = run; wr++; s->mtfFreq[run]++; #endif @@ -247,36 +275,19 @@ void generateMTFValues(EState* s) goto end; zPend = 0; } - { - register uint8_t rtmp; - register uint8_t* ryy_j; - register uint8_t rll_i; - rtmp = yy[1]; - yy[1] = yy[0]; - ryy_j = &(yy[1]); - rll_i = ll_i; - while (rll_i != rtmp) { - register uint8_t rtmp2; - ryy_j++; - rtmp2 = rtmp; - rtmp = *ryy_j; - *ryy_j = rtmp2; - } - yy[0] = rtmp; - j = ryy_j - &(yy[0]); - mtfv[wr] = j+1; - wr++; - s->mtfFreq[j+1]++; - } + j = inner_loop(yy, ll_i); + s->mtfv[wr] = j+1; + wr++; + s->mtfFreq[j+1]++; } i = -1; if (zPend > 0) goto process_zPend; /* "process it and come back here" */ end: - mtfv[wr] = EOB; + s->mtfv[wr] = s->nInUse+1; wr++; - s->mtfFreq[EOB]++; + s->mtfFreq[s->nInUse+1]++; s->nMTF = wr; }