bzip2: work around bad compiler optimization
gcc-6.1.1 x86_64:
function                                             old     new   delta
generateMTFValues                                    380     367     -13

gcc-4.3.1 386:
function                                             old     new   delta
inner_loop                                             -      41     +41
generateMTFValues                                    357     294     -63
------------------------------------------------------------------------------
(add/remove: 1/0 grow/shrink: 0/1 up/down: 41/-63)            Total: -22 bytes

gcc-6.3.0 386:
function                                             old     new   delta
inner_loop                                             -      36     +36
generateMTFValues                                    363     250    -113
------------------------------------------------------------------------------
(add/remove: 1/0 grow/shrink: 0/1 up/down: 36/-113)           Total: -77 bytes

The last case, gcc-6.3.0, runs almost 3 times faster after this change.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
		@@ -158,6 +158,38 @@ void makeMaps_e(EState* s)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*---------------------------------------------------*/
/*
 * Move-to-front step: locate ll_i in yy[], shift every element that
 * preceded it one slot towards the back, and store ll_i in yy[0].
 *
 * yy    - array of byte values, modified in place.
 * ll_i  - value to bring to the front.
 *
 * Returns the index at which ll_i was found (always >= 1).
 *
 * The search starts at yy[1]; yy[0] is never compared against ll_i.
 * There is no bounds check: ll_i must occur somewhere in yy[1..],
 * otherwise the loop walks past the end of the array.
 *
 * This bit of code is performance-critical.
 * On 32bit x86, gcc-6.3.0 was observed to spill the walking pointer to
 * the stack, resulting in abysmal performance (x3 slowdown).
 * Forcing the loop into a separate function alleviates register pressure,
 * and spillage no longer happens.
 * Other versions of gcc do not exhibit this problem, but out-of-line code
 * seems to be helping them too (code is both smaller and faster).
 * Therefore NOINLINE is enabled for the entire 32bit x86 arch for now,
 * without a check for gcc version.
 */
static
#if defined __i386__
NOINLINE
#endif
int inner_loop(uint8_t *yy, uint8_t ll_i)
{
	uint8_t *slot = yy + 1;
	uint8_t carried = *slot;	/* value lifted out of the current slot */

	/* The old front element moves back by one position */
	*slot = yy[0];

	/*
	 * Ripple forward: each step drops the previously lifted value into
	 * the next slot and lifts out whatever was stored there, until the
	 * lifted value is the one being searched for.
	 */
	while (carried != ll_i) {
		uint8_t displaced;

		slot++;
		displaced = *slot;
		*slot = carried;
		carried = displaced;
	}

	/* carried == ll_i here: it becomes the new front element */
	yy[0] = carried;

	return (int)(slot - yy);
}
 | 
			
		||||
static NOINLINE
 | 
			
		||||
void generateMTFValues(EState* s)
 | 
			
		||||
{
 | 
			
		||||
@@ -165,7 +197,6 @@ void generateMTFValues(EState* s)
 | 
			
		||||
	int i;
 | 
			
		||||
	int zPend;
 | 
			
		||||
	int32_t wr;
 | 
			
		||||
	int32_t EOB;
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * After sorting (eg, here),
 | 
			
		||||
@@ -189,15 +220,12 @@ void generateMTFValues(EState* s)
 | 
			
		||||
	 * compressBlock().
 | 
			
		||||
	 */
 | 
			
		||||
	uint32_t* ptr   = s->ptr;
 | 
			
		||||
	uint8_t*  block = s->block;
 | 
			
		||||
	uint16_t* mtfv  = s->mtfv;
 | 
			
		||||
 | 
			
		||||
	makeMaps_e(s);
 | 
			
		||||
	EOB = s->nInUse+1;
 | 
			
		||||
 | 
			
		||||
	wr = 0;
 | 
			
		||||
	zPend = 0;
 | 
			
		||||
	for (i = 0; i <= EOB; i++)
 | 
			
		||||
	for (i = 0; i <= s->nInUse+1; i++)
 | 
			
		||||
		s->mtfFreq[i] = 0;
 | 
			
		||||
 | 
			
		||||
	for (i = 0; i < s->nInUse; i++)
 | 
			
		||||
@@ -211,7 +239,7 @@ void generateMTFValues(EState* s)
 | 
			
		||||
		j = ptr[i] - 1;
 | 
			
		||||
		if (j < 0)
 | 
			
		||||
			j += s->nblock;
 | 
			
		||||
		ll_i = s->unseqToSeq[block[j]];
 | 
			
		||||
		ll_i = s->unseqToSeq[s->block[j]];
 | 
			
		||||
		AssertD(ll_i < s->nInUse, "generateMTFValues(2a)");
 | 
			
		||||
 | 
			
		||||
		if (yy[0] == ll_i) {
 | 
			
		||||
@@ -225,15 +253,15 @@ void generateMTFValues(EState* s)
 | 
			
		||||
			while (1) {
 | 
			
		||||
#if 0
 | 
			
		||||
				if (zPend & 1) {
 | 
			
		||||
					mtfv[wr] = BZ_RUNB; wr++;
 | 
			
		||||
					s->mtfv[wr] = BZ_RUNB; wr++;
 | 
			
		||||
					s->mtfFreq[BZ_RUNB]++;
 | 
			
		||||
				} else {
 | 
			
		||||
					mtfv[wr] = BZ_RUNA; wr++;
 | 
			
		||||
					s->mtfv[wr] = BZ_RUNA; wr++;
 | 
			
		||||
					s->mtfFreq[BZ_RUNA]++;
 | 
			
		||||
				}
 | 
			
		||||
#else /* same as above, since BZ_RUNA is 0 and BZ_RUNB is 1 */
 | 
			
		||||
				unsigned run = zPend & 1;
 | 
			
		||||
				mtfv[wr] = run;
 | 
			
		||||
				s->mtfv[wr] = run;
 | 
			
		||||
				wr++;
 | 
			
		||||
				s->mtfFreq[run]++;
 | 
			
		||||
#endif
 | 
			
		||||
@@ -247,36 +275,19 @@ void generateMTFValues(EState* s)
 | 
			
		||||
				goto end;
 | 
			
		||||
			zPend = 0;
 | 
			
		||||
		}
 | 
			
		||||
		{
 | 
			
		||||
			register uint8_t  rtmp;
 | 
			
		||||
			register uint8_t* ryy_j;
 | 
			
		||||
			register uint8_t  rll_i;
 | 
			
		||||
			rtmp  = yy[1];
 | 
			
		||||
			yy[1] = yy[0];
 | 
			
		||||
			ryy_j = &(yy[1]);
 | 
			
		||||
			rll_i = ll_i;
 | 
			
		||||
			while (rll_i != rtmp) {
 | 
			
		||||
				register uint8_t rtmp2;
 | 
			
		||||
				ryy_j++;
 | 
			
		||||
				rtmp2  = rtmp;
 | 
			
		||||
				rtmp   = *ryy_j;
 | 
			
		||||
				*ryy_j = rtmp2;
 | 
			
		||||
			}
 | 
			
		||||
			yy[0] = rtmp;
 | 
			
		||||
			j = ryy_j - &(yy[0]);
 | 
			
		||||
			mtfv[wr] = j+1;
 | 
			
		||||
			wr++;
 | 
			
		||||
			s->mtfFreq[j+1]++;
 | 
			
		||||
		}
 | 
			
		||||
		j = inner_loop(yy, ll_i);
 | 
			
		||||
		s->mtfv[wr] = j+1;
 | 
			
		||||
		wr++;
 | 
			
		||||
		s->mtfFreq[j+1]++;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	i = -1;
 | 
			
		||||
	if (zPend > 0)
 | 
			
		||||
		goto process_zPend; /* "process it and come back here" */
 | 
			
		||||
 end:
 | 
			
		||||
	mtfv[wr] = EOB;
 | 
			
		||||
	s->mtfv[wr] = s->nInUse+1;
 | 
			
		||||
	wr++;
 | 
			
		||||
	s->mtfFreq[EOB]++;
 | 
			
		||||
	s->mtfFreq[s->nInUse+1]++;
 | 
			
		||||
 | 
			
		||||
	s->nMTF = wr;
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user