top: adapt utf8 logic to support extra wide characters

Back when top was refactored to support UTF-8 encoding
it was acknowledged that languages like zh_CN were not
supported. That was because a single 'character' might
require more than a single 'column' when it's printed.

Well I've now figured out how to accommodate languages
like that. My adaptation is represented in this patch.

[ and just in case someone wishes to avoid the extra ]
[ runtime costs, a #define OFF_XTRAWIDE is included. ]

Along the way, I've cleaned up some miscellaneous code
supporting the 'Inspect' feature so that the rightmost
screen column is now always used rather than left blank.

[ interestingly, my xterm & urxvt terminal emulators ]
[ are able to split extra wide characters then print ]
[ 1/2 of such graphics in the last column. the gnome ]
[ terminal emulator does not duplicate such behavior ]
[ but prints 1 extra character in same width window. ]

Reference(s):
. Sep, 2017 - original utf8 support
commit 9773c56add6446d418c0677f306c8771356f0c01

Signed-off-by: Jim Warner <james.warner@comcast.net>
This commit is contained in:
Jim Warner 2018-01-08 00:00:00 -06:00 committed by Craig Small
parent 6f2e66969a
commit 264790d80d
2 changed files with 41 additions and 18 deletions

View File

@ -37,6 +37,7 @@
#include <termios.h>
#include <time.h>
#include <unistd.h>
#include <wchar.h>
#include <sys/ioctl.h>
#include <sys/resource.h>
@ -504,6 +505,24 @@ static char UTF8_tab[] = {
}; // ( 0xF5 & beyond invalid )
/*
 * Accommodate any potential differences between some multibyte
 * character sequence and the screen columns needed to print it
 *
 * p = first byte of the sequence,  n = its length in bytes
 * returns the columns required, which is never less than 1
 * ( with OFF_XTRAWIDE defined, every sequence counts as 1 ) */
static inline int utf8_cols (const unsigned char *p, int n) {
#ifndef OFF_XTRAWIDE
   wchar_t wc;
   int wlen;

   // a caller may pass along the -1 'invalid' marker from UTF8_tab,
   // which must never reach mbtowc as some enormous size_t value ...
   if (n < 1) return 1;
   // on a decoding error wc was never stored, so don't consult it;
   // also discard any partial conversion state so that the next
   // call begins cleanly instead of continuing a broken sequence
   if (0 > mbtowc(&wc, (const char *)p, (size_t)n)) {
      (void)mbtowc(NULL, NULL, 0);
      return 1;
   }
   // non-printable (-1) & zero width results still occupy 1 column
   if ((wlen = wcwidth(wc)) < 1) wlen = 1;
   return wlen;
#else
   (void)p; (void)n;
   return 1;
#endif
} // end: utf8_cols
/*
* Determine difference between total bytes versus printable
* characters in that passed, potentially multi-byte, string */
@ -514,8 +533,8 @@ static int utf8_delta (const char *str) {
while (*p) {
// -1 represents a decoding error, pretend it's untranslated ...
if (0 > (clen = UTF8_tab[*p])) return 0;
cnum += utf8_cols(p, clen);
p += clen;
++cnum;
}
return (int)((const char *)p - str) - cnum;
} // end: utf8_delta
@ -532,8 +551,8 @@ static int utf8_embody (const char *str, int width) {
while (*p) {
// -1 represents a decoding error, pretend it's untranslated ...
if (0 > (clen = UTF8_tab[*p])) return width;
if (width < (cnum += utf8_cols(p, clen))) break;
p += clen;
if (++cnum >= width) break;
}
}
return (int)((const char *)p - str);
@ -2636,15 +2655,15 @@ static void insp_find_str (int ch, int *col, int *row) {
* while visible search matches display with capclr_hdr for emphasis.
* ( we hide ugly plumbing in macros to concentrate on the algorithm ) */
static void insp_mkrow_raw (int col, int row) {
#define maxSZ ( Screen_cols - (to + 1) )
#define maxSZ ( Screen_cols - to )
#define capNO { if (hicap) { putp(Caps_off); hicap = 0; } }
#define mkFND { PUTT("%s%.*s%s", Curwin->capclr_hdr, maxSZ, Insp_sel->fstr, Caps_off); \
fr += Insp_sel->flen -1; to += Insp_sel->flen; hicap = 0; }
#ifndef INSP_JUSTNOT
#define mkCTL { int x = maxSZ; const char *p = fmtmk("^%c", uch + '@'); \
PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", x, p); to += 2; hicap = 1; }
#define mkUNP { int x = maxSZ; const char *p = fmtmk("<%02X>", uch); \
PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", x, p); to += 4; hicap = 1; }
#define mkCTL { const char *p = fmtmk("^%c", uch + '@'); \
PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", maxSZ, p); to += 2; hicap = 1; }
#define mkUNP { const char *p = fmtmk("<%02X>", uch); \
PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", maxSZ, p); to += 4; hicap = 1; }
#else
#define mkCTL { if ((to += 2) <= Screen_cols) \
PUTT("%s^%c", (!hicap) ? Curwin->capclr_msg : "", uch + '@'); hicap = 1; }
@ -2653,7 +2672,7 @@ static void insp_mkrow_raw (int col, int row) {
#endif
#define mkSTD { capNO; if (++to <= Screen_cols) { static char _str[2]; \
_str[0] = uch; putp(_str); } }
char tline[SCREENMAX];
unsigned char tline[SCREENMAX];
int fr, to, ofs;
int hicap = 0;
@ -2661,7 +2680,7 @@ static void insp_mkrow_raw (int col, int row) {
memcpy(tline, Insp_p[row] + col, sizeof(tline));
else tline[0] = '\n';
for (fr = 0, to = 0, ofs = 0; to < Screen_cols -1; fr++) {
for (fr = 0, to = 0, ofs = 0; to < Screen_cols; fr++) {
if (!ofs)
ofs = insp_find_ofs(col + fr, row);
if (col + fr < ofs) {
@ -2694,20 +2713,20 @@ static void insp_mkrow_raw (int col, int row) {
* characters will then be displayed in two positions like '^A'.
* ( assuming they can even get past those 'gettext' utilities ) */
static void insp_mkrow_utf8 (int col, int row) {
#define maxSZ ( Screen_cols - (to + 1) )
#define maxSZ ( Screen_cols - to )
#define mkFND { PUTT("%s%.*s%s", Curwin->capclr_hdr, maxSZ, Insp_sel->fstr, Caps_off); \
fr += Insp_sel->flen; to += Insp_sel->flen; }
#ifndef INSP_JUSTNOT
#define mkCTL { int x = maxSZ; const char *p = fmtmk("^%c", uch + '@'); \
PUTT("%s%.*s%s", Curwin->capclr_msg, x, p, Caps_off); to += 2; }
#define mkCTL { const char *p = fmtmk("^%c", uch + '@'); \
PUTT("%s%.*s%s", Curwin->capclr_msg, maxSZ, p, Caps_off); to += 2; }
#else
#define mkCTL { if ((to += 2) <= Screen_cols) \
PUTT("%s^%c%s", Curwin->capclr_msg, uch + '@', Caps_off); }
#endif
#define mkNUL { buf1[0] = ' '; doPUT(buf1) }
#define doPUT(buf) if (++to <= Screen_cols) putp(buf);
#define doPUT(buf) if ((to += cno) <= Screen_cols) putp(buf);
static char buf1[2], buf2[3], buf3[4], buf4[5];
char tline[BIGBUFSIZ];
unsigned char tline[BIGBUFSIZ];
int fr, to, ofs;
col = utf8_proper_col(Insp_p[row], col, 1);
@ -2715,15 +2734,17 @@ static void insp_mkrow_utf8 (int col, int row) {
memcpy(tline, Insp_p[row] + col, sizeof(tline));
else tline[0] = '\n';
for (fr = 0, to = 0, ofs = 0; to < Screen_cols -1; ) {
for (fr = 0, to = 0, ofs = 0; to < Screen_cols; ) {
if (!ofs)
ofs = insp_find_ofs(col + fr, row);
if (col + fr < ofs) {
unsigned char uch = tline[fr++];
switch (UTF8_tab[(int)uch]) {
unsigned char uch = tline[fr];
int bno = UTF8_tab[uch];
int cno = utf8_cols(&tline[fr++], bno);
switch (bno) {
case 1:
if (uch == '\n') break;
else if (uch < 32) mkCTL
if (uch < 32) mkCTL
else if (uch == 127) mkNUL
else { buf1[0] = uch; doPUT(buf1) }
break;

View File

@ -39,6 +39,7 @@
//#define OFF_SCROLLBK /* disable tty emulators scrollback buffer */
//#define OFF_STDERROR /* disable our stderr buffering (redirect) */
//#define OFF_STDIOLBF /* disable our own stdout _IOFBF override */
//#define OFF_XTRAWIDE /* disable our extra wide multi-byte logic */
//#define PRETENDNOCAP /* use a terminal without essential caps */
//#define QUICK_GRAPHS /* use fast algorithm, accept +2% distort */
//#define RCFILE_NOERR /* rcfile errs silently default, vs. fatal */
@ -553,6 +554,7 @@ typedef struct WIN_t {
//atic void sig_resize (int dont_care_sig);
/*------ Special UTF-8 Multi-Byte support ------------------------------*/
/*atic char UTF8_tab[] = { ... } */
//atic inline int utf8_cols (const unsigned char *p, int n);
//atic int utf8_delta (const char *str);
//atic int utf8_embody (const char *str, int width);
//atic const char *utf8_justify (const char *str, int width, int justr);