top: adapt utf8 logic to support extra wide characters

Back when top was refactored to support UTF-8 encoding
it was acknowledged that languages like zh_CN were not
supported. That was because a single 'character' might
require more than a single 'column' when it's printed.

Well I've now figured out how to accommodate languages
like that. My adaptation is represented in this patch.

[ and just in case someone wishes to avoid the extra ]
[ runtime costs, a #define OFF_XTRAWIDE is included. ]

Along the way, I've cleaned up some miscellaneous code
supporting the 'Inspect' feature so that the rightmost
screen column is now always used rather than left blank.

[ interestingly, my xterm & urxvt terminal emulators ]
[ are able to split extra wide characters then print ]
[ 1/2 of such graphics in the last column. The GNOME ]
[ terminal emulator does not duplicate such behavior ]
[ but prints 1 extra character in same width window. ]

Reference(s):
. Sep, 2017 - original utf8 support
commit 9773c56add6446d418c0677f306c8771356f0c01

Signed-off-by: Jim Warner <james.warner@comcast.net>
This commit is contained in:
Jim Warner 2018-01-08 00:00:00 -06:00 committed by Craig Small
parent 6f2e66969a
commit 264790d80d
2 changed files with 41 additions and 18 deletions

View File

@ -37,6 +37,7 @@
#include <termios.h> #include <termios.h>
#include <time.h> #include <time.h>
#include <unistd.h> #include <unistd.h>
#include <wchar.h>
#include <sys/ioctl.h> #include <sys/ioctl.h>
#include <sys/resource.h> #include <sys/resource.h>
@ -504,6 +505,24 @@ static char UTF8_tab[] = {
}; // ( 0xF5 & beyond invalid ) }; // ( 0xF5 & beyond invalid )
/*
 * Accommodate any potential differences between some multibyte
 * character sequence and the screen columns needed to print it.
 *
 *   p - start of a single multibyte character sequence
 *   n - number of bytes making up that sequence
 *
 * Returns the number of screen columns to charge for the character,
 * which is never less than 1. When OFF_XTRAWIDE is defined, every
 * sequence is treated as exactly one column wide. */
static inline int utf8_cols (const unsigned char *p, int n) {
#ifndef OFF_XTRAWIDE
   wchar_t wc = L'\0';
   int wlen;

   // a non-positive length would turn into a huge size_t for mbtowc
   if (n < 1) return 1;
   // on a failed conversion wc holds nothing meaningful, charge 1 column
   if (mbtowc(&wc, (const char *)p, (size_t)n) < 1) return 1;
   // anything reported narrower than 1 column is still charged 1 column
   if ((wlen = wcwidth(wc)) < 1) wlen = 1;
   return wlen;
#else
   (void)p; (void)n;
   return 1;
#endif
} // end: utf8_cols
/* /*
* Determine difference between total bytes versus printable * Determine difference between total bytes versus printable
* characters in that passed, potentially multi-byte, string */ * characters in that passed, potentially multi-byte, string */
@ -514,8 +533,8 @@ static int utf8_delta (const char *str) {
while (*p) { while (*p) {
// -1 represents a decoding error, pretend it's untranslated ... // -1 represents a decoding error, pretend it's untranslated ...
if (0 > (clen = UTF8_tab[*p])) return 0; if (0 > (clen = UTF8_tab[*p])) return 0;
cnum += utf8_cols(p, clen);
p += clen; p += clen;
++cnum;
} }
return (int)((const char *)p - str) - cnum; return (int)((const char *)p - str) - cnum;
} // end: utf8_delta } // end: utf8_delta
@ -532,8 +551,8 @@ static int utf8_embody (const char *str, int width) {
while (*p) { while (*p) {
// -1 represents a decoding error, pretend it's untranslated ... // -1 represents a decoding error, pretend it's untranslated ...
if (0 > (clen = UTF8_tab[*p])) return width; if (0 > (clen = UTF8_tab[*p])) return width;
if (width < (cnum += utf8_cols(p, clen))) break;
p += clen; p += clen;
if (++cnum >= width) break;
} }
} }
return (int)((const char *)p - str); return (int)((const char *)p - str);
@ -2636,15 +2655,15 @@ static void insp_find_str (int ch, int *col, int *row) {
* while visible search matches display with capclr_hdr for emphasis. * while visible search matches display with capclr_hdr for emphasis.
* ( we hide ugly plumbing in macros to concentrate on the algorithm ) */ * ( we hide ugly plumbing in macros to concentrate on the algorithm ) */
static void insp_mkrow_raw (int col, int row) { static void insp_mkrow_raw (int col, int row) {
#define maxSZ ( Screen_cols - (to + 1) ) #define maxSZ ( Screen_cols - to )
#define capNO { if (hicap) { putp(Caps_off); hicap = 0; } } #define capNO { if (hicap) { putp(Caps_off); hicap = 0; } }
#define mkFND { PUTT("%s%.*s%s", Curwin->capclr_hdr, maxSZ, Insp_sel->fstr, Caps_off); \ #define mkFND { PUTT("%s%.*s%s", Curwin->capclr_hdr, maxSZ, Insp_sel->fstr, Caps_off); \
fr += Insp_sel->flen -1; to += Insp_sel->flen; hicap = 0; } fr += Insp_sel->flen -1; to += Insp_sel->flen; hicap = 0; }
#ifndef INSP_JUSTNOT #ifndef INSP_JUSTNOT
#define mkCTL { int x = maxSZ; const char *p = fmtmk("^%c", uch + '@'); \ #define mkCTL { const char *p = fmtmk("^%c", uch + '@'); \
PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", x, p); to += 2; hicap = 1; } PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", maxSZ, p); to += 2; hicap = 1; }
#define mkUNP { int x = maxSZ; const char *p = fmtmk("<%02X>", uch); \ #define mkUNP { const char *p = fmtmk("<%02X>", uch); \
PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", x, p); to += 4; hicap = 1; } PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", maxSZ, p); to += 4; hicap = 1; }
#else #else
#define mkCTL { if ((to += 2) <= Screen_cols) \ #define mkCTL { if ((to += 2) <= Screen_cols) \
PUTT("%s^%c", (!hicap) ? Curwin->capclr_msg : "", uch + '@'); hicap = 1; } PUTT("%s^%c", (!hicap) ? Curwin->capclr_msg : "", uch + '@'); hicap = 1; }
@ -2653,7 +2672,7 @@ static void insp_mkrow_raw (int col, int row) {
#endif #endif
#define mkSTD { capNO; if (++to <= Screen_cols) { static char _str[2]; \ #define mkSTD { capNO; if (++to <= Screen_cols) { static char _str[2]; \
_str[0] = uch; putp(_str); } } _str[0] = uch; putp(_str); } }
char tline[SCREENMAX]; unsigned char tline[SCREENMAX];
int fr, to, ofs; int fr, to, ofs;
int hicap = 0; int hicap = 0;
@ -2661,7 +2680,7 @@ static void insp_mkrow_raw (int col, int row) {
memcpy(tline, Insp_p[row] + col, sizeof(tline)); memcpy(tline, Insp_p[row] + col, sizeof(tline));
else tline[0] = '\n'; else tline[0] = '\n';
for (fr = 0, to = 0, ofs = 0; to < Screen_cols -1; fr++) { for (fr = 0, to = 0, ofs = 0; to < Screen_cols; fr++) {
if (!ofs) if (!ofs)
ofs = insp_find_ofs(col + fr, row); ofs = insp_find_ofs(col + fr, row);
if (col + fr < ofs) { if (col + fr < ofs) {
@ -2694,20 +2713,20 @@ static void insp_mkrow_raw (int col, int row) {
* characters will then be displayed in two positions like '^A'. * characters will then be displayed in two positions like '^A'.
* ( assuming they can even get past those 'gettext' utilities ) */ * ( assuming they can even get past those 'gettext' utilities ) */
static void insp_mkrow_utf8 (int col, int row) { static void insp_mkrow_utf8 (int col, int row) {
#define maxSZ ( Screen_cols - (to + 1) ) #define maxSZ ( Screen_cols - to )
#define mkFND { PUTT("%s%.*s%s", Curwin->capclr_hdr, maxSZ, Insp_sel->fstr, Caps_off); \ #define mkFND { PUTT("%s%.*s%s", Curwin->capclr_hdr, maxSZ, Insp_sel->fstr, Caps_off); \
fr += Insp_sel->flen; to += Insp_sel->flen; } fr += Insp_sel->flen; to += Insp_sel->flen; }
#ifndef INSP_JUSTNOT #ifndef INSP_JUSTNOT
#define mkCTL { int x = maxSZ; const char *p = fmtmk("^%c", uch + '@'); \ #define mkCTL { const char *p = fmtmk("^%c", uch + '@'); \
PUTT("%s%.*s%s", Curwin->capclr_msg, x, p, Caps_off); to += 2; } PUTT("%s%.*s%s", Curwin->capclr_msg, maxSZ, p, Caps_off); to += 2; }
#else #else
#define mkCTL { if ((to += 2) <= Screen_cols) \ #define mkCTL { if ((to += 2) <= Screen_cols) \
PUTT("%s^%c%s", Curwin->capclr_msg, uch + '@', Caps_off); } PUTT("%s^%c%s", Curwin->capclr_msg, uch + '@', Caps_off); }
#endif #endif
#define mkNUL { buf1[0] = ' '; doPUT(buf1) } #define mkNUL { buf1[0] = ' '; doPUT(buf1) }
#define doPUT(buf) if (++to <= Screen_cols) putp(buf); #define doPUT(buf) if ((to += cno) <= Screen_cols) putp(buf);
static char buf1[2], buf2[3], buf3[4], buf4[5]; static char buf1[2], buf2[3], buf3[4], buf4[5];
char tline[BIGBUFSIZ]; unsigned char tline[BIGBUFSIZ];
int fr, to, ofs; int fr, to, ofs;
col = utf8_proper_col(Insp_p[row], col, 1); col = utf8_proper_col(Insp_p[row], col, 1);
@ -2715,15 +2734,17 @@ static void insp_mkrow_utf8 (int col, int row) {
memcpy(tline, Insp_p[row] + col, sizeof(tline)); memcpy(tline, Insp_p[row] + col, sizeof(tline));
else tline[0] = '\n'; else tline[0] = '\n';
for (fr = 0, to = 0, ofs = 0; to < Screen_cols -1; ) { for (fr = 0, to = 0, ofs = 0; to < Screen_cols; ) {
if (!ofs) if (!ofs)
ofs = insp_find_ofs(col + fr, row); ofs = insp_find_ofs(col + fr, row);
if (col + fr < ofs) { if (col + fr < ofs) {
unsigned char uch = tline[fr++]; unsigned char uch = tline[fr];
switch (UTF8_tab[(int)uch]) { int bno = UTF8_tab[uch];
int cno = utf8_cols(&tline[fr++], bno);
switch (bno) {
case 1: case 1:
if (uch == '\n') break; if (uch == '\n') break;
else if (uch < 32) mkCTL if (uch < 32) mkCTL
else if (uch == 127) mkNUL else if (uch == 127) mkNUL
else { buf1[0] = uch; doPUT(buf1) } else { buf1[0] = uch; doPUT(buf1) }
break; break;

View File

@ -39,6 +39,7 @@
//#define OFF_SCROLLBK /* disable tty emulators scrollback buffer */ //#define OFF_SCROLLBK /* disable tty emulators scrollback buffer */
//#define OFF_STDERROR /* disable our stderr buffering (redirect) */ //#define OFF_STDERROR /* disable our stderr buffering (redirect) */
//#define OFF_STDIOLBF /* disable our own stdout _IOFBF override */ //#define OFF_STDIOLBF /* disable our own stdout _IOFBF override */
//#define OFF_XTRAWIDE /* disable our extra wide multi-byte logic */
//#define PRETENDNOCAP /* use a terminal without essential caps */ //#define PRETENDNOCAP /* use a terminal without essential caps */
//#define QUICK_GRAPHS /* use fast algorithm, accept +2% distort */ //#define QUICK_GRAPHS /* use fast algorithm, accept +2% distort */
//#define RCFILE_NOERR /* rcfile errs silently default, vs. fatal */ //#define RCFILE_NOERR /* rcfile errs silently default, vs. fatal */
@ -553,6 +554,7 @@ typedef struct WIN_t {
//atic void sig_resize (int dont_care_sig); //atic void sig_resize (int dont_care_sig);
/*------ Special UTF-8 Multi-Byte support ------------------------------*/ /*------ Special UTF-8 Multi-Byte support ------------------------------*/
/*atic char UTF8_tab[] = { ... } */ /*atic char UTF8_tab[] = { ... } */
//atic inline int utf8_cols (const unsigned char *p, int n);
//atic int utf8_delta (const char *str); //atic int utf8_delta (const char *str);
//atic int utf8_embody (const char *str, int width); //atic int utf8_embody (const char *str, int width);
//atic const char *utf8_justify (const char *str, int width, int justr); //atic const char *utf8_justify (const char *str, int width, int justr);