library: refactor 'escape' logic for newlib essentials

This new library provides callers with pure strings or
string vectors. It is up to those callers to deal with
potential utf8 multibyte characters and any difference
between strlen and the corresponding printable widths.

So, it makes no sense for the library to go to all the
trouble of invoking those rather expensive 'mbrtowc' &
'wcwidth' functions to ultimately yield total 'cells'.

Thus, this patch will eliminate all the code and parms
that are involved with such possible multibyte issues.

[ Along the way we'll lose the ability to substitute ]
[ '?' for an invalid/unprintable multibyte sequence. ]
[ We will, however, replace ctrl chars with the '?'. ]

[ This presents no problem for that ps program since ]
[ it now duplicates all of the original escape code. ]
[ And, we'll no longer be executing that code twice! ]

[ As for the top program, it takes the position that ]
[ it is wrong to alter kernel supplied data. So with ]
[ potential invalid/unprintable stuff, it'll rely on ]
[ terminal emulators to properly handle such issues! ]

[ Besides, even using a proper multibyte string, not ]
[ all terminals generate the proper printable width. ]
[ This is especially true when it comes to an emoji. ]

[ And should callers choose not to be portable to all ]
[ locales by calling setlocale(LC_ALL, ""), they can ]
[ expect to see lots of "?", regardless of what this ]
[ library fixes in a faulty multibyte string anyway. ]

Signed-off-by: Jim Warner <james.warner@comcast.net>
This commit is contained in:
Jim Warner 2020-12-24 00:00:00 -06:00 committed by Craig Small
parent 9c212a7e77
commit a221b9084a
4 changed files with 24 additions and 158 deletions

View File

@ -17,171 +17,46 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <ctype.h>
#include <langinfo.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h> /* MB_CUR_MAX */
#include <string.h>
#include <wchar.h>
#include <wctype.h>
#include <sys/types.h>
#include "escape.h"
#include "readproc.h"
#define SECURE_ESCAPE_ARGS(dst, bytes, cells) do { \
#define SECURE_ESCAPE_ARGS(dst, bytes) do { \
if ((bytes) <= 0) return 0; \
*(dst) = '\0'; \
if ((bytes) >= INT_MAX) return 0; \
if ((cells) >= INT_MAX) return 0; \
if ((cells) <= 0) return 0; \
} while (0)
static int escape_str_utf8 (char *dst, const char *src, int bufsize, int *maxcells) {
int my_cells = 0;
int my_bytes = 0;
mbstate_t s;
int escape_str (unsigned char *dst, const unsigned char *src, int bufsize) {
int i, n;
SECURE_ESCAPE_ARGS(dst, bufsize, *maxcells);
SECURE_ESCAPE_ARGS(dst, bufsize);
memset(&s, 0, sizeof (s));
for(;;) {
wchar_t wc;
int len = 0;
if(my_cells >= *maxcells || my_bytes+1 >= bufsize)
break;
if (!(len = mbrtowc (&wc, src, MB_CUR_MAX, &s)))
/* 'str' contains \0 */
break;
if (len < 0) {
/* invalid multibyte sequence -- zeroize state */
memset (&s, 0, sizeof (s));
*(dst++) = '?';
src++;
my_cells++;
my_bytes++;
} else if (len==1) {
/* non-multibyte */
*(dst++) = isprint(*src) ? *src : '?';
src++;
my_cells++;
my_bytes++;
} else if (!iswprint(wc)) {
/* multibyte - no printable */
*(dst++) = '?';
src+=len;
my_cells++;
my_bytes++;
} else {
/* multibyte - maybe, kinda "printable" */
int wlen = wcwidth(wc);
// Got space?
if (wlen > *maxcells-my_cells || len >= bufsize-(my_bytes+1)) break;
// safe multibyte
memcpy(dst, src, len);
dst += len;
src += len;
my_bytes += len;
if (wlen > 0) my_cells += wlen;
}
//fprintf(stdout, "cells: %d\n", my_cells);
}
n = snprintf(dst, bufsize, "%s", src);
if (n < 0) {
*dst = '\0';
return 0;
}
if (n >= bufsize) n = bufsize-1;
// fprintf(stderr, "maxcells: %d, my_cells; %d\n", *maxcells, my_cells);
// control chars, especially tabs, create alignment problems for ps & top ...
for (i = 0; i < n; i++)
if (dst[i] < 0x20 || dst[i] == 0x7f)
dst[i] = '?';
*maxcells -= my_cells;
return my_bytes; // bytes of text, excluding the NUL
return n;
}
/* sanitize a string via one-way mangle */
int escape_str (char *dst, const char *src, int bufsize, int *maxcells) {
unsigned char c;
int my_cells = 0;
int my_bytes = 0;
const char codes[] =
"Z..............................."
"||||||||||||||||||||||||||||||||"
"||||||||||||||||||||||||||||||||"
"|||||||||||||||||||||||||||||||."
"????????????????????????????????"
"????????????????????????????????"
"????????????????????????????????"
"????????????????????????????????";
static int utf_init=0;
if(utf_init==0){
/* first call -- check if UTF stuff is usable */
char *enc = nl_langinfo(CODESET);
utf_init = enc && strcasecmp(enc, "UTF-8")==0 ? 1 : -1;
}
if (utf_init==1 && MB_CUR_MAX>1) {
/* UTF8 locales */
return escape_str_utf8(dst, src, bufsize, maxcells);
}
SECURE_ESCAPE_ARGS(dst, bufsize, *maxcells);
if(bufsize > *maxcells+1) bufsize=*maxcells+1; // FIXME: assumes 8-bit locale
for(;;){
if(my_cells >= *maxcells || my_bytes+1 >= bufsize)
break;
c = (unsigned char) *(src++);
if(!c) break;
if(codes[c]!='|') c=codes[c];
my_cells++;
my_bytes++;
*(dst++) = c;
}
*dst = '\0';
*maxcells -= my_cells;
return my_bytes; // bytes of text, excluding the NUL
}
/////////////////////////////////////////////////
// escape an argv or environment string array
//
// bytes arg means sizeof(buf)
static int escape_strlist (char *dst, const char **src, size_t bytes, int *cells) {
size_t i = 0;
for(;;){
i += escape_str(dst+i, *src, bytes-i, cells);
if(bytes-i < 3) break; // need room for space, a character, and the NUL
src++;
if(!*src) break; // need something to print
if (*cells<=1) break; // need room for printed size of text
dst[i++] = ' ';
--*cells;
}
return i; // bytes, excluding the NUL
}
///////////////////////////////////////////////////
int escape_command (char *const outbuf, const proc_t *pp, int bytes, int *cells, unsigned flags) {
int escape_command (unsigned char *outbuf, const proc_t *pp, int bytes, unsigned flags) {
int overhead = 0;
int end = 0;
if(flags & ESC_ARGS){
const char **lc = (const char**)pp->cmdline;
if(lc && *lc) return escape_strlist(outbuf, lc, bytes, cells);
}
if(flags & ESC_BRACKETS){
overhead += 2;
}
@ -189,16 +64,14 @@ int escape_command (char *const outbuf, const proc_t *pp, int bytes, int *cells,
if(pp->state=='Z') overhead += 10; // chars in " <defunct>"
else flags &= ~ESC_DEFUNCT;
}
if(overhead + 1 >= *cells || // if no room for even one byte of the command name
overhead + 1 >= bytes){
if(overhead + 1 >= bytes){ // if no room for even one byte of the command name
outbuf[0] = '\0';
return 0;
}
if(flags & ESC_BRACKETS){
outbuf[end++] = '[';
}
*cells -= overhead;
end += escape_str(outbuf+end, pp->cmd, bytes-overhead, cells);
end += escape_str(outbuf+end, pp->cmd, bytes-overhead);
// Hmmm, do we want "[foo] <defunct>" or "[foo <defunct>]"?
if(flags & ESC_BRACKETS){

View File

@ -3,12 +3,11 @@
#include "readproc.h"
#define ESC_ARGS 0x1 // try to use cmdline instead of cmd
#define ESC_BRACKETS 0x2 // if using cmd, put '[' and ']' around it
#define ESC_DEFUNCT 0x4 // mark zombies with " <defunct>"
int escape_command (char *outbuf, const proc_t *pp, int bytes, int *cells, unsigned flags);
int escape_command (unsigned char *outbuf, const proc_t *pp, int bytes, unsigned flags);
int escape_str (char *dst, const char *src, int bufsize, int *maxcells);
int escape_str (unsigned char *dst, const unsigned char *src, int bufsize);
#endif

View File

@ -825,7 +825,7 @@ static int vectorize_dash_rc (char*** vec) {
static int fill_cgroup_cvt (const char* directory, proc_t *restrict p) {
#define vMAX ( MAX_BUFSZ - (int)(dst - dst_buffer) )
char *src, *dst, *grp, *eob, *name;
int tot, x, whackable_int = MAX_BUFSZ, len;
int tot, x, len;
*(dst = dst_buffer) = '\0'; // empty destination
tot = read_unvectored(src_buffer, MAX_BUFSZ, directory, "cgroup", '\0');
@ -841,7 +841,7 @@ static int fill_cgroup_cvt (const char* directory, proc_t *restrict p) {
len = snprintf(dst, vMAX, "%s", (dst > dst_buffer) ? "," : "");
if (len < 0 || len >= vMAX) break;
dst += len;
dst += escape_str(dst, grp, vMAX, &whackable_int);
dst += escape_str(dst, grp, vMAX);
}
if (!(p->cgroup = strdup(dst_buffer[0] ? dst_buffer : "-")))
return 1;
@ -859,12 +859,10 @@ static int fill_cgroup_cvt (const char* directory, proc_t *restrict p) {
// valid proc_t.cmdline pointer.
static int fill_cmdline_cvt (const char* directory, proc_t *restrict p) {
#define uFLG ( ESC_BRACKETS | ESC_DEFUNCT )
int whackable_int = MAX_BUFSZ;
if (read_unvectored(src_buffer, MAX_BUFSZ, directory, "cmdline", ' '))
escape_str(dst_buffer, src_buffer, MAX_BUFSZ, &whackable_int);
escape_str(dst_buffer, src_buffer, MAX_BUFSZ);
else
escape_command(dst_buffer, p, MAX_BUFSZ, &whackable_int, uFLG);
escape_command(dst_buffer, p, MAX_BUFSZ, uFLG);
p->cmdline = strdup(dst_buffer[0] ? dst_buffer : "?");
if (!p->cmdline)
return 1;
@ -876,11 +874,9 @@ static int fill_cmdline_cvt (const char* directory, proc_t *restrict p) {
// This routine reads an 'environ' for the designated proc_t and
// guarantees the caller a valid proc_t.environ pointer.
static int fill_environ_cvt (const char* directory, proc_t *restrict p) {
int whackable_int = MAX_BUFSZ;
dst_buffer[0] = '\0';
if (read_unvectored(src_buffer, MAX_BUFSZ, directory, "environ", ' '))
escape_str(dst_buffer, src_buffer, MAX_BUFSZ, &whackable_int);
escape_str(dst_buffer, src_buffer, MAX_BUFSZ);
p->environ = strdup(dst_buffer[0] ? dst_buffer : "-");
if (!p->environ)
return 1;

View File

@ -215,8 +215,6 @@ typedef struct PROCTAB {
#define PROC_FILL_LUID 0x400000 // fill in proc_t luid (login user id)
#define PROC_FILL_EXE 0x200000 // fill in proc_t exe path + pgm name
#define PROC_LOOSE_TASKS 0x2000 // treat threads as if they were processes
// consider only processes with one of the passed:
#define PROC_PID 0x1000 // process id numbers ( 0 terminated)
#define PROC_UID 0x4000 // user id numbers ( length needed )