From a659b81dfa435aa19130a8c7dd1bfe8fa9a22131 Mon Sep 17 00:00:00 2001 From: Tomas Heinrich Date: Thu, 29 Apr 2010 13:43:39 +0200 Subject: [PATCH] libbb/lineedit: add support for preserving "broken" (non-unicode) chars Signed-off-by: Tomas Heinrich Signed-off-by: Denys Vlasenko --- Config.in | 11 ++++++++ libbb/lineedit.c | 62 +++++++++++++++++++++++++++++++++++++-------- libbb/unicode.c | 12 +++------ testsuite/ash.tests | 24 ++++++++++++++++++ 4 files changed, 89 insertions(+), 20 deletions(-) diff --git a/Config.in b/Config.in index 40af9115d..a5d20038a 100644 --- a/Config.in +++ b/Config.in @@ -223,6 +223,17 @@ config UNICODE_NEUTRAL_TABLE With this option on, more extensive (and bigger) table of neutral chars will be used. +config UNICODE_PRESERVE_BROKEN + bool "Make it possible to enter sequences of chars which are not Unicode" + default n + depends on UNICODE_SUPPORT + help + With this option on, invalid UTF-8 bytes are not substituted + with the selected substitution character. + For example, this means that entering 'l', 's', ' ', 0xff, [Enter] + at shell prompt will list file named 0xff (single char name + with char value 255), not file named '?'. + config LONG_OPTS bool "Support for --long-options" default y diff --git a/libbb/lineedit.c b/libbb/lineedit.c index dc90846f9..622f9ddfc 100644 --- a/libbb/lineedit.c +++ b/libbb/lineedit.c @@ -68,7 +68,7 @@ #undef CHAR_T #if ENABLE_UNICODE_SUPPORT -# define BB_NUL L'\0' +# define BB_NUL ((wchar_t)0) # define CHAR_T wchar_t static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); } # if ENABLE_FEATURE_EDITING_VI @@ -92,6 +92,14 @@ static bool BB_ispunct(CHAR_T c) { return ((unsigned)c < 256 && ispunct(c)); } #endif +# if ENABLE_UNICODE_PRESERVE_BROKEN +# define unicode_mark_inv_wchar(wc) ((wc) | 0x20000000) +# define unicode_is_inv_wchar(wc) ((wc) & 0x20000000) +# else +# define unicode_is_inv_wchar(wc) 0 +# endif + + enum { /* We use int16_t for positions, need to limit line len */ MAX_LINELEN = CONFIG_FEATURE_EDITING_MAX_LEN < 0x7ff0 @@ -208,24 +216,58 @@ static size_t load_string(const char *src, int maxsize) ssize_t len = mbstowcs(command_ps, src, maxsize - 1); if (len < 0) len = 0; - command_ps[len] = L'\0'; + command_ps[len] = 0; return len; } -static size_t save_string(char *dst, int maxsize) +static unsigned save_string(char *dst, unsigned maxsize) { +#if !ENABLE_UNICODE_PRESERVE_BROKEN ssize_t len = wcstombs(dst, command_ps, maxsize - 1); if (len < 0) len = 0; dst[len] = '\0'; return len; +#else + unsigned dstpos = 0; + unsigned srcpos = 0; + + maxsize--; + while (dstpos < maxsize) { + wchar_t wc; + int n = srcpos; + while ((wc = command_ps[srcpos]) != 0 + && !unicode_is_inv_wchar(wc) + ) { + srcpos++; + } + command_ps[srcpos] = 0; + n = wcstombs(dst + dstpos, command_ps + n, maxsize - dstpos); + if (n < 0) /* should not happen */ + break; + dstpos += n; + if (wc == 0) /* usually is */ + break; + /* We do have invalid byte here! */ + command_ps[srcpos] = wc; /* restore it */ + srcpos++; + if (dstpos == maxsize) + break; + dst[dstpos++] = (char) wc; + } + dst[dstpos] = '\0'; + return dstpos; +#endif } /* I thought just fputwc(c, stdout) would work. But no... */ static void BB_PUTCHAR(wchar_t c) { char buf[MB_CUR_MAX + 1]; mbstate_t mbst = { 0 }; - ssize_t len = wcrtomb(buf, c, &mbst); + ssize_t len; + if (unicode_is_inv_wchar(c)) + c = CONFIG_SUBST_WCHAR; + len = wcrtomb(buf, c, &mbst); if (len > 0) { buf[len] = '\0'; fputs(buf, stdout); @@ -238,7 +280,7 @@ static size_t load_string(const char *src, int maxsize) return strlen(command_ps); } # if ENABLE_FEATURE_TAB_COMPLETION -static void save_string(char *dst, int maxsize) +static void save_string(char *dst, unsigned maxsize) { safe_strncpy(dst, command_ps, maxsize); } @@ -1719,13 +1761,11 @@ static int lineedit_read_key(char *read_key_buffer) pushback: /* Invalid sequence. Save all "bad bytes" except first */ read_key_ungets(read_key_buffer, unicode_buf + 1, unicode_idx - 1); - /* - * ic = unicode_buf[0] sounds even better, but currently - * this does not work: wchar_t[] -> char[] conversion - * when lineedit finishes mangles such "raw bytes" - * (by misinterpreting them as unicode chars): - */ +# if !ENABLE_UNICODE_PRESERVE_BROKEN ic = CONFIG_SUBST_WCHAR; +# else + ic = unicode_mark_inv_wchar(unicode_buf[0]); +# endif } else { /* Valid unicode char, return its code */ ic = wc; diff --git a/libbb/unicode.c b/libbb/unicode.c index 83e70b412..d1c6167c7 100644 --- a/libbb/unicode.c +++ b/libbb/unicode.c @@ -423,7 +423,6 @@ static int wcwidth(unsigned ucs) # if LAST_SUPPORTED_WCHAR >= 0x300 /* sorted list of non-overlapping intervals of non-spacing characters */ /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ - static const struct interval combining[] = { # define BIG_(a,b) { a, b }, # define PAIR(a,b) # define ARRAY /* PAIR if < 0x4000 and no more than 4 chars big */ \ @@ -557,10 +556,9 @@ static int wcwidth(unsigned ucs) BIG_(0xFE20, 0xFE23) \ BIG_(0xFEFF, 0xFEFF) \ BIG_(0xFFF9, 0xFFFB) - ARRAY + static const struct interval combining[] = { ARRAY }; # undef BIG_ # undef PAIR - }; # define BIG_(a,b) # define PAIR(a,b) (a << 2) | (b-a), static const uint16_t combining1[] = { ARRAY }; @@ -668,7 +666,6 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc) * http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt * Bidi_Class=Left_To_Right | Bidi_Class=Arabic_Letter */ - static const struct interval rtl_b[] = { # define BIG_(a,b) { a, b }, # define PAIR(a,b) # define ARRAY \ @@ -723,10 +720,9 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc) {0x10E7F, 0x10FFF}, {0x1E800, 0x1EFFF} */ - ARRAY + static const struct interval rtl_b[] = { ARRAY }; # undef BIG_ # undef PAIR - }; # define BIG_(a,b) # define PAIR(a,b) (a << 2) | (b-a), static const uint16_t rtl_p[] = { ARRAY }; @@ -755,7 +751,6 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc) * White_Space, Other_Neutral, European_Number, European_Separator, * European_Terminator, Arabic_Number, Common_Separator */ - static const struct interval neutral_b[] = { # define BIG_(a,b) { a, b }, # define PAIR(a,b) # define ARRAY \ @@ -929,10 +924,9 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc) {0x1F030, 0x1F093}, {0x1F100, 0x1F10A} */ - ARRAY + static const struct interval neutral_b[] = { ARRAY }; # undef BIG_ # undef PAIR - }; # define BIG_(a,b) # define PAIR(a,b) (a << 2) | (b-a), static const uint16_t neutral_p[] = { ARRAY }; diff --git a/testsuite/ash.tests b/testsuite/ash.tests index 6b2caf316..ce585beb1 100755 --- a/testsuite/ash.tests +++ b/testsuite/ash.tests @@ -7,8 +7,30 @@ . ./testing.sh +test -f "$bindir/.config" && . "$bindir/.config" + # testing "test name" "options" "expected result" "file input" "stdin" +if test x"$CONFIG_UNICODE_PRESERVE_BROKEN" = x"y"; then +testing "One byte which is not valid unicode char followed by valid input" \ + "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \ + "\ +00000000 ff 2d 0a |.-.| +00000003 +" \ + "" \ + "echo \xff- | hexdump -C >ash.output; exit; exit; exit; exit\n" + +testing "30 bytes which are not valid unicode chars followed by valid input" \ + "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \ + "\ +00000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................| +00000010 ff ff ff ff ff ff ff ff ff ff ff ff ff ff 2d 0a |..............-.| +00000020 +" \ + "" \ + "echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n" +else testing "One byte which is not valid unicode char followed by valid input" \ "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \ "\ @@ -27,6 +49,8 @@ testing "30 bytes which are not valid unicode chars followed by valid input" \ " \ "" \ "echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n" +fi + # Not sure this behavior is perfect: we lose all invalid input which precedes # arrow keys and such. In this example, \xff\xff are lost