libbb/lineedit: add support for preserving "broken" (non-unicode) chars

Signed-off-by: Tomas Heinrich <heinrich.tomas@gmail.com>
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
Tomas Heinrich 2010-04-29 13:43:39 +02:00 committed by Denys Vlasenko
parent 25b10d97e6
commit a659b81dfa
4 changed files with 89 additions and 20 deletions

View File

@ -223,6 +223,17 @@ config UNICODE_NEUTRAL_TABLE
With this option on, more extensive (and bigger) table With this option on, more extensive (and bigger) table
of neutral chars will be used. of neutral chars will be used.
config UNICODE_PRESERVE_BROKEN
bool "Make it possible to enter sequences of chars which are not Unicode"
default n
depends on UNICODE_SUPPORT
help
With this option on, invalid UTF-8 bytes are not substituted
with the selected substitution character.
For example, this means that entering 'l', 's', ' ', 0xff, [Enter]
at shell prompt will list file named 0xff (single char name
with char value 255), not file named '?'.
config LONG_OPTS config LONG_OPTS
bool "Support for --long-options" bool "Support for --long-options"
default y default y

View File

@ -68,7 +68,7 @@
#undef CHAR_T #undef CHAR_T
#if ENABLE_UNICODE_SUPPORT #if ENABLE_UNICODE_SUPPORT
# define BB_NUL L'\0' # define BB_NUL ((wchar_t)0)
# define CHAR_T wchar_t # define CHAR_T wchar_t
/* isspace() wrapper for CHAR_T: reports true only when c, viewed as unsigned,
 * is below 256 and isspace(c) is nonzero; any larger value yields false
 * without isspace() being called. */
static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); } static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); }
# if ENABLE_FEATURE_EDITING_VI # if ENABLE_FEATURE_EDITING_VI
@ -92,6 +92,14 @@ static bool BB_ispunct(CHAR_T c) { return ((unsigned)c < 256 && ispunct(c)); }
#endif #endif
/*
 * Tagging of "invalid" wide chars.
 * With ENABLE_UNICODE_PRESERVE_BROKEN on:
 *   unicode_mark_inv_wchar(wc) yields wc with bit 0x20000000 set,
 *   unicode_is_inv_wchar(wc) is nonzero when that bit is set in wc.
 * With it off, unicode_is_inv_wchar() is the constant 0 and
 * unicode_mark_inv_wchar() is not defined at all.
 */
# if ENABLE_UNICODE_PRESERVE_BROKEN
# define unicode_mark_inv_wchar(wc) ((wc) | 0x20000000)
# define unicode_is_inv_wchar(wc) ((wc) & 0x20000000)
# else
# define unicode_is_inv_wchar(wc) 0
# endif
enum { enum {
/* We use int16_t for positions, need to limit line len */ /* We use int16_t for positions, need to limit line len */
MAX_LINELEN = CONFIG_FEATURE_EDITING_MAX_LEN < 0x7ff0 MAX_LINELEN = CONFIG_FEATURE_EDITING_MAX_LEN < 0x7ff0
@ -208,24 +216,58 @@ static size_t load_string(const char *src, int maxsize)
ssize_t len = mbstowcs(command_ps, src, maxsize - 1); ssize_t len = mbstowcs(command_ps, src, maxsize - 1);
if (len < 0) if (len < 0)
len = 0; len = 0;
command_ps[len] = L'\0'; command_ps[len] = 0;
return len; return len;
} }
/*
 * Convert the wide-char command_ps buffer to a multibyte string in dst.
 * dst:     output buffer
 * maxsize: size of dst; at most maxsize - 1 bytes are produced before
 *          the terminating NUL
 * Returns the number of bytes stored in dst, not counting the NUL.
 *
 * Without ENABLE_UNICODE_PRESERVE_BROKEN this is one wcstombs() call,
 * with a negative result treated as an empty string.
 * With it, wchars tagged by unicode_is_inv_wchar() are written out as a
 * single raw byte each, and the untagged runs between them are converted
 * with wcstombs().
 */
static size_t save_string(char *dst, int maxsize) static unsigned save_string(char *dst, unsigned maxsize)
{ {
#if !ENABLE_UNICODE_PRESERVE_BROKEN
ssize_t len = wcstombs(dst, command_ps, maxsize - 1); ssize_t len = wcstombs(dst, command_ps, maxsize - 1);
if (len < 0) if (len < 0)
len = 0; len = 0;
dst[len] = '\0'; dst[len] = '\0';
return len; return len;
#else
unsigned dstpos = 0;
unsigned srcpos = 0;
/* Reserve one byte of dst for the terminating NUL */
maxsize--;
while (dstpos < maxsize) {
wchar_t wc;
/* n first remembers where the current run starts in command_ps */
int n = srcpos;
/* Advance srcpos to the end of the string or to the next tagged wchar */
while ((wc = command_ps[srcpos]) != 0
&& !unicode_is_inv_wchar(wc)
) {
srcpos++;
}
/* Temporarily terminate the run so wcstombs() stops here */
command_ps[srcpos] = 0;
/* n is now reused for the byte count; wcstombs() signals failure with
 * (size_t)-1, which is seen as a negative value once stored in the int */
n = wcstombs(dst + dstpos, command_ps + n, maxsize - dstpos);
/* NOTE(review): this early exit skips the restore below, so a tagged
 * wchar at command_ps[srcpos] stays overwritten with 0 */
if (n < 0) /* should not happen */
break;
dstpos += n;
if (wc == 0) /* usually is */
break;
/* We do have invalid byte here! */
command_ps[srcpos] = wc; /* restore it */
srcpos++;
/* No room left for the raw byte */
if (dstpos == maxsize)
break;
/* Relies on the cast to char discarding the tag bit, leaving the low byte */
dst[dstpos++] = (char) wc;
}
dst[dstpos] = '\0';
return dstpos;
#endif
} }
/* I thought just fputwc(c, stdout) would work. But no... */ /* I thought just fputwc(c, stdout) would work. But no... */
static void BB_PUTCHAR(wchar_t c) static void BB_PUTCHAR(wchar_t c)
{ {
char buf[MB_CUR_MAX + 1]; char buf[MB_CUR_MAX + 1];
mbstate_t mbst = { 0 }; mbstate_t mbst = { 0 };
ssize_t len = wcrtomb(buf, c, &mbst); ssize_t len;
if (unicode_is_inv_wchar(c))
c = CONFIG_SUBST_WCHAR;
len = wcrtomb(buf, c, &mbst);
if (len > 0) { if (len > 0) {
buf[len] = '\0'; buf[len] = '\0';
fputs(buf, stdout); fputs(buf, stdout);
@ -238,7 +280,7 @@ static size_t load_string(const char *src, int maxsize)
return strlen(command_ps); return strlen(command_ps);
} }
# if ENABLE_FEATURE_TAB_COMPLETION # if ENABLE_FEATURE_TAB_COMPLETION
/* Variant of save_string() for the branch where command_ps is handled as a
 * plain char string (the neighbouring code calls strlen() on it): copies
 * command_ps into dst through safe_strncpy(), passing maxsize as the bound.
 * Compiled only under ENABLE_FEATURE_TAB_COMPLETION. */
static void save_string(char *dst, int maxsize) static void save_string(char *dst, unsigned maxsize)
{
safe_strncpy(dst, command_ps, maxsize);
}
@ -1719,13 +1761,11 @@ static int lineedit_read_key(char *read_key_buffer)
pushback: pushback:
/* Invalid sequence. Save all "bad bytes" except first */ /* Invalid sequence. Save all "bad bytes" except first */
read_key_ungets(read_key_buffer, unicode_buf + 1, unicode_idx - 1); read_key_ungets(read_key_buffer, unicode_buf + 1, unicode_idx - 1);
/* # if !ENABLE_UNICODE_PRESERVE_BROKEN
* ic = unicode_buf[0] sounds even better, but currently
* this does not work: wchar_t[] -> char[] conversion
* when lineedit finishes mangles such "raw bytes"
* (by misinterpreting them as unicode chars):
*/
ic = CONFIG_SUBST_WCHAR; ic = CONFIG_SUBST_WCHAR;
# else
ic = unicode_mark_inv_wchar(unicode_buf[0]);
# endif
} else { } else {
/* Valid unicode char, return its code */ /* Valid unicode char, return its code */
ic = wc; ic = wc;

View File

@ -423,7 +423,6 @@ static int wcwidth(unsigned ucs)
# if LAST_SUPPORTED_WCHAR >= 0x300 # if LAST_SUPPORTED_WCHAR >= 0x300
/* sorted list of non-overlapping intervals of non-spacing characters */ /* sorted list of non-overlapping intervals of non-spacing characters */
/* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
static const struct interval combining[] = {
# define BIG_(a,b) { a, b }, # define BIG_(a,b) { a, b },
# define PAIR(a,b) # define PAIR(a,b)
# define ARRAY /* PAIR if < 0x4000 and no more than 4 chars big */ \ # define ARRAY /* PAIR if < 0x4000 and no more than 4 chars big */ \
@ -557,10 +556,9 @@ static int wcwidth(unsigned ucs)
BIG_(0xFE20, 0xFE23) \ BIG_(0xFE20, 0xFE23) \
BIG_(0xFEFF, 0xFEFF) \ BIG_(0xFEFF, 0xFEFF) \
BIG_(0xFFF9, 0xFFFB) BIG_(0xFFF9, 0xFFFB)
ARRAY static const struct interval combining[] = { ARRAY };
# undef BIG_ # undef BIG_
# undef PAIR # undef PAIR
};
# define BIG_(a,b) # define BIG_(a,b)
# define PAIR(a,b) (a << 2) | (b-a), # define PAIR(a,b) (a << 2) | (b-a),
static const uint16_t combining1[] = { ARRAY }; static const uint16_t combining1[] = { ARRAY };
@ -668,7 +666,6 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc)
* http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt * http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt
* Bidi_Class=Left_To_Right | Bidi_Class=Arabic_Letter * Bidi_Class=Left_To_Right | Bidi_Class=Arabic_Letter
*/ */
static const struct interval rtl_b[] = {
# define BIG_(a,b) { a, b }, # define BIG_(a,b) { a, b },
# define PAIR(a,b) # define PAIR(a,b)
# define ARRAY \ # define ARRAY \
@ -723,10 +720,9 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc)
{0x10E7F, 0x10FFF}, {0x10E7F, 0x10FFF},
{0x1E800, 0x1EFFF} {0x1E800, 0x1EFFF}
*/ */
ARRAY static const struct interval rtl_b[] = { ARRAY };
# undef BIG_ # undef BIG_
# undef PAIR # undef PAIR
};
# define BIG_(a,b) # define BIG_(a,b)
# define PAIR(a,b) (a << 2) | (b-a), # define PAIR(a,b) (a << 2) | (b-a),
static const uint16_t rtl_p[] = { ARRAY }; static const uint16_t rtl_p[] = { ARRAY };
@ -755,7 +751,6 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc)
* White_Space, Other_Neutral, European_Number, European_Separator, * White_Space, Other_Neutral, European_Number, European_Separator,
* European_Terminator, Arabic_Number, Common_Separator * European_Terminator, Arabic_Number, Common_Separator
*/ */
static const struct interval neutral_b[] = {
# define BIG_(a,b) { a, b }, # define BIG_(a,b) { a, b },
# define PAIR(a,b) # define PAIR(a,b)
# define ARRAY \ # define ARRAY \
@ -929,10 +924,9 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc)
{0x1F030, 0x1F093}, {0x1F030, 0x1F093},
{0x1F100, 0x1F10A} {0x1F100, 0x1F10A}
*/ */
ARRAY static const struct interval neutral_b[] = { ARRAY };
# undef BIG_ # undef BIG_
# undef PAIR # undef PAIR
};
# define BIG_(a,b) # define BIG_(a,b)
# define PAIR(a,b) (a << 2) | (b-a), # define PAIR(a,b) (a << 2) | (b-a),
static const uint16_t neutral_p[] = { ARRAY }; static const uint16_t neutral_p[] = { ARRAY };

View File

@ -7,8 +7,30 @@
. ./testing.sh . ./testing.sh
test -f "$bindir/.config" && . "$bindir/.config"
# testing "test name" "options" "expected result" "file input" "stdin" # testing "test name" "options" "expected result" "file input" "stdin"
if test x"$CONFIG_UNICODE_PRESERVE_BROKEN" = x"y"; then
testing "One byte which is not valid unicode char followed by valid input" \
"script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
"\
00000000 ff 2d 0a |.-.|
00000003
" \
"" \
"echo \xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
testing "30 bytes which are not valid unicode chars followed by valid input" \
"script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
"\
00000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
00000010 ff ff ff ff ff ff ff ff ff ff ff ff ff ff 2d 0a |..............-.|
00000020
" \
"" \
"echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
else
testing "One byte which is not valid unicode char followed by valid input" \ testing "One byte which is not valid unicode char followed by valid input" \
"script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \ "script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
"\ "\
@ -27,6 +49,8 @@ testing "30 bytes which are not valid unicode chars followed by valid input" \
" \ " \
"" \ "" \
"echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n" "echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
fi
# Not sure this behavior is perfect: we lose all invalid input which precedes # Not sure this behavior is perfect: we lose all invalid input which precedes
# arrow keys and such. In this example, \xff\xff are lost # arrow keys and such. In this example, \xff\xff are lost