libbb/lineedit: add support for preserving "broken" (non-unicode) chars
Signed-off-by: Tomas Heinrich <heinrich.tomas@gmail.com> Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
parent
25b10d97e6
commit
a659b81dfa
11
Config.in
11
Config.in
@ -223,6 +223,17 @@ config UNICODE_NEUTRAL_TABLE
|
|||||||
With this option on, more extensive (and bigger) table
|
With this option on, more extensive (and bigger) table
|
||||||
of neutral chars will be used.
|
of neutral chars will be used.
|
||||||
|
|
||||||
|
config UNICODE_PRESERVE_BROKEN
|
||||||
|
bool "Make it possible to enter sequences of chars which are not Unicode"
|
||||||
|
default n
|
||||||
|
depends on UNICODE_SUPPORT
|
||||||
|
help
|
||||||
|
With this option on, invalid UTF-8 bytes are not substituted
|
||||||
|
with the selected substitution character.
|
||||||
|
For example, this means that entering 'l', 's', ' ', 0xff, [Enter]
|
||||||
|
at the shell prompt will list a file named 0xff (single-char name
|
||||||
|
with char value 255), not a file named '?'.
|
||||||
|
|
||||||
config LONG_OPTS
|
config LONG_OPTS
|
||||||
bool "Support for --long-options"
|
bool "Support for --long-options"
|
||||||
default y
|
default y
|
||||||
|
@ -68,7 +68,7 @@
|
|||||||
|
|
||||||
#undef CHAR_T
|
#undef CHAR_T
|
||||||
#if ENABLE_UNICODE_SUPPORT
|
#if ENABLE_UNICODE_SUPPORT
|
||||||
# define BB_NUL L'\0'
|
# define BB_NUL ((wchar_t)0)
|
||||||
# define CHAR_T wchar_t
|
# define CHAR_T wchar_t
|
||||||
static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); }
|
static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); }
|
||||||
# if ENABLE_FEATURE_EDITING_VI
|
# if ENABLE_FEATURE_EDITING_VI
|
||||||
@ -92,6 +92,14 @@ static bool BB_ispunct(CHAR_T c) { return ((unsigned)c < 256 && ispunct(c)); }
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
# if ENABLE_UNICODE_PRESERVE_BROKEN
|
||||||
|
# define unicode_mark_inv_wchar(wc) ((wc) | 0x20000000)
|
||||||
|
# define unicode_is_inv_wchar(wc) ((wc) & 0x20000000)
|
||||||
|
# else
|
||||||
|
# define unicode_is_inv_wchar(wc) 0
|
||||||
|
# endif
|
||||||
|
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
/* We use int16_t for positions, need to limit line len */
|
/* We use int16_t for positions, need to limit line len */
|
||||||
MAX_LINELEN = CONFIG_FEATURE_EDITING_MAX_LEN < 0x7ff0
|
MAX_LINELEN = CONFIG_FEATURE_EDITING_MAX_LEN < 0x7ff0
|
||||||
@ -208,24 +216,58 @@ static size_t load_string(const char *src, int maxsize)
|
|||||||
ssize_t len = mbstowcs(command_ps, src, maxsize - 1);
|
ssize_t len = mbstowcs(command_ps, src, maxsize - 1);
|
||||||
if (len < 0)
|
if (len < 0)
|
||||||
len = 0;
|
len = 0;
|
||||||
command_ps[len] = L'\0';
|
command_ps[len] = 0;
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
static size_t save_string(char *dst, int maxsize)
|
static unsigned save_string(char *dst, unsigned maxsize)
|
||||||
{
|
{
|
||||||
|
#if !ENABLE_UNICODE_PRESERVE_BROKEN
|
||||||
ssize_t len = wcstombs(dst, command_ps, maxsize - 1);
|
ssize_t len = wcstombs(dst, command_ps, maxsize - 1);
|
||||||
if (len < 0)
|
if (len < 0)
|
||||||
len = 0;
|
len = 0;
|
||||||
dst[len] = '\0';
|
dst[len] = '\0';
|
||||||
return len;
|
return len;
|
||||||
|
#else
|
||||||
|
unsigned dstpos = 0;
|
||||||
|
unsigned srcpos = 0;
|
||||||
|
|
||||||
|
maxsize--;
|
||||||
|
while (dstpos < maxsize) {
|
||||||
|
wchar_t wc;
|
||||||
|
int n = srcpos;
|
||||||
|
while ((wc = command_ps[srcpos]) != 0
|
||||||
|
&& !unicode_is_inv_wchar(wc)
|
||||||
|
) {
|
||||||
|
srcpos++;
|
||||||
|
}
|
||||||
|
command_ps[srcpos] = 0;
|
||||||
|
n = wcstombs(dst + dstpos, command_ps + n, maxsize - dstpos);
|
||||||
|
if (n < 0) /* should not happen */
|
||||||
|
break;
|
||||||
|
dstpos += n;
|
||||||
|
if (wc == 0) /* usually is */
|
||||||
|
break;
|
||||||
|
/* We do have invalid byte here! */
|
||||||
|
command_ps[srcpos] = wc; /* restore it */
|
||||||
|
srcpos++;
|
||||||
|
if (dstpos == maxsize)
|
||||||
|
break;
|
||||||
|
dst[dstpos++] = (char) wc;
|
||||||
|
}
|
||||||
|
dst[dstpos] = '\0';
|
||||||
|
return dstpos;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
/* I thought just fputwc(c, stdout) would work. But no... */
|
/* I thought just fputwc(c, stdout) would work. But no... */
|
||||||
static void BB_PUTCHAR(wchar_t c)
|
static void BB_PUTCHAR(wchar_t c)
|
||||||
{
|
{
|
||||||
char buf[MB_CUR_MAX + 1];
|
char buf[MB_CUR_MAX + 1];
|
||||||
mbstate_t mbst = { 0 };
|
mbstate_t mbst = { 0 };
|
||||||
ssize_t len = wcrtomb(buf, c, &mbst);
|
ssize_t len;
|
||||||
|
|
||||||
|
if (unicode_is_inv_wchar(c))
|
||||||
|
c = CONFIG_SUBST_WCHAR;
|
||||||
|
len = wcrtomb(buf, c, &mbst);
|
||||||
if (len > 0) {
|
if (len > 0) {
|
||||||
buf[len] = '\0';
|
buf[len] = '\0';
|
||||||
fputs(buf, stdout);
|
fputs(buf, stdout);
|
||||||
@ -238,7 +280,7 @@ static size_t load_string(const char *src, int maxsize)
|
|||||||
return strlen(command_ps);
|
return strlen(command_ps);
|
||||||
}
|
}
|
||||||
# if ENABLE_FEATURE_TAB_COMPLETION
|
# if ENABLE_FEATURE_TAB_COMPLETION
|
||||||
static void save_string(char *dst, int maxsize)
|
static void save_string(char *dst, unsigned maxsize)
|
||||||
{
|
{
|
||||||
safe_strncpy(dst, command_ps, maxsize);
|
safe_strncpy(dst, command_ps, maxsize);
|
||||||
}
|
}
|
||||||
@ -1719,13 +1761,11 @@ static int lineedit_read_key(char *read_key_buffer)
|
|||||||
pushback:
|
pushback:
|
||||||
/* Invalid sequence. Save all "bad bytes" except first */
|
/* Invalid sequence. Save all "bad bytes" except first */
|
||||||
read_key_ungets(read_key_buffer, unicode_buf + 1, unicode_idx - 1);
|
read_key_ungets(read_key_buffer, unicode_buf + 1, unicode_idx - 1);
|
||||||
/*
|
# if !ENABLE_UNICODE_PRESERVE_BROKEN
|
||||||
* ic = unicode_buf[0] sounds even better, but currently
|
|
||||||
* this does not work: wchar_t[] -> char[] conversion
|
|
||||||
* when lineedit finishes mangles such "raw bytes"
|
|
||||||
* (by misinterpreting them as unicode chars):
|
|
||||||
*/
|
|
||||||
ic = CONFIG_SUBST_WCHAR;
|
ic = CONFIG_SUBST_WCHAR;
|
||||||
|
# else
|
||||||
|
ic = unicode_mark_inv_wchar(unicode_buf[0]);
|
||||||
|
# endif
|
||||||
} else {
|
} else {
|
||||||
/* Valid unicode char, return its code */
|
/* Valid unicode char, return its code */
|
||||||
ic = wc;
|
ic = wc;
|
||||||
|
@ -423,7 +423,6 @@ static int wcwidth(unsigned ucs)
|
|||||||
# if LAST_SUPPORTED_WCHAR >= 0x300
|
# if LAST_SUPPORTED_WCHAR >= 0x300
|
||||||
/* sorted list of non-overlapping intervals of non-spacing characters */
|
/* sorted list of non-overlapping intervals of non-spacing characters */
|
||||||
/* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
|
/* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
|
||||||
static const struct interval combining[] = {
|
|
||||||
# define BIG_(a,b) { a, b },
|
# define BIG_(a,b) { a, b },
|
||||||
# define PAIR(a,b)
|
# define PAIR(a,b)
|
||||||
# define ARRAY /* PAIR if < 0x4000 and no more than 4 chars big */ \
|
# define ARRAY /* PAIR if < 0x4000 and no more than 4 chars big */ \
|
||||||
@ -557,10 +556,9 @@ static int wcwidth(unsigned ucs)
|
|||||||
BIG_(0xFE20, 0xFE23) \
|
BIG_(0xFE20, 0xFE23) \
|
||||||
BIG_(0xFEFF, 0xFEFF) \
|
BIG_(0xFEFF, 0xFEFF) \
|
||||||
BIG_(0xFFF9, 0xFFFB)
|
BIG_(0xFFF9, 0xFFFB)
|
||||||
ARRAY
|
static const struct interval combining[] = { ARRAY };
|
||||||
# undef BIG_
|
# undef BIG_
|
||||||
# undef PAIR
|
# undef PAIR
|
||||||
};
|
|
||||||
# define BIG_(a,b)
|
# define BIG_(a,b)
|
||||||
# define PAIR(a,b) (a << 2) | (b-a),
|
# define PAIR(a,b) (a << 2) | (b-a),
|
||||||
static const uint16_t combining1[] = { ARRAY };
|
static const uint16_t combining1[] = { ARRAY };
|
||||||
@ -668,7 +666,6 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc)
|
|||||||
* http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt
|
* http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt
|
||||||
* Bidi_Class=Left_To_Right | Bidi_Class=Arabic_Letter
|
* Bidi_Class=Left_To_Right | Bidi_Class=Arabic_Letter
|
||||||
*/
|
*/
|
||||||
static const struct interval rtl_b[] = {
|
|
||||||
# define BIG_(a,b) { a, b },
|
# define BIG_(a,b) { a, b },
|
||||||
# define PAIR(a,b)
|
# define PAIR(a,b)
|
||||||
# define ARRAY \
|
# define ARRAY \
|
||||||
@ -723,10 +720,9 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc)
|
|||||||
{0x10E7F, 0x10FFF},
|
{0x10E7F, 0x10FFF},
|
||||||
{0x1E800, 0x1EFFF}
|
{0x1E800, 0x1EFFF}
|
||||||
*/
|
*/
|
||||||
ARRAY
|
static const struct interval rtl_b[] = { ARRAY };
|
||||||
# undef BIG_
|
# undef BIG_
|
||||||
# undef PAIR
|
# undef PAIR
|
||||||
};
|
|
||||||
# define BIG_(a,b)
|
# define BIG_(a,b)
|
||||||
# define PAIR(a,b) (a << 2) | (b-a),
|
# define PAIR(a,b) (a << 2) | (b-a),
|
||||||
static const uint16_t rtl_p[] = { ARRAY };
|
static const uint16_t rtl_p[] = { ARRAY };
|
||||||
@ -755,7 +751,6 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc)
|
|||||||
* White_Space, Other_Neutral, European_Number, European_Separator,
|
* White_Space, Other_Neutral, European_Number, European_Separator,
|
||||||
* European_Terminator, Arabic_Number, Common_Separator
|
* European_Terminator, Arabic_Number, Common_Separator
|
||||||
*/
|
*/
|
||||||
static const struct interval neutral_b[] = {
|
|
||||||
# define BIG_(a,b) { a, b },
|
# define BIG_(a,b) { a, b },
|
||||||
# define PAIR(a,b)
|
# define PAIR(a,b)
|
||||||
# define ARRAY \
|
# define ARRAY \
|
||||||
@ -929,10 +924,9 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc)
|
|||||||
{0x1F030, 0x1F093},
|
{0x1F030, 0x1F093},
|
||||||
{0x1F100, 0x1F10A}
|
{0x1F100, 0x1F10A}
|
||||||
*/
|
*/
|
||||||
ARRAY
|
static const struct interval neutral_b[] = { ARRAY };
|
||||||
# undef BIG_
|
# undef BIG_
|
||||||
# undef PAIR
|
# undef PAIR
|
||||||
};
|
|
||||||
# define BIG_(a,b)
|
# define BIG_(a,b)
|
||||||
# define PAIR(a,b) (a << 2) | (b-a),
|
# define PAIR(a,b) (a << 2) | (b-a),
|
||||||
static const uint16_t neutral_p[] = { ARRAY };
|
static const uint16_t neutral_p[] = { ARRAY };
|
||||||
|
@ -7,8 +7,30 @@
|
|||||||
|
|
||||||
. ./testing.sh
|
. ./testing.sh
|
||||||
|
|
||||||
|
test -f "$bindir/.config" && . "$bindir/.config"
|
||||||
|
|
||||||
# testing "test name" "options" "expected result" "file input" "stdin"
|
# testing "test name" "options" "expected result" "file input" "stdin"
|
||||||
|
|
||||||
|
if test x"$CONFIG_UNICODE_PRESERVE_BROKEN" = x"y"; then
|
||||||
|
testing "One byte which is not valid unicode char followed by valid input" \
|
||||||
|
"script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
|
||||||
|
"\
|
||||||
|
00000000 ff 2d 0a |.-.|
|
||||||
|
00000003
|
||||||
|
" \
|
||||||
|
"" \
|
||||||
|
"echo \xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
|
||||||
|
|
||||||
|
testing "30 bytes which are not valid unicode chars followed by valid input" \
|
||||||
|
"script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
|
||||||
|
"\
|
||||||
|
00000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
|
||||||
|
00000010 ff ff ff ff ff ff ff ff ff ff ff ff ff ff 2d 0a |..............-.|
|
||||||
|
00000020
|
||||||
|
" \
|
||||||
|
"" \
|
||||||
|
"echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
|
||||||
|
else
|
||||||
testing "One byte which is not valid unicode char followed by valid input" \
|
testing "One byte which is not valid unicode char followed by valid input" \
|
||||||
"script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
|
"script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \
|
||||||
"\
|
"\
|
||||||
@ -27,6 +49,8 @@ testing "30 bytes which are not valid unicode chars followed by valid input" \
|
|||||||
" \
|
" \
|
||||||
"" \
|
"" \
|
||||||
"echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
|
"echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n"
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
# Not sure this behavior is perfect: we lose all invalid input which precedes
|
# Not sure this behavior is perfect: we lose all invalid input which precedes
|
||||||
# arrow keys and such. In this example, \xff\xff are lost
|
# arrow keys and such. In this example, \xff\xff are lost
|
||||||
|
Loading…
Reference in New Issue
Block a user