more fine-grained Unicode support
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
parent
083e172641
commit
2edba21f4c
51
Config.in
51
Config.in
@ -141,6 +141,57 @@ config FEATURE_CHECK_UNICODE_IN_ENV
|
|||||||
|
|
||||||
Otherwise, Unicode support will be always enabled and active.
|
Otherwise, Unicode support will be always enabled and active.
|
||||||
|
|
||||||
|
config SUBST_WCHAR
|
||||||
|
int "Character code to substitute unprintable characters with"
|
||||||
|
range 1 4294967295
|
||||||
|
depends on FEATURE_ASSUME_UNICODE
|
||||||
|
default 63
|
||||||
|
help
|
||||||
|
Typical values are 63 for '?' (works with any output device),
|
||||||
|
30 for ASCII substitute control code,
|
||||||
|
65533 (0xfffd) for Unicode replacement character.
|
||||||
|
|
||||||
|
config LAST_SUPPORTED_WCHAR
|
||||||
|
int "Range of supported Unicode characters"
|
||||||
|
range 0 4294967295
|
||||||
|
depends on FEATURE_ASSUME_UNICODE
|
||||||
|
default 767
|
||||||
|
help
|
||||||
|
Any character with Unicode value bigger than this is assumed
|
||||||
|
to be non-printable on output device. Many applets replace
|
||||||
|
such chars with substitution character.
|
||||||
|
|
||||||
|
The idea is that many valid printable Unicode chars are
|
||||||
|
nevertheless are not displayed correctly. Think about
|
||||||
|
combining charachers, double-wide hieroglyphs and such.
|
||||||
|
Many terminals, xterms and such will fail to handle them
|
||||||
|
correctly.
|
||||||
|
|
||||||
|
Typical values are:
|
||||||
|
126 - ASCII only
|
||||||
|
767 (0x2ff) - there are no combining chars in [0..767] range
|
||||||
|
(the range includes Latin 1, Latin Ext. A and B),
|
||||||
|
code is ~700 bytes smaller for this case.
|
||||||
|
4351 (0x10ff) - there are no double-wide chars in [0..4351] range,
|
||||||
|
code is ~300 bytes smaller for this case.
|
||||||
|
0 - off, any valid printable Unicode character will be printed.
|
||||||
|
|
||||||
|
config UNICODE_COMBINING_WCHARS
|
||||||
|
bool "Allow zero-width Unicode characters on output"
|
||||||
|
default n
|
||||||
|
depends on FEATURE_ASSUME_UNICODE
|
||||||
|
help
|
||||||
|
With this option off, any Unicode char with width of 0
|
||||||
|
is substituted on output.
|
||||||
|
|
||||||
|
config UNICODE_WIDE_WCHARS
|
||||||
|
bool "Allow wide Unicode characters on output"
|
||||||
|
default n
|
||||||
|
depends on FEATURE_ASSUME_UNICODE
|
||||||
|
help
|
||||||
|
With this option off, any Unicode char with width > 1
|
||||||
|
is substituted on output.
|
||||||
|
|
||||||
config LONG_OPTS
|
config LONG_OPTS
|
||||||
bool "Support for --long-options"
|
bool "Support for --long-options"
|
||||||
default y
|
default y
|
||||||
|
@ -216,8 +216,6 @@ size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
|
|||||||
return org_n - n;
|
return org_n - n;
|
||||||
}
|
}
|
||||||
|
|
||||||
#include "unicode_wcwidth.c"
|
|
||||||
|
|
||||||
int FAST_FUNC iswspace(wint_t wc)
|
int FAST_FUNC iswspace(wint_t wc)
|
||||||
{
|
{
|
||||||
return (unsigned)wc <= 0x7f && isspace(wc);
|
return (unsigned)wc <= 0x7f && isspace(wc);
|
||||||
@ -233,6 +231,8 @@ int FAST_FUNC iswpunct(wint_t wc)
|
|||||||
return (unsigned)wc <= 0x7f && ispunct(wc);
|
return (unsigned)wc <= 0x7f && ispunct(wc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#include "unicode_wcwidth.c"
|
||||||
|
|
||||||
#endif /* Homegrown Unicode support */
|
#endif /* Homegrown Unicode support */
|
||||||
|
|
||||||
|
|
||||||
@ -251,8 +251,22 @@ char* FAST_FUNC unicode_cut_nchars(unsigned width, const char *src)
|
|||||||
char *dst;
|
char *dst;
|
||||||
unsigned dst_len;
|
unsigned dst_len;
|
||||||
|
|
||||||
if (unicode_status != UNICODE_ON)
|
if (unicode_status != UNICODE_ON) {
|
||||||
return xasprintf("%-*.*s", width, width, src);
|
char *d = dst = xmalloc(width + 1);
|
||||||
|
while ((int)--width >= 0) {
|
||||||
|
unsigned char c = *src;
|
||||||
|
if (c == '\0') {
|
||||||
|
do
|
||||||
|
*d++ = ' ';
|
||||||
|
while ((int)--width >= 0);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
*d++ = (c >= ' ' && c < 0x7f) ? c : '?';
|
||||||
|
src++;
|
||||||
|
}
|
||||||
|
*d = '\0';
|
||||||
|
return dst;
|
||||||
|
}
|
||||||
|
|
||||||
dst = NULL;
|
dst = NULL;
|
||||||
dst_len = 0;
|
dst_len = 0;
|
||||||
@ -260,31 +274,64 @@ char* FAST_FUNC unicode_cut_nchars(unsigned width, const char *src)
|
|||||||
int w;
|
int w;
|
||||||
wchar_t wc;
|
wchar_t wc;
|
||||||
|
|
||||||
dst = xrealloc(dst, dst_len + 2 * MB_CUR_MAX);
|
|
||||||
#if ENABLE_LOCALE_SUPPORT
|
#if ENABLE_LOCALE_SUPPORT
|
||||||
{
|
{
|
||||||
mbstate_t mbst = { 0 };
|
mbstate_t mbst = { 0 };
|
||||||
ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst);
|
ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst);
|
||||||
if (rc <= 0) /* error, or end-of-string */
|
/* If invalid sequence is seen: -1 is returned,
|
||||||
|
* src points to the invalid sequence, errno = EILSEQ.
|
||||||
|
* Else number of wchars (excluding terminating L'\0')
|
||||||
|
* written to dest is returned.
|
||||||
|
* If len (here: 1) non-L'\0' wchars stored at dest,
|
||||||
|
* src points to the next char to be converted.
|
||||||
|
* If string is completely converted: src = NULL.
|
||||||
|
*/
|
||||||
|
if (rc == 0) /* end-of-string */
|
||||||
break;
|
break;
|
||||||
|
if (rc < 0) { /* error */
|
||||||
|
src++;
|
||||||
|
goto subst;
|
||||||
|
}
|
||||||
|
if (!iswprint(wc))
|
||||||
|
goto subst;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
src = mbstowc_internal(&wc, src);
|
{
|
||||||
if (!src || wc == 0) /* error, or end-of-string */
|
const char *src1 = mbstowc_internal(&wc, src);
|
||||||
break;
|
/* src = NULL: invalid sequence is seen,
|
||||||
#endif
|
* else: wc is set, src is advanced to next mb char
|
||||||
w = wcwidth(wc);
|
*/
|
||||||
if (w < 0) /* non-printable wchar */
|
if (src1) {/* no error */
|
||||||
break;
|
if (wc == 0) /* end-of-string */
|
||||||
width -= w;
|
break;
|
||||||
if ((int)width < 0) { /* string is longer than width */
|
src = src1;
|
||||||
width += w;
|
} else { /* error */
|
||||||
while (width) {
|
src++;
|
||||||
dst[dst_len++] = ' ';
|
goto subst;
|
||||||
width--;
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR)
|
||||||
|
goto subst;
|
||||||
|
w = wcwidth(wc);
|
||||||
|
if ((ENABLE_UNICODE_COMBINING_WCHARS && w < 0) /* non-printable wchar */
|
||||||
|
|| (!ENABLE_UNICODE_COMBINING_WCHARS && wc <= 0)
|
||||||
|
|| (!ENABLE_UNICODE_WIDE_WCHARS && wc > 1)
|
||||||
|
) {
|
||||||
|
subst:
|
||||||
|
wc = CONFIG_SUBST_WCHAR;
|
||||||
|
w = 1;
|
||||||
|
}
|
||||||
|
width -= w;
|
||||||
|
/* Note: if width == 0, we still may add more chars,
|
||||||
|
* they may be zero-width or combining ones */
|
||||||
|
if ((int)width < 0) {
|
||||||
|
/* can't add this wc, string would become longer than width */
|
||||||
|
width += w;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
dst = xrealloc(dst, dst_len + MB_CUR_MAX);
|
||||||
#if ENABLE_LOCALE_SUPPORT
|
#if ENABLE_LOCALE_SUPPORT
|
||||||
{
|
{
|
||||||
mbstate_t mbst = { 0 };
|
mbstate_t mbst = { 0 };
|
||||||
@ -294,7 +341,14 @@ char* FAST_FUNC unicode_cut_nchars(unsigned width, const char *src)
|
|||||||
dst_len += wcrtomb_internal(&dst[dst_len], wc);
|
dst_len += wcrtomb_internal(&dst[dst_len], wc);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Pad to remaining width */
|
||||||
|
dst = xrealloc(dst, dst_len + width + 1);
|
||||||
|
while ((int)--width >= 0) {
|
||||||
|
dst[dst_len++] = ' ';
|
||||||
|
}
|
||||||
dst[dst_len] = '\0';
|
dst[dst_len] = '\0';
|
||||||
|
|
||||||
return dst;
|
return dst;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -59,6 +59,13 @@
|
|||||||
* Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
|
* Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#if CONFIG_LAST_SUPPORTED_WCHAR == 0
|
||||||
|
# define LAST_SUPPORTED_WCHAR ((1 << 31) - 1)
|
||||||
|
#else
|
||||||
|
# define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if LAST_SUPPORTED_WCHAR >= 0x0300
|
||||||
struct interval {
|
struct interval {
|
||||||
uint16_t first;
|
uint16_t first;
|
||||||
uint16_t last;
|
uint16_t last;
|
||||||
@ -111,6 +118,7 @@ static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max)
|
|||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/* The following two functions define the column width of an ISO 10646
|
/* The following two functions define the column width of an ISO 10646
|
||||||
@ -146,6 +154,7 @@ static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max)
|
|||||||
*/
|
*/
|
||||||
static int wcwidth(unsigned ucs)
|
static int wcwidth(unsigned ucs)
|
||||||
{
|
{
|
||||||
|
#if LAST_SUPPORTED_WCHAR >= 0x0300
|
||||||
/* sorted list of non-overlapping intervals of non-spacing characters */
|
/* sorted list of non-overlapping intervals of non-spacing characters */
|
||||||
/* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
|
/* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
|
||||||
static const struct interval combining[] = {
|
static const struct interval combining[] = {
|
||||||
@ -420,12 +429,15 @@ static int wcwidth(unsigned ucs)
|
|||||||
#undef BIG_
|
#undef BIG_
|
||||||
#undef PAIR
|
#undef PAIR
|
||||||
};
|
};
|
||||||
|
# if LAST_SUPPORTED_WCHAR >= 0x1100
|
||||||
static const struct interval combining0x10000[] = {
|
static const struct interval combining0x10000[] = {
|
||||||
{ 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
|
{ 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
|
||||||
{ 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
|
{ 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
|
||||||
{ 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
|
{ 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
|
||||||
{ 0xD242, 0xD244 }
|
{ 0xD242, 0xD244 }
|
||||||
};
|
};
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
if (ucs == 0)
|
if (ucs == 0)
|
||||||
return 0;
|
return 0;
|
||||||
@ -435,6 +447,9 @@ static int wcwidth(unsigned ucs)
|
|||||||
if (ucs < 0x0300) /* optimization */
|
if (ucs < 0x0300) /* optimization */
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
|
#if LAST_SUPPORTED_WCHAR < 0x0300
|
||||||
|
return -1;
|
||||||
|
#else
|
||||||
/* binary search in table of non-spacing characters */
|
/* binary search in table of non-spacing characters */
|
||||||
if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1))
|
if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1))
|
||||||
return 0;
|
return 0;
|
||||||
@ -444,6 +459,9 @@ static int wcwidth(unsigned ucs)
|
|||||||
if (ucs < 0x1100) /* optimization */
|
if (ucs < 0x1100) /* optimization */
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
|
# if LAST_SUPPORTED_WCHAR < 0x1100
|
||||||
|
return -1;
|
||||||
|
# else
|
||||||
/* binary search in table of non-spacing characters, cont. */
|
/* binary search in table of non-spacing characters, cont. */
|
||||||
if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
|
if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
|
||||||
return 0;
|
return 0;
|
||||||
@ -458,8 +476,8 @@ static int wcwidth(unsigned ucs)
|
|||||||
|
|
||||||
return 1 +
|
return 1 +
|
||||||
( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */
|
( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */
|
||||||
|| ucs == 0x2329
|
|| ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */
|
||||||
|| ucs == 0x232a
|
|| ucs == 0x232a /* right-pointing angle bracket; also CJK punct. char */
|
||||||
|| (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */
|
|| (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */
|
||||||
|| (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */
|
|| (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */
|
||||||
|| (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */
|
|| (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */
|
||||||
@ -470,4 +488,6 @@ static int wcwidth(unsigned ucs)
|
|||||||
|| (ucs >= 0x20000 && ucs <= 0x2fffd)
|
|| (ucs >= 0x20000 && ucs <= 0x2fffd)
|
||||||
|| (ucs >= 0x30000 && ucs <= 0x3fffd)
|
|| (ucs >= 0x30000 && ucs <= 0x3fffd)
|
||||||
);
|
);
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user