diff --git a/libbb/unicode_wcwidth.c b/libbb/unicode_wcwidth.c index 410c741ac..c7cc524a6 100644 --- a/libbb/unicode_wcwidth.c +++ b/libbb/unicode_wcwidth.c @@ -90,13 +90,13 @@ * until Unicode committee assigns something there. */ -#if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR > 0x30000 -# define LAST_SUPPORTED_WCHAR 0x30000 +#if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR >= 0x30000 +# define LAST_SUPPORTED_WCHAR 0x2ffff #else # define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR #endif -#if LAST_SUPPORTED_WCHAR >= 0x0300 +#if LAST_SUPPORTED_WCHAR >= 0x300 struct interval { uint16_t first; uint16_t last; @@ -185,7 +185,7 @@ static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max) */ static int wcwidth(unsigned ucs) { -#if LAST_SUPPORTED_WCHAR >= 0x0300 +#if LAST_SUPPORTED_WCHAR >= 0x300 /* sorted list of non-overlapping intervals of non-spacing characters */ /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ static const struct interval combining[] = { @@ -460,75 +460,75 @@ static int wcwidth(unsigned ucs) #undef BIG_ #undef PAIR }; -# if LAST_SUPPORTED_WCHAR >= 0x10000 - /* Combining chars in Supplementary Multilingual Plane 0x1xxxx */ - static const struct interval combining0x10000[] = { - { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F }, - { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 }, - { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD }, - { 0xD242, 0xD244 } - }; -# endif #endif if (ucs == 0) return 0; - /* test for 8-bit control characters (00-1f, 80-9f, 7f) */ + + /* Test for 8-bit control characters (00-1f, 80-9f, 7f) */ if ((ucs & ~0x80) < 0x20 || ucs == 0x7f) return -1; - if (ucs < 0x0300) /* optimization */ + /* Quick abort if it is an obviously invalid char */ + if (ucs > LAST_SUPPORTED_WCHAR) + return -1; + + /* Optimization: no combining chars below 0x300 */ + if (LAST_SUPPORTED_WCHAR < 0x300 || ucs < 0x300) return 1; -#if 
LAST_SUPPORTED_WCHAR < 0x0300 - return -1; -#else - /* binary search in table of non-spacing characters */ +#if LAST_SUPPORTED_WCHAR >= 0x300 + /* Binary search in table of non-spacing characters */ if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1)) return 0; if (in_uint16_table(ucs, combining1, ARRAY_SIZE(combining1) - 1)) return 0; - if (ucs < 0x1100) /* optimization */ + /* Optimization: all chars below 0x1100 are not double-width */ + if (LAST_SUPPORTED_WCHAR < 0x1100 || ucs < 0x1100) return 1; -# if LAST_SUPPORTED_WCHAR < 0x1100 - return -1; -# else - if (ucs >= LAST_SUPPORTED_WCHAR) - return -1; - - /* High (d800..dbff) and low (dc00..dfff) surrogates are invalid (used only by UTF16) */ - /* We also exclude Private Use Area (e000..f8ff) */ - if (LAST_SUPPORTED_WCHAR >= 0xd800 - && (ucs >= 0xd800 || ucs <= 0xf8ff) +# if LAST_SUPPORTED_WCHAR >= 0x1100 + /* Invalid code points: */ + /* High (d800..dbff) and low (dc00..dfff) surrogates (valid only in UTF16) */ + /* Private Use Area (e000..f8ff) */ + /* Noncharacters fdd0..fdef */ + if ((LAST_SUPPORTED_WCHAR >= 0xd800 && ucs >= 0xd800 && ucs <= 0xf8ff) + || (LAST_SUPPORTED_WCHAR >= 0xfdd0 && ucs >= 0xfdd0 && ucs <= 0xfdef) ) { return -1; } - /* 0xfffe and 0xffff in every plane are invalid */ - if (LAST_SUPPORTED_WCHAR >= 0xfffe - && (ucs & 0xfffe) == 0xfffe - ) { + if (LAST_SUPPORTED_WCHAR >= 0xfffe && ((ucs & 0xfffe) == 0xfffe)) { return -1; } # if LAST_SUPPORTED_WCHAR >= 0x10000 - /* binary search in table of non-spacing characters in Supplementary Multilingual Plane */ - if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) - return 0; -# endif - /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */ - if (LAST_SUPPORTED_WCHAR >= 0xE0001 - && ( ucs == 0xE0001 - || (ucs >= 0xE0020 && ucs <= 0xE007F) - || (ucs >= 0xE0100 && ucs <= 0xE01EF) - ) - ) { - return 0; + if (ucs >= 0x10000) { + /* Combining chars in Supplementary Multilingual 
Plane 0x1xxxx */ + static const struct interval combining0x10000[] = { + { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F }, + { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 }, + { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD }, + { 0xD242, 0xD244 } + }; + /* Binary search in table of non-spacing characters in Supplementary Multilingual Plane */ + if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) + return 0; + /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */ + if (LAST_SUPPORTED_WCHAR >= 0xE0001 + && ( ucs == 0xE0001 + || (ucs >= 0xE0020 && ucs <= 0xE007F) + || (ucs >= 0xE0100 && ucs <= 0xE01EF) + ) + ) { + return 0; + } } +# endif - /* if we arrive here, ucs is not a combining or C0/C1 control character */ - + /* If we arrive here, ucs is not a combining or C0/C1 control character. + * Check whether it's 1 char or 2-char wide. + */ return 1 + ( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */ || ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */ diff --git a/testsuite/ls.tests b/testsuite/ls.tests index e08249ea6..169313a63 100755 --- a/testsuite/ls.tests +++ b/testsuite/ls.tests @@ -13,7 +13,7 @@ mkdir ls.testdir || exit 1 # With Unicode provided by libc locale, I'm not sure this test can pass. # I suspect we might fail to skip exactly correct number of bytes -# over broken unicode sequences. +# over broken unicode sequences. 
test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ && test x"$CONFIG_LOCALE_SUPPORT" != x"y" \ && test x"$CONFIG_SUBST_WCHAR" = x"63" \ @@ -144,7 +144,7 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ 0003_2.1__First_possible_sequence_of_a_certain_length_____________________| 0004_2.1.2__2_bytes__U-00000080_:________"?"______________________________| 0005_2.1.3__3_bytes__U-00000800_:________"ࠀ"______________________________| -0006_2.1.4__4_bytes__U-00010000_:________"?"______________________________| +0006_2.1.4__4_bytes__U-00010000_:________"𐀀"______________________________| 0007_2.1.5__5_bytes__U-00200000_:________"?"______________________________| 0008_2.1.6__6_bytes__U-04000000_:________"?"______________________________| 0009_2.2__Last_possible_sequence_of_a_certain_length______________________| @@ -155,9 +155,9 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ 0014_2.2.5__5_bytes__U-03FFFFFF_:________"?"______________________________| 0015_2.2.6__6_bytes__U-7FFFFFFF_:________"?"______________________________| 0016_2.3__Other_boundary_conditions_______________________________________| -0017_2.3.1__U-0000D7FF_=_ed_9f_bf_=_"?"___________________________________| +0017_2.3.1__U-0000D7FF_=_ed_9f_bf_=_"퟿"___________________________________| 0018_2.3.2__U-0000E000_=_ee_80_80_=_"?"___________________________________| -0019_2.3.3__U-0000FFFD_=_ef_bf_bd_=_"?"___________________________________| +0019_2.3.3__U-0000FFFD_=_ef_bf_bd_=_"�"___________________________________| 0020_2.3.4__U-0010FFFF_=_f4_8f_bf_bf_=_"?"________________________________| 0021_2.3.5__U-00110000_=_f4_90_80_80_=_"?"________________________________| 0022_3__Malformed_sequences_______________________________________________|