exclude more invalid unicode chars
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
parent
344a44fbc5
commit
40e4e88a28
@ -59,8 +59,39 @@
|
|||||||
* Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
|
* Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#if CONFIG_LAST_SUPPORTED_WCHAR == 0
|
/* Assigned Unicode character ranges:
|
||||||
# define LAST_SUPPORTED_WCHAR ((1 << 31) - 1)
|
* Plane Range
|
||||||
|
* 0 0000–FFFF Basic Multilingual Plane
|
||||||
|
* 1 10000–1FFFF Supplementary Multilingual Plane
|
||||||
|
* 2 20000–2FFFF Supplementary Ideographic Plane
|
||||||
|
* 3 30000-3FFFF Tertiary Ideographic Plane (no chars assigned yet)
|
||||||
|
* 4-13 40000–DFFFF currently unassigned
|
||||||
|
* 14 E0000–EFFFF Supplementary Special-purpose Plane
|
||||||
|
* 15 F0000–FFFFF Supplementary Private Use Area-A
|
||||||
|
* 16 100000–10FFFF Supplementary Private Use Area-B
|
||||||
|
*
|
||||||
|
* "Supplementary Special-purpose Plane currently contains non-graphical
|
||||||
|
* characters in two blocks of 128 and 240 characters. The first block
|
||||||
|
* is for language tag characters for use when language cannot be indicated
|
||||||
|
* through other protocols (such as the xml:lang attribute in XML).
|
||||||
|
* The other block contains glyph variation selectors to indicate
|
||||||
|
* an alternate glyph for a character that cannot be determined by context."
|
||||||
|
*
|
||||||
|
* In simpler terms: it is a tool to fix the "Han unification" mess
|
||||||
|
* created by Unicode committee, to select Chinese/Japanese/Korean/Taiwan
|
||||||
|
* version of a character. (They forgot that the whole purpose of the Unicode
|
||||||
|
* was to be able to write all chars in one charset without such tricks).
|
||||||
|
* Until East Asian users say it is actually necessary to support these
|
||||||
|
* code points in console applications like busybox
|
||||||
|
* (i.e. do these chars ever appear in filenames, hostnames, text files
|
||||||
|
* and such?), we are treating these code points as invalid.
|
||||||
|
*
|
||||||
|
* Tertiary Ideographic Plane is also ignored for now,
|
||||||
|
* until Unicode committee assigns something there.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR > 0x30000
|
||||||
|
# define LAST_SUPPORTED_WCHAR 0x30000
|
||||||
#else
|
#else
|
||||||
# define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR
|
# define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR
|
||||||
#endif
|
#endif
|
||||||
@ -429,7 +460,8 @@ static int wcwidth(unsigned ucs)
|
|||||||
#undef BIG_
|
#undef BIG_
|
||||||
#undef PAIR
|
#undef PAIR
|
||||||
};
|
};
|
||||||
# if LAST_SUPPORTED_WCHAR >= 0x1100
|
# if LAST_SUPPORTED_WCHAR >= 0x10000
|
||||||
|
/* Combining chars in Supplementary Multilingual Plane 0x1xxxx */
|
||||||
static const struct interval combining0x10000[] = {
|
static const struct interval combining0x10000[] = {
|
||||||
{ 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
|
{ 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
|
||||||
{ 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
|
{ 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
|
||||||
@ -462,12 +494,35 @@ static int wcwidth(unsigned ucs)
|
|||||||
# if LAST_SUPPORTED_WCHAR < 0x1100
|
# if LAST_SUPPORTED_WCHAR < 0x1100
|
||||||
return -1;
|
return -1;
|
||||||
# else
|
# else
|
||||||
/* binary search in table of non-spacing characters, cont. */
|
if (ucs >= LAST_SUPPORTED_WCHAR)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
/* High (d800..dbff) and low (dc00..dfff) surrogates are invalid (used only by UTF16) */
|
||||||
|
/* We also exclude Private Use Area (e000..f8ff) */
|
||||||
|
if (LAST_SUPPORTED_WCHAR >= 0xd800
|
||||||
|
&& (ucs >= 0xd800 || ucs <= 0xf8ff)
|
||||||
|
) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 0xfffe and 0xffff in every plane are invalid */
|
||||||
|
if (LAST_SUPPORTED_WCHAR >= 0xfffe
|
||||||
|
&& (ucs & 0xfffe) == 0xfffe
|
||||||
|
) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
# if LAST_SUPPORTED_WCHAR >= 0x10000
|
||||||
|
/* binary search in table of non-spacing characters in Supplementary Multilingual Plane */
|
||||||
if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
|
if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
|
||||||
return 0;
|
return 0;
|
||||||
if (ucs == 0xE0001
|
# endif
|
||||||
|| (ucs >= 0xE0020 && ucs <= 0xE007F)
|
/* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */
|
||||||
|| (ucs >= 0xE0100 && ucs <= 0xE01EF)
|
if (LAST_SUPPORTED_WCHAR >= 0xE0001
|
||||||
|
&& ( ucs == 0xE0001
|
||||||
|
|| (ucs >= 0xE0020 && ucs <= 0xE007F)
|
||||||
|
|| (ucs >= 0xE0100 && ucs <= 0xE01EF)
|
||||||
|
)
|
||||||
) {
|
) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -485,8 +540,7 @@ static int wcwidth(unsigned ucs)
|
|||||||
|| (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */
|
|| (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */
|
||||||
|| (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */
|
|| (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */
|
||||||
|| (ucs >= 0xffe0 && ucs <= 0xffe6)
|
|| (ucs >= 0xffe0 && ucs <= 0xffe6)
|
||||||
|| (ucs >= 0x20000 && ucs <= 0x2fffd)
|
|| ((ucs >> 17) == (2 >> 1)) /* 20000..3ffff: Supplementary and Tertiary Ideographic Planes */
|
||||||
|| (ucs >= 0x30000 && ucs <= 0x3fffd)
|
|
||||||
);
|
);
|
||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
@ -18,7 +18,7 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
|
|||||||
&& test x"$CONFIG_LOCALE_SUPPORT" != x"y" \
|
&& test x"$CONFIG_LOCALE_SUPPORT" != x"y" \
|
||||||
&& test x"$CONFIG_SUBST_WCHAR" = x"63" \
|
&& test x"$CONFIG_SUBST_WCHAR" = x"63" \
|
||||||
&& test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \
|
&& test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \
|
||||||
&& testing "ls unicode test" \
|
&& testing "ls unicode test with codepoints limited to 767" \
|
||||||
"(cd ls.testdir && sh ../ls.mk_uni_tests) && ls -1 ls.testdir" \
|
"(cd ls.testdir && sh ../ls.mk_uni_tests) && ls -1 ls.testdir" \
|
||||||
'0001_1__Some_correct_UTF-8_text___________________________________________|
|
'0001_1__Some_correct_UTF-8_text___________________________________________|
|
||||||
0002_2__Boundary_condition_test_cases_____________________________________|
|
0002_2__Boundary_condition_test_cases_____________________________________|
|
||||||
@ -132,6 +132,125 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
|
|||||||
0110_5.3.2__U+FFFF_=_ef_bf_bf_=_"?"_______________________________________|
|
0110_5.3.2__U+FFFF_=_ef_bf_bf_=_"?"_______________________________________|
|
||||||
' "" ""
|
' "" ""
|
||||||
|
|
||||||
|
# Currently fails on "0080_4.2.2__U-000007FF_=_e0_9f_bf" line
|
||||||
|
test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
|
||||||
|
&& test x"$CONFIG_LOCALE_SUPPORT" != x"y" \
|
||||||
|
&& test x"$CONFIG_SUBST_WCHAR" = x"63" \
|
||||||
|
&& test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"0" \
|
||||||
|
&& testing "ls unicode test with unlimited codepoints" \
|
||||||
|
"(cd ls.testdir && sh ../ls.mk_uni_tests) && ls -1 ls.testdir" \
|
||||||
|
'0001_1__Some_correct_UTF-8_text___________________________________________|
|
||||||
|
0002_2__Boundary_condition_test_cases_____________________________________|
|
||||||
|
0003_2.1__First_possible_sequence_of_a_certain_length_____________________|
|
||||||
|
0004_2.1.2__2_bytes__U-00000080_:________"?"______________________________|
|
||||||
|
0005_2.1.3__3_bytes__U-00000800_:________"ࠀ"______________________________|
|
||||||
|
0006_2.1.4__4_bytes__U-00010000_:________"?"______________________________|
|
||||||
|
0007_2.1.5__5_bytes__U-00200000_:________"?"______________________________|
|
||||||
|
0008_2.1.6__6_bytes__U-04000000_:________"?"______________________________|
|
||||||
|
0009_2.2__Last_possible_sequence_of_a_certain_length______________________|
|
||||||
|
0010_2.2.1__1_byte___U-0000007F_:________"?"______________________________|
|
||||||
|
0011_2.2.2__2_bytes__U-000007FF_:________"߿"______________________________|
|
||||||
|
0012_2.2.3__3_bytes__U-0000FFFF_:________"?"______________________________|
|
||||||
|
0013_2.2.4__4_bytes__U-001FFFFF_:________"?"______________________________|
|
||||||
|
0014_2.2.5__5_bytes__U-03FFFFFF_:________"?"______________________________|
|
||||||
|
0015_2.2.6__6_bytes__U-7FFFFFFF_:________"?"______________________________|
|
||||||
|
0016_2.3__Other_boundary_conditions_______________________________________|
|
||||||
|
0017_2.3.1__U-0000D7FF_=_ed_9f_bf_=_"?"___________________________________|
|
||||||
|
0018_2.3.2__U-0000E000_=_ee_80_80_=_"?"___________________________________|
|
||||||
|
0019_2.3.3__U-0000FFFD_=_ef_bf_bd_=_"?"___________________________________|
|
||||||
|
0020_2.3.4__U-0010FFFF_=_f4_8f_bf_bf_=_"?"________________________________|
|
||||||
|
0021_2.3.5__U-00110000_=_f4_90_80_80_=_"?"________________________________|
|
||||||
|
0022_3__Malformed_sequences_______________________________________________|
|
||||||
|
0023_3.1__Unexpected_continuation_bytes___________________________________|
|
||||||
|
0024_3.1.1__First_continuation_byte_0x80:_"?"_____________________________|
|
||||||
|
0025_3.1.2__Last__continuation_byte_0xbf:_"?"_____________________________|
|
||||||
|
0026_3.1.3__2_continuation_bytes:_"??"____________________________________|
|
||||||
|
0027_3.1.4__3_continuation_bytes:_"???"___________________________________|
|
||||||
|
0028_3.1.5__4_continuation_bytes:_"????"__________________________________|
|
||||||
|
0029_3.1.6__5_continuation_bytes:_"?????"_________________________________|
|
||||||
|
0030_3.1.7__6_continuation_bytes:_"??????"________________________________|
|
||||||
|
0031_3.1.8__7_continuation_bytes:_"???????"_______________________________|
|
||||||
|
0032_3.1.9__Sequence_of_all_64_possible_continuation_bytes__0x80-0xbf_:___|
|
||||||
|
0033____"????????????????_________________________________________________|
|
||||||
|
0034_____????????????????_________________________________________________|
|
||||||
|
0035_____????????????????_________________________________________________|
|
||||||
|
0036_____????????????????"________________________________________________|
|
||||||
|
0037_3.2__Lonely_start_characters_________________________________________|
|
||||||
|
0038_3.2.1__All_32_first_bytes_of_2-byte_sequences__0xc0-0xdf_,___________|
|
||||||
|
0039________each_followed_by_a_space_character:___________________________|
|
||||||
|
0040____"?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?__________________________________|
|
||||||
|
0041_____?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_"________________________________|
|
||||||
|
0042_3.2.2__All_16_first_bytes_of_3-byte_sequences__0xe0-0xef_,___________|
|
||||||
|
0043________each_followed_by_a_space_character:___________________________|
|
||||||
|
0044____"?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_"________________________________|
|
||||||
|
0045_3.2.3__All_8_first_bytes_of_4-byte_sequences__0xf0-0xf7_,____________|
|
||||||
|
0046________each_followed_by_a_space_character:___________________________|
|
||||||
|
0047____"?_?_?_?_?_?_?_?_"________________________________________________|
|
||||||
|
0048_3.2.4__All_4_first_bytes_of_5-byte_sequences__0xf8-0xfb_,____________|
|
||||||
|
0049________each_followed_by_a_space_character:___________________________|
|
||||||
|
0050____"?_?_?_?_"________________________________________________________|
|
||||||
|
0051_3.2.5__All_2_first_bytes_of_6-byte_sequences__0xfc-0xfd_,____________|
|
||||||
|
0052________each_followed_by_a_space_character:___________________________|
|
||||||
|
0053____"?_?_"____________________________________________________________|
|
||||||
|
0054_3.3__Sequences_with_last_continuation_byte_missing___________________|
|
||||||
|
0055_3.3.1__2-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
|
||||||
|
0056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
|
||||||
|
0057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
|
||||||
|
0058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
|
||||||
|
0059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
|
||||||
|
0060_3.3.6__2-byte_sequence_with_last_byte_missing__U-000007FF_:_"?"______|
|
||||||
|
0061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"?"______|
|
||||||
|
0062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"?"______|
|
||||||
|
0063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"?"______|
|
||||||
|
0064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?"______|
|
||||||
|
0065_3.4__Concatenation_of_incomplete_sequences___________________________|
|
||||||
|
0066____"??????????"______________________________________________________|
|
||||||
|
0067_3.5__Impossible_bytes________________________________________________|
|
||||||
|
0068_3.5.1__fe_=_"?"______________________________________________________|
|
||||||
|
0069_3.5.2__ff_=_"?"______________________________________________________|
|
||||||
|
0070_3.5.3__fe_fe_ff_ff_=_"????"__________________________________________|
|
||||||
|
0071_4__Overlong_sequences________________________________________________|
|
||||||
|
0072_4.1__Examples_of_an_overlong_ASCII_character_________________________|
|
||||||
|
0073_4.1.1_U+002F_=_c0_af_____________=_"?"_______________________________|
|
||||||
|
0074_4.1.2_U+002F_=_e0_80_af__________=_"?"_______________________________|
|
||||||
|
0075_4.1.3_U+002F_=_f0_80_80_af_______=_"?"_______________________________|
|
||||||
|
0076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?"_______________________________|
|
||||||
|
0077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"?"_______________________________|
|
||||||
|
0078_4.2__Maximum_overlong_sequences______________________________________|
|
||||||
|
0079_4.2.1__U-0000007F_=_c1_bf_____________=_"?"__________________________|
|
||||||
|
0080_4.2.2__U-000007FF_=_e0_9f_bf__________=_"?"__________________________|
|
||||||
|
0081_4.2.3__U-0000FFFF_=_f0_8f_bf_bf_______=_"?"__________________________|
|
||||||
|
0082_4.2.4__U-001FFFFF_=_f8_87_bf_bf_bf____=_"?"__________________________|
|
||||||
|
0083_4.2.5__U-03FFFFFF_=_fc_83_bf_bf_bf_bf_=_"?"__________________________|
|
||||||
|
0084_4.3__Overlong_representation_of_the_NUL_character____________________|
|
||||||
|
0085_4.3.1__U+0000_=_c0_80_____________=_"?"______________________________|
|
||||||
|
0086_4.3.2__U+0000_=_e0_80_80__________=_"?"______________________________|
|
||||||
|
0087_4.3.3__U+0000_=_f0_80_80_80_______=_"?"______________________________|
|
||||||
|
0088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?"______________________________|
|
||||||
|
0089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"?"______________________________|
|
||||||
|
0090_5__Illegal_code_positions____________________________________________|
|
||||||
|
0091_5.1_Single_UTF-16_surrogates_________________________________________|
|
||||||
|
0092_5.1.1__U+D800_=_ed_a0_80_=_"?"_______________________________________|
|
||||||
|
0093_5.1.2__U+DB7F_=_ed_ad_bf_=_"?"_______________________________________|
|
||||||
|
0094_5.1.3__U+DB80_=_ed_ae_80_=_"?"_______________________________________|
|
||||||
|
0095_5.1.4__U+DBFF_=_ed_af_bf_=_"?"_______________________________________|
|
||||||
|
0096_5.1.5__U+DC00_=_ed_b0_80_=_"?"_______________________________________|
|
||||||
|
0097_5.1.6__U+DF80_=_ed_be_80_=_"?"_______________________________________|
|
||||||
|
0098_5.1.7__U+DFFF_=_ed_bf_bf_=_"?"_______________________________________|
|
||||||
|
0099_5.2_Paired_UTF-16_surrogates_________________________________________|
|
||||||
|
0100_5.2.1__U+D800_U+DC00_=_ed_a0_80_ed_b0_80_=_"??"______________________|
|
||||||
|
0101_5.2.2__U+D800_U+DFFF_=_ed_a0_80_ed_bf_bf_=_"??"______________________|
|
||||||
|
0102_5.2.3__U+DB7F_U+DC00_=_ed_ad_bf_ed_b0_80_=_"??"______________________|
|
||||||
|
0103_5.2.4__U+DB7F_U+DFFF_=_ed_ad_bf_ed_bf_bf_=_"??"______________________|
|
||||||
|
0104_5.2.5__U+DB80_U+DC00_=_ed_ae_80_ed_b0_80_=_"??"______________________|
|
||||||
|
0105_5.2.6__U+DB80_U+DFFF_=_ed_ae_80_ed_bf_bf_=_"??"______________________|
|
||||||
|
0106_5.2.7__U+DBFF_U+DC00_=_ed_af_bf_ed_b0_80_=_"??"______________________|
|
||||||
|
0107_5.2.8__U+DBFF_U+DFFF_=_ed_af_bf_ed_bf_bf_=_"??"______________________|
|
||||||
|
0108_5.3_Other_illegal_code_positions_____________________________________|
|
||||||
|
0109_5.3.1__U+FFFE_=_ef_bf_be_=_"?"_______________________________________|
|
||||||
|
0110_5.3.2__U+FFFF_=_ef_bf_bf_=_"?"_______________________________________|
|
||||||
|
' "" ""
|
||||||
|
|
||||||
# Clean up
|
# Clean up
|
||||||
rm -rf ls.testdir 2>/dev/null
|
rm -rf ls.testdir 2>/dev/null
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user