fix broken ansel input module by porting (updated) 2.2 version;minor fix to test_util_test.py
svn: r9305
This commit is contained in:
		
							
								
								
									
										71
									
								
								src/GrampsDbUtils/test/_GedcomChar_test.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										71
									
								
								src/GrampsDbUtils/test/_GedcomChar_test.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,71 @@ | ||||
| #!/usr/bin/env python | ||||
| import unittest | ||||
| import os, os.path | ||||
| import codecs | ||||
| import struct | ||||
|  | ||||
| from test import test_util as tu | ||||
| m = tu.msg | ||||
|  | ||||
| par = tu.path_append_parent() | ||||
| here = tu.absdir() | ||||
|  | ||||
| import _GedcomChar as G | ||||
|  | ||||
| cdir = tu.make_subdir("test_data") | ||||
|  | ||||
| # unicode block "latin1 supplement" chars | ||||
| utest_chars = "".join(map(unichr, range(0xA0,0x100))) + "\n" | ||||
|  | ||||
# 12 ansel test chars plus a terminating newline (raw 8-bit bytes, here)
# list(...) keeps the concatenation valid even where range() is not a list
atest_list = list(range(0xa1, 0xa7)) + list(range(0xb1, 0xb7)) + [0x0a]
# one unsigned byte ("B") per code; the count is derived from the list so
# the pack format cannot drift out of step with the test data
atest_bytes = struct.pack("B" * len(atest_list), *atest_list)
|  | ||||
| # unicode mappings of above (http://www.gymel.com/charsets/ANSEL.html) | ||||
| a2u = u"".join(map(unichr, ( | ||||
|     0x141, 0xd8, 0x110, 0xde, 0xc6, 0x152, | ||||
|     0x142, 0xf8, 0x111, 0xfe, 0xe6, 0x153, | ||||
|     0x0a, ))) | ||||
|  | ||||
def gen_chars(filename, encoding, chars=None):
    """Write generic test chars to the given file in the given encoding.

    The file is only created when it does not exist yet; an existing file
    is left untouched.

    filename -- path of the file to create
    encoding -- codec name passed to codecs.open (e.g. "latin-1")
    chars    -- unicode text to write; when omitted, the module-level
                utest_chars is used
    """
    if os.path.exists(filename):
        return
    if chars is None:
        chars = utest_chars
    out = codecs.open(filename, "wb", encoding)
    # close explicitly so the data is flushed to disk before any reader
    # opens the file (do not rely on garbage collection of the handle)
    try:
        out.write(chars)
    finally:
        out.close()
|      | ||||
class Test1_ansi(unittest.TestCase):
    """Read a latin-1 encoded fixture file back and compare with utest_chars."""
    enc = "latin-1"
    fil = os.path.join(cdir,enc)
    exp  = utest_chars

    def setUp(s):
        # make sure the fixture exists (gen_chars skips an existing file)
        gen_chars(s.fil, s.enc)

    def test1a_read_ansi(s):
        # binary mode: the file holds encoded bytes, so no newline
        # translation should be applied before the reader decodes them
        f = open(s.fil, "rb")
        try:
            got = G.AnsiReader(f).readline()
        finally:
            f.close()
        s.assertEqual(got, s.exp, m(got, s.exp, "AnsiReader"))

    def test1b_read_codec_latin1(s):
        # cross-check: the stock codec must decode the fixture identically
        f = codecs.open(s.fil, encoding=s.enc)
        try:
            got = f.read()
        finally:
            f.close()
        s.assertEqual(got, s.exp, m(got, s.exp, "using codec %s" % s.enc))
|  | ||||
class Test2_ansel(unittest.TestCase):
    """Read raw ANSEL bytes through G.AnselReader and compare with a2u."""
    enc = "ansel"
    afil = os.path.join(cdir,enc)
    exp  = a2u

    def setUp(s):
        # (re)write the raw ANSEL fixture; close the handle so the bytes
        # are on disk before the test opens the file for reading
        out = open(s.afil, "wb")
        try:
            out.write(atest_bytes)
        finally:
            out.close()

    def test2a_read_ansel(s):
        # binary mode: the fixture is raw 8-bit data written with "wb"
        f = open(s.afil, "rb")
        try:
            got = G.AnselReader(f).readline()
        finally:
            f.close()
        s.assertEqual(got, s.exp, m(got, s.exp, "AnselReader"))
|         | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     unittest.main() | ||||
|  | ||||
| #===eof=== | ||||
| @@ -20,333 +20,377 @@ | ||||
|  | ||||
| # $Id$ | ||||
|  | ||||
| """ | ||||
| Handles ANSEL/Unicode conversions | ||||
| """ | ||||
| # ANSEL references: | ||||
| #  http://lcweb2.loc.gov/diglib/codetables/45.html | ||||
| #  http://www.gymel.com/charsets/ANSEL.html | ||||
|  | ||||
| # Note that cStringIO stores 8-bit strings (bytes not unicode) | ||||
| # utf8 will do, since that looks just like bytes  | ||||
| import cStringIO | ||||
|  | ||||
| ONEBYTE = { | ||||
|     '\x8D' : u'\x20\x0D', '\x8E' : u'\x20\x0C', '\xA1' : u'\x01\x41', | ||||
|     '\xA2' : u'\xD8',     '\xA3' : u'\xD0',     '\xA4' : u'\xDE', | ||||
|     '\xA5' : u'\xC6',     '\xA6' : u'\x01\x52', '\xA7' : u'\x02\xB9', | ||||
|     '\xA8' : u'\xB7',     '\xA9' : u'\x26\x6D', '\xAA' : u'\xAE', | ||||
|     '\xAB' : u'\xB1',     '\xAC' : u'\x01\xA0', '\xAD' : u'\x01\xAF', | ||||
|     '\xAE' : u'\x02\xBE', '\xB0' : u'\x02\xBF', '\xB1' : u'\x01\x42', | ||||
|     '\xB2' : u'\xF8',     '\xB3' : u'\x01\x11', '\xB4' : u'\xFE', | ||||
|     '\xB5' : u'\xE6',     '\xB6' : u'\x01\x53', '\xB7' : u'\x02\xBA', | ||||
|     '\xB8' : u'\x01\x31', '\xB9' : u'\xA3',     '\xBA' : u'\xF0', | ||||
|     '\xBC' : u'\x01\xA1', '\xBD' : u'\x01\xB0', '\xC0' : u'\xB0', | ||||
|     '\xC1' : u'\x21\x13', '\xC2' : u'\x21\x17', '\xC3' : u'\xA9', | ||||
|     '\xC4' : u'\x26\x6F', '\xC5' : u'\xBF',     '\xC6' : u'\xA1', | ||||
|     '\xCF' : u'\xDF',     '\xE0' : u'\x03\x09', '\xE1' : u'\x03', | ||||
|     '\xE2' : u'\x03\x01', '\xE3' : u'\x03\x02', '\xE4' : u'\x03\x03', | ||||
|     '\xE5' : u'\x03\x04', '\xE6' : u'\x03\x06', '\xE7' : u'\x03\x07', | ||||
|     '\xE9' : u'\x03\x0C', '\xEA' : u'\x03\x0A', '\xEB' : u'\xFE\x20', | ||||
|     '\xEC' : u'\xFE\x21', '\xED' : u'\x03\x15', '\xEE' : u'\x03\x0B', | ||||
|     '\xEF' : u'\x03\x10', '\xF0' : u'\x03\x27', '\xF1' : u'\x03\x28', | ||||
|     '\xF2' : u'\x03\x23', '\xF3' : u'\x03\x24', '\xF4' : u'\x03\x25', | ||||
|     '\xF5' : u'\x03\x33', '\xF6' : u'\x03\x32', '\xF7' : u'\x03\x26', | ||||
|     '\xF8' : u'\x03\x1C', '\xF9' : u'\x03\x2E', '\xFA' : u'\xFE\x22', | ||||
|     '\xFB' : u'\xFE\x23', '\xFE' : u'\x03\x13', | ||||
| # list of ANSEL codes that replicate ASCII | ||||
| # note that DEL (127=0x7F) is a control char | ||||
| # Note: spec allows control-chars that Gramps probably doesn't use | ||||
| #  but 10=0x0A _is_ needed (!) | ||||
| # --- | ||||
| # Also: there are two additional control chars 0x98,0x9c (unicode same) | ||||
| #  which we also ignore for now (start/end of string (or sort sequence)) | ||||
| # --- | ||||
| # TODO: should we allow TAB, as a Gramps extension? | ||||
| _printable_ascii = map(chr, range(32,127)) # note: up thru 126 | ||||
| _use_ASCII = map(chr, [10, 27, 29 ,30, 31]) + _printable_ascii | ||||
|  | ||||
| # mappings of single byte ANSEL codes to unicode | ||||
| _onebyte = { | ||||
|      '\xA1' : u'\u0141',   '\xA2' : u'\u00d8',   '\xA3' : u'\u0110',    | ||||
|      '\xA4' : u'\u00de',   '\xA5' : u'\u00c6',   '\xA6' : u'\u0152',    | ||||
|      '\xA7' : u'\u02b9',   '\xA8' : u'\u00b7',   '\xA9' : u'\u266d',    | ||||
|      '\xAA' : u'\u00ae',   '\xAB' : u'\u00b1',   '\xAC' : u'\u01a0',    | ||||
|      '\xAD' : u'\u01af',   '\xAE' : u'\u02bc',   '\xB0' : u'\u02bb',    | ||||
|      '\xB1' : u'\u0142',   '\xB2' : u'\u00f8',   '\xB3' : u'\u0111',    | ||||
|      '\xB4' : u'\u00fe',   '\xB5' : u'\u00e6',   '\xB6' : u'\u0153',    | ||||
|      '\xB7' : u'\u02ba',   '\xB8' : u'\u0131',   '\xB9' : u'\u00a3',    | ||||
|      '\xBA' : u'\u00f0',   '\xBC' : u'\u01a1',   '\xBD' : u'\u01b0',    | ||||
|      '\xC0' : u'\u00b0',   '\xC1' : u'\u2113',   '\xC2' : u'\u2117',    | ||||
|      '\xC3' : u'\u00a9',   '\xC4' : u'\u266f',   '\xC5' : u'\u00bf',    | ||||
|      '\xC6' : u'\u00a1',   '\xC7' : u'\u00df',   '\xC8' : u'\u20ac',   | ||||
|     } | ||||
|  | ||||
| TWOBYTE = { | ||||
|     '\xe1a' : u'\xe0',     '\xf2T' : u'\x1el',    '\xf2V' : u'\x1e~', | ||||
|     '\xe9z' : u'\x01~',    '\xe1e' : u'\xe8',     '\xeau' : u'\x01o', | ||||
|     '\xf2S' : u'\x1eb',    '\xe1i' : u'\xec',     '\xe9s' : u'\x01a', | ||||
|     '\xe9r' : u'\x01Y',    '\xe9u' : u'\x01\xd4', '\xe9t' : u'\x01e', | ||||
|     '\xe1o' : u'\xf2',     '\xe9i' : u'\x01\xd0', '\xf2E' : u'\x1e\xb8', | ||||
|     '\xe9k' : u'\x01\xe9', '\xe9j' : u'\x01\xf0', '\xe1u' : u'\xf9', | ||||
|     '\xe9l' : u'\x01>',    '\xe1w' : u'\x1e\x81', '\xe9n' : u'\x01H', | ||||
|     '\xe1y' : u'\x1e\xf3', '\xf2M' : u'\x1eB',    '\xe9c' : u'\x01\r', | ||||
|     '\xf2O' : u'\x1e\xcc', '\xe9e' : u'\x01\x1b', '\xe9d' : u'\x01\x0f', | ||||
|     '\xe9g' : u'\x01\xe7', '\xeaA' : u'\xc5',     '\xe1A' : u'\xc0', | ||||
|     '\xf2u' : u'\x1e\xe5', '\xf2v' : u'\x1e\x7f', '\xe9Z' : u'\x01}', | ||||
|     '\xe1E' : u'\xc8',     '\xf2R' : u'\x1eZ',    '\xf2r' : u'\x1e[', | ||||
|     '\xf0c' : u'\xe7',     '\xe1I' : u'\xcc',     '\xe9S' : u'\x01`', | ||||
|     '\xe9R' : u'\x01X',    '\xe9U' : u'\x01\xd3', '\xe9T' : u'\x01d', | ||||
|     '\xe1O' : u'\xd2',     '\xeaa' : u'\xe5',     '\xe9I' : u'\x01\xcf', | ||||
|     '\xf2e' : u'\x1e\xb9', '\xe9K' : u'\x01\xe8', '\xe1U' : u'\xd9', | ||||
|     '\xe9L' : u'\x01=',    '\xe1W' : u'\x1e\x80', '\xe9N' : u'\x01G', | ||||
|     '\xe1Y' : u'\x1e\xf2', '\xf0h' : u'\x1e)',    '\xe9C' : u'\x01\x0c', | ||||
|     '\xf2o' : u'\x1e\xcd', '\xe9E' : u'\x01\x1a', '\xe9D' : u'\x01\x0e', | ||||
|     '\xe9G' : u'\x01\xe6', '\xf0g' : u'\x01#',    '\xe2e' : u'\xe9', | ||||
|     '\xe2g' : u'\x01\xf5', '\xe2a' : u'\xe1',     '\xe2c' : u'\x01\x07', | ||||
|     '\xe2l' : u'\x01:',    '\xe2m' : u'\x1e?',    '\xe2n' : u'\x01D', | ||||
|     '\xe2o' : u'\xf3',     '\xe2i' : u'\xed',     '\xf3U' : u'\x1er', | ||||
|     '\xe2k' : u'\x1e1',    '\xe2u' : u'\xfa',     '\xe2w' : u'\x1e\x83', | ||||
|     '\xe2p' : u'\x1eU',    '\xf2Z' : u'\x1e\x92', '\xe2r' : u'\x01U', | ||||
|     '\xe2s' : u'\x01[',    '\xe2y' : u'\xfd',     '\xe2z' : u'\x01z', | ||||
|     '\xe2E' : u'\xc9',     '\xe2G' : u'\x01\xf4', '\xe2A' : u'\xc1', | ||||
|     '\xe2C' : u'\x01\x06', '\xe2L' : u'\x019',    '\xe2M' : u'\x1e>', | ||||
|     '\xe2N' : u'\x01C',    '\xe2O' : u'\xd3',     '\xe2I' : u'\xcd', | ||||
|     '\xf3u' : u'\x1es',    '\xe2K' : u'\x1e0',    '\xe2U' : u'\xda', | ||||
|     '\xe2W' : u'\x1e\x82', '\xe2P' : u'\x1eT',    '\xf0l' : u'\x01<', | ||||
|     '\xe2R' : u'\x01T',    '\xe2S' : u'\x01Z',    '\xea\xad' : u'\x01n', | ||||
|     '\xf0k' : u'\x017',    '\xe2Y' : u'\xdd',     '\xe2Z' : u'\x01y', | ||||
|     '\xf2A' : u'\x1e\xa0', '\xe7w' : u'\x1e\x87', '\xe2\xa5' : u'\x01\xfc', | ||||
|     '\xe7t' : u'\x1ek',    '\xe7s' : u'\x1ea',    '\xe7r' : u'\x1eY', | ||||
|     '\xf0G' : u'\x01"',    '\xe7p' : u'\x1eW',    '\xf2k' : u'\x1e3', | ||||
|     '\xf2I' : u'\x1e\xca', '\xe7z' : u'\x01|',    '\xe7y' : u'\x1e\x8f', | ||||
|     '\xe7x' : u'\x1e\x8b', '\xe7g' : u'\x01!',    '\xe2\xb5' : u'\x01\xfd', | ||||
|     '\xe7e' : u'\x01\x17', '\xe7d' : u'\x1e\x0b', '\xe7c' : u'\x01\x0b', | ||||
|     '\xe7b' : u'\x1e\x03', '\xf0D' : u'\x1e\x10', '\xe7n' : u'\x1eE', | ||||
|     '\xe7m' : u'\x1eA',    '\xf0N' : u'\x01E',    '\xf2N' : u'\x1eF', | ||||
|     '\xf0L' : u'\x01;',    '\xe7h' : u'\x1e#',    '\xe7W' : u'\x1e\x86', | ||||
|     '\xf0s' : u'\x01_',    '\xe7T' : u'\x1ej',    '\xe7S' : u'\x1e`', | ||||
|     '\xe7R' : u'\x1eX',    '\xf0t' : u'\x01c',    '\xe7P' : u'\x1eV', | ||||
|     '\xf2H' : u'\x1e$',    '\xf2Y' : u'\x1e\xf4', '\xe7Z' : u'\x01{', | ||||
|     '\xe7Y' : u'\x1e\x8e', '\xe7X' : u'\x1e\x8a', '\xe7G' : u'\x01 ', | ||||
|     '\xe7F' : u'\x1e\x1e', '\xe7E' : u'\x01\x16', '\xe7D' : u'\x1e\n', | ||||
|     '\xe7C' : u'\x01\n',   '\xe7B' : u'\x1e\x02', '\xf0d' : u'\x1e\x11', | ||||
|     '\xe7N' : u'\x1eD',    '\xe7M' : u'\x1e@',    '\xf2K' : u'\x1e2', | ||||
|     '\xf0n' : u'\x01F',    '\xe7I' : u'\x010',    '\xe7H' : u'\x1e"', | ||||
|     '\xf2t' : u'\x1em',    '\xe8x' : u'\x1e\x8d', '\xe0a' : u'\x1e\xa3', | ||||
|     '\xf1U' : u'\x01r',    '\xe0e' : u'\x1e\xbb', '\xe0i' : u'\x1e\xc9', | ||||
|     '\xf9h' : u'\x1e+',    '\xe0o' : u'\x1e\xcf', '\xe8t' : u'\x1e\x97', | ||||
|     '\xe8u' : u'\xfc',     '\xf1A' : u'\x01\x04', '\xe8h' : u"\x1e'", | ||||
|     '\xe8i' : u'\xef',     '\xf1E' : u'\x01\x18', '\xe8o' : u'\xf6', | ||||
|     '\xe0u' : u'\x1e\xe7', '\xf1I' : u'\x01.',    '\xe0y' : u'\x1e\xf7', | ||||
|     '\xf1O' : u'\x01\xea', '\xe8e' : u'\xeb',     '\xf9H' : u'\x1e*', | ||||
|     '\xe8X' : u'\x1e\x8c', '\xe0A' : u'\x1e\xa2', '\xf1u' : u'\x01s', | ||||
|     '\xe0E' : u'\x1e\xba', '\xe0I' : u'\x1e\xc8', '\xe0O' : u'\x1e\xce', | ||||
|     '\xe8U' : u'\xdc',     '\xf1a' : u'\x01\x05', '\xe8H' : u'\x1e&', | ||||
|     '\xe8I' : u'\xcf',     '\xf1e' : u'\x01\x19', '\xe8O' : u'\xd6', | ||||
|     '\xe0U' : u'\x1e\xe6', '\xf1i' : u'\x01/',    '\xe0Y' : u'\x1e\xf6', | ||||
|     '\xf0r' : u'\x01W',    '\xf1o' : u'\x01\xeb', '\xe8E' : u'\xcb', | ||||
|     '\xf0R' : u'\x01V',    '\xe5o' : u'\x01M',    '\xe5i' : u'\x01+', | ||||
|     '\xf2D' : u'\x1e\x0c', '\xeeO' : u'\x01P',    '\xe5e' : u'\x01\x13', | ||||
|     '\xe5g' : u'\x1e!',    '\xe5a' : u'\x01\x01', '\xf2y' : u'\x1e\xf5', | ||||
|     '\xe8w' : u'\x1e\x85', '\xf2z' : u'\x1e\x93', '\xe5\xb5' : u'\x01\xe3',   | ||||
|     '\xe5u' : u'\x01k',    '\xeeU' : u'\x01p',    '\xf2d' : u'\x1e\r', | ||||
|     '\xe5O' : u'\x01L',    '\xe8a' : u'\xe4',     '\xe5I' : u'\x01*', | ||||
|     '\xf0T' : u'\x01b',    '\xeeo' : u'\x01Q',    '\xe5E' : u'\x01\x12', | ||||
|     '\xe5G' : u'\x1e ',    '\xe5A' : u'\x01',     '\xf2l' : u'\x1e7', | ||||
|     '\xf0C' : u'\xc7',     '\xf0S' : u'\x01^',    '\xe5U' : u'\x01j', | ||||
|     '\xf2B' : u'\x1e\x04', '\xeeu' : u'\x01q',    '\xf2a' : u'\x1e\xa1', | ||||
|     '\xf2w' : u'\x1e\x89', '\xf2U' : u'\x1e\xe4', '\xe6u' : u'\x01m', | ||||
|     '\xe6a' : u'\x01\x03', '\xe8Y' : u'\x01x',    '\xe6e' : u'\x01\x15', | ||||
|     '\xe6g' : u'\x01\x1f', '\xe6i' : u'\x01-',    '\xf2n' : u'\x1eG', | ||||
|     '\xe6o' : u'\x01O',    '\xe6U' : u'\x01l',    '\xe7f' : u'\x1e\x1f', | ||||
|     '\xf2h' : u'\x1e%',    '\xf2i' : u'\x1e\xcb', '\xe6A' : u'\x01\x02', | ||||
|     '\xe6E' : u'\x01\x14', '\xe6G' : u'\x01\x1e', '\xe6I' : u'\x01,', | ||||
|     '\xe9O' : u'\x01\xd1', '\xe6O' : u'\x01N',    '\xf2W' : u'\x1e\x88', | ||||
|     '\xe3j' : u'\x015',    '\xe3i' : u'\xee',     '\xe3h' : u'\x01%', | ||||
|     '\xe3o' : u'\xf4',     '\xe3c' : u'\x01\t',   '\xe3a' : u'\xe2', | ||||
|     '\xe3g' : u'\x01\x1d', '\xe3e' : u'\xea',     '\xe8W' : u'\x1e\x84', | ||||
|     '\xe3z' : u'\x1e\x91', '\xe3y' : u'\x01w',    '\xf0K' : u'\x016', | ||||
|     '\xe3s' : u'\x01]',    '\xe3w' : u'\x01u',    '\xf0H' : u'\x1e(', | ||||
|     '\xe3u' : u'\xfb',     '\xeay' : u'\x1e\x99', '\xe3J' : u'\x014', | ||||
|     '\xe3I' : u'\xce',     '\xe3H' : u'\x01$',    '\xe3O' : u'\xd4', | ||||
|     '\xe3C' : u'\x01\x08', '\xe3A' : u'\xc2',     '\xe3G' : u'\x01\x1c', | ||||
|     '\xf0 ' : u'\xb8',     '\xe3E' : u'\xca',     '\xe3Z' : u'\x1e\x90', | ||||
|     '\xe3Y' : u'\x01v',    '\xe9A' : u'\x01\xcd', '\xe3S' : u'\x01\\', | ||||
|     '\xf2s' : u'\x1ec',    '\xe9o' : u'\x01\xd2', '\xf4A' : u'\x1e', | ||||
|     '\xe3W' : u'\x01t',    '\xe3U' : u'\xdb',     '\xf4a' : u'\x1e\x01', | ||||
|     '\xe4n' : u'\xf1',     '\xe4o' : u'\xf5',     '\xeaw' : u'\x1e\x98', | ||||
|     '\xe4i' : u'\x01)',    '\xf2b' : u'\x1e\x05', '\xe5\xa5' : u'\x01\xe2',  | ||||
|     '\xe4e' : u'\x1e\xbd', '\xf2L' : u'\x1e6',    '\xe4a' : u'\xe3', | ||||
|     '\xf2m' : u'\x1eC',    '\xe4y' : u'\x1e\xf9', '\xe4v' : u'\x1e}', | ||||
|     '\xe4u' : u'\x01i',    '\xe4N' : u'\xd1',     '\xe4O' : u'\xd5', | ||||
|     '\xe8A' : u'\xc4',     '\xe8y' : u'\xff',     '\xe4I' : u'\x01(', | ||||
|     '\xe4E' : u'\x1e\xbc', '\xe4A' : u'\xc3',     '\xe9a' : u'\x01\xce', | ||||
|     '\xe4Y' : u'\x1e\xf8', '\xe4V' : u'\x1e|',    '\xe4U' : u'\x01h', | ||||
| # combining forms (in ANSEL, they precede the modified ASCII character, | ||||
| # whereas the unicode combining term follows the character modified). | ||||
| # Note: unicode allows multiple modifiers, but ANSEL may not (TBD?), | ||||
| # so we ignore multiple combining forms in this module | ||||
| #  8d & 8e are zero-width joiner (ZWJ) and zero-width non-joiner (ZWNJ) | ||||
| #  (strange things) probably not commonly found in our needs, unless one | ||||
| #   starts writing persian (or???) poetry in ANSEL | ||||
| _acombiners = { | ||||
|      '\x8D' : u'\u200d',   '\x8E' : u'\u200c',   '\xE0' : u'\u0309',    | ||||
|      '\xE1' : u'\u0300',   '\xE2' : u'\u0301',   '\xE3' : u'\u0302',    | ||||
|      '\xE4' : u'\u0303',   '\xE5' : u'\u0304',   '\xE6' : u'\u0306',    | ||||
|      '\xE7' : u'\u0307',   '\xE8' : u'\u0308',   '\xE9' : u'\u030c',    | ||||
|      '\xEA' : u'\u030a',   '\xEB' : u'\ufe20',   '\xEC' : u'\ufe21',    | ||||
|      '\xED' : u'\u0315',   '\xEE' : u'\u030b',   '\xEF' : u'\u0310',    | ||||
|      '\xF0' : u'\u0327',   '\xF1' : u'\u0328',   '\xF2' : u'\u0323',    | ||||
|      '\xF3' : u'\u0324',   '\xF4' : u'\u0325',   '\xF5' : u'\u0333',    | ||||
|      '\xF6' : u'\u0332',   '\xF7' : u'\u0326',   '\xF8' : u'\u031c',    | ||||
|      '\xF9' : u'\u032e',   '\xFA' : u'\ufe22',   '\xFB' : u'\ufe23',    | ||||
|      '\xFE' : u'\u0313',   | ||||
|    } | ||||
|  | ||||
| # mappings of two byte (precomposed forms) ANSEL codes to unicode | ||||
| _twobyte = { | ||||
|      '\xE0\x41' : u'\u1ea2',   '\xE0\x45' : u'\u1eba',   '\xE0\x49' : u'\u1ec8',    | ||||
|      '\xE0\x4F' : u'\u1ece',   '\xE0\x55' : u'\u1ee6',   '\xE0\x59' : u'\u1ef6',    | ||||
|      '\xE0\x61' : u'\u1ea3',   '\xE0\x65' : u'\u1ebb',   '\xE0\x69' : u'\u1ec9',    | ||||
|      '\xE0\x6F' : u'\u1ecf',   '\xE0\x75' : u'\u1ee7',   '\xE0\x79' : u'\u1ef7',    | ||||
|      '\xE1\x41' : u'\u00c0',   '\xE1\x45' : u'\u00c8',   '\xE1\x49' : u'\u00cc',    | ||||
|      '\xE1\x4F' : u'\u00d2',   '\xE1\x55' : u'\u00d9',   '\xE1\x57' : u'\u1e80',    | ||||
|      '\xE1\x59' : u'\u1ef2',   '\xE1\x61' : u'\u00e0',   '\xE1\x65' : u'\u00e8',    | ||||
|      '\xE1\x69' : u'\u00ec',   '\xE1\x6F' : u'\u00f2',   '\xE1\x75' : u'\u00f9',    | ||||
|      '\xE1\x77' : u'\u1e81',   '\xE1\x79' : u'\u1ef3',   '\xE2\x41' : u'\u00c1',    | ||||
|      '\xE2\x43' : u'\u0106',   '\xE2\x45' : u'\u00c9',   '\xE2\x47' : u'\u01f4',    | ||||
|      '\xE2\x49' : u'\u00cd',   '\xE2\x4B' : u'\u1e30',   '\xE2\x4C' : u'\u0139',    | ||||
|      '\xE2\x4D' : u'\u1e3e',   '\xE2\x4E' : u'\u0143',   '\xE2\x4F' : u'\u00d3',    | ||||
|      '\xE2\x50' : u'\u1e54',   '\xE2\x52' : u'\u0154',   '\xE2\x53' : u'\u015a',    | ||||
|      '\xE2\x55' : u'\u00da',   '\xE2\x57' : u'\u1e82',   '\xE2\x59' : u'\u00dd',    | ||||
|      '\xE2\x5A' : u'\u0179',   '\xE2\x61' : u'\u00e1',   '\xE2\x63' : u'\u0107',    | ||||
|      '\xE2\x65' : u'\u00e9',   '\xE2\x67' : u'\u01f5',   '\xE2\x69' : u'\u00ed',    | ||||
|      '\xE2\x6B' : u'\u1e31',   '\xE2\x6C' : u'\u013a',   '\xE2\x6D' : u'\u1e3f',    | ||||
|      '\xE2\x6E' : u'\u0144',   '\xE2\x6F' : u'\u00f3',   '\xE2\x70' : u'\u1e55',    | ||||
|      '\xE2\x72' : u'\u0155',   '\xE2\x73' : u'\u015b',   '\xE2\x75' : u'\u00fa',    | ||||
|      '\xE2\x77' : u'\u1e83',   '\xE2\x79' : u'\u00fd',   '\xE2\x7A' : u'\u017a',    | ||||
|      '\xE2\xA5' : u'\u01fc',   '\xE2\xB5' : u'\u01fd',   '\xE3\x41' : u'\u00c2',    | ||||
|      '\xE3\x43' : u'\u0108',   '\xE3\x45' : u'\u00ca',   '\xE3\x47' : u'\u011c',    | ||||
|      '\xE3\x48' : u'\u0124',   '\xE3\x49' : u'\u00ce',   '\xE3\x4A' : u'\u0134',    | ||||
|      '\xE3\x4F' : u'\u00d4',   '\xE3\x53' : u'\u015c',   '\xE3\x55' : u'\u00db',    | ||||
|      '\xE3\x57' : u'\u0174',   '\xE3\x59' : u'\u0176',   '\xE3\x5A' : u'\u1e90',    | ||||
|      '\xE3\x61' : u'\u00e2',   '\xE3\x63' : u'\u0109',   '\xE3\x65' : u'\u00ea',    | ||||
|      '\xE3\x67' : u'\u011d',   '\xE3\x68' : u'\u0125',   '\xE3\x69' : u'\u00ee',    | ||||
|      '\xE3\x6A' : u'\u0135',   '\xE3\x6F' : u'\u00f4',   '\xE3\x73' : u'\u015d',    | ||||
|      '\xE3\x75' : u'\u00fb',   '\xE3\x77' : u'\u0175',   '\xE3\x79' : u'\u0177',    | ||||
|      '\xE3\x7A' : u'\u1e91',   '\xE4\x41' : u'\u00c3',   '\xE4\x45' : u'\u1ebc',    | ||||
|      '\xE4\x49' : u'\u0128',   '\xE4\x4E' : u'\u00d1',   '\xE4\x4F' : u'\u00d5',    | ||||
|      '\xE4\x55' : u'\u0168',   '\xE4\x56' : u'\u1e7c',   '\xE4\x59' : u'\u1ef8',    | ||||
|      '\xE4\x61' : u'\u00e3',   '\xE4\x65' : u'\u1ebd',   '\xE4\x69' : u'\u0129',    | ||||
|      '\xE4\x6E' : u'\u00f1',   '\xE4\x6F' : u'\u00f5',   '\xE4\x75' : u'\u0169',    | ||||
|      '\xE4\x76' : u'\u1e7d',   '\xE4\x79' : u'\u1ef9',   '\xE5\x41' : u'\u0100',    | ||||
|      '\xE5\x45' : u'\u0112',   '\xE5\x47' : u'\u1e20',   '\xE5\x49' : u'\u012a',    | ||||
|      '\xE5\x4F' : u'\u014c',   '\xE5\x55' : u'\u016a',   '\xE5\x61' : u'\u0101',    | ||||
|      '\xE5\x65' : u'\u0113',   '\xE5\x67' : u'\u1e21',   '\xE5\x69' : u'\u012b',    | ||||
|      '\xE5\x6F' : u'\u014d',   '\xE5\x75' : u'\u016b',   '\xE5\xA5' : u'\u01e2',    | ||||
|      '\xE5\xB5' : u'\u01e3',   '\xE6\x41' : u'\u0102',   '\xE6\x45' : u'\u0114',    | ||||
|      '\xE6\x47' : u'\u011e',   '\xE6\x49' : u'\u012c',   '\xE6\x4F' : u'\u014e',    | ||||
|      '\xE6\x55' : u'\u016c',   '\xE6\x61' : u'\u0103',   '\xE6\x65' : u'\u0115',    | ||||
|      '\xE6\x67' : u'\u011f',   '\xE6\x69' : u'\u012d',   '\xE6\x6F' : u'\u014f',    | ||||
|      '\xE6\x75' : u'\u016d',   '\xE7\x42' : u'\u1e02',   '\xE7\x43' : u'\u010a',    | ||||
|      '\xE7\x44' : u'\u1e0a',   '\xE7\x45' : u'\u0116',   '\xE7\x46' : u'\u1e1e',    | ||||
|      '\xE7\x47' : u'\u0120',   '\xE7\x48' : u'\u1e22',   '\xE7\x49' : u'\u0130',    | ||||
|      '\xE7\x4D' : u'\u1e40',   '\xE7\x4E' : u'\u1e44',   '\xE7\x50' : u'\u1e56',    | ||||
|      '\xE7\x52' : u'\u1e58',   '\xE7\x53' : u'\u1e60',   '\xE7\x54' : u'\u1e6a',    | ||||
|      '\xE7\x57' : u'\u1e86',   '\xE7\x58' : u'\u1e8a',   '\xE7\x59' : u'\u1e8e',    | ||||
|      '\xE7\x5A' : u'\u017b',   '\xE7\x62' : u'\u1e03',   '\xE7\x63' : u'\u010b',    | ||||
|      '\xE7\x64' : u'\u1e0b',   '\xE7\x65' : u'\u0117',   '\xE7\x66' : u'\u1e1f',    | ||||
|      '\xE7\x67' : u'\u0121',   '\xE7\x68' : u'\u1e23',   '\xE7\x6D' : u'\u1e41',    | ||||
|      '\xE7\x6E' : u'\u1e45',   '\xE7\x70' : u'\u1e57',   '\xE7\x72' : u'\u1e59',    | ||||
|      '\xE7\x73' : u'\u1e61',   '\xE7\x74' : u'\u1e6b',   '\xE7\x77' : u'\u1e87',    | ||||
|      '\xE7\x78' : u'\u1e8b',   '\xE7\x79' : u'\u1e8f',   '\xE7\x7A' : u'\u017c',    | ||||
|      '\xE8\x41' : u'\u00c4',   '\xE8\x45' : u'\u00cb',   '\xE8\x48' : u'\u1e26',    | ||||
|      '\xE8\x49' : u'\u00cf',   '\xE8\x4F' : u'\u00d6',   '\xE8\x55' : u'\u00dc',    | ||||
|      '\xE8\x57' : u'\u1e84',   '\xE8\x58' : u'\u1e8c',   '\xE8\x59' : u'\u0178',    | ||||
|      '\xE8\x61' : u'\u00e4',   '\xE8\x65' : u'\u00eb',   '\xE8\x68' : u'\u1e27',    | ||||
|      '\xE8\x69' : u'\u00ef',   '\xE8\x6F' : u'\u00f6',   '\xE8\x74' : u'\u1e97',    | ||||
|      '\xE8\x75' : u'\u00fc',   '\xE8\x77' : u'\u1e85',   '\xE8\x78' : u'\u1e8d',    | ||||
|      '\xE8\x79' : u'\u00ff',   '\xE9\x41' : u'\u01cd',   '\xE9\x43' : u'\u010c',    | ||||
|      '\xE9\x44' : u'\u010e',   '\xE9\x45' : u'\u011a',   '\xE9\x47' : u'\u01e6',    | ||||
|      '\xE9\x49' : u'\u01cf',   '\xE9\x4B' : u'\u01e8',   '\xE9\x4C' : u'\u013d',    | ||||
|      '\xE9\x4E' : u'\u0147',   '\xE9\x4F' : u'\u01d1',   '\xE9\x52' : u'\u0158',    | ||||
|      '\xE9\x53' : u'\u0160',   '\xE9\x54' : u'\u0164',   '\xE9\x55' : u'\u01d3',    | ||||
|      '\xE9\x5A' : u'\u017d',   '\xE9\x61' : u'\u01ce',   '\xE9\x63' : u'\u010d',    | ||||
|      '\xE9\x64' : u'\u010f',   '\xE9\x65' : u'\u011b',   '\xE9\x67' : u'\u01e7',    | ||||
|      '\xE9\x69' : u'\u01d0',   '\xE9\x6A' : u'\u01f0',   '\xE9\x6B' : u'\u01e9',    | ||||
|      '\xE9\x6C' : u'\u013e',   '\xE9\x6E' : u'\u0148',   '\xE9\x6F' : u'\u01d2',    | ||||
|      '\xE9\x72' : u'\u0159',   '\xE9\x73' : u'\u0161',   '\xE9\x74' : u'\u0165',    | ||||
|      '\xE9\x75' : u'\u01d4',   '\xE9\x7A' : u'\u017e',   '\xEA\x41' : u'\u00c5',    | ||||
|      '\xEA\x61' : u'\u00e5',   '\xEA\x75' : u'\u016f',   '\xEA\x77' : u'\u1e98',    | ||||
|      '\xEA\x79' : u'\u1e99',   '\xEA\xAD' : u'\u016e',   '\xEE\x4F' : u'\u0150',    | ||||
|      '\xEE\x55' : u'\u0170',   '\xEE\x6F' : u'\u0151',   '\xEE\x75' : u'\u0171',    | ||||
|      '\xF0\x20' : u'\u00b8',   '\xF0\x43' : u'\u00c7',   '\xF0\x44' : u'\u1e10',    | ||||
|      '\xF0\x47' : u'\u0122',   '\xF0\x48' : u'\u1e28',   '\xF0\x4B' : u'\u0136',    | ||||
|      '\xF0\x4C' : u'\u013b',   '\xF0\x4E' : u'\u0145',   '\xF0\x52' : u'\u0156',    | ||||
|      '\xF0\x53' : u'\u015e',   '\xF0\x54' : u'\u0162',   '\xF0\x63' : u'\u00e7',    | ||||
|      '\xF0\x64' : u'\u1e11',   '\xF0\x67' : u'\u0123',   '\xF0\x68' : u'\u1e29',    | ||||
|      '\xF0\x6B' : u'\u0137',   '\xF0\x6C' : u'\u013c',   '\xF0\x6E' : u'\u0146',    | ||||
|      '\xF0\x72' : u'\u0157',   '\xF0\x73' : u'\u015f',   '\xF0\x74' : u'\u0163',    | ||||
|      '\xF1\x41' : u'\u0104',   '\xF1\x45' : u'\u0118',   '\xF1\x49' : u'\u012e',    | ||||
|      '\xF1\x4F' : u'\u01ea',   '\xF1\x55' : u'\u0172',   '\xF1\x61' : u'\u0105',    | ||||
|      '\xF1\x65' : u'\u0119',   '\xF1\x69' : u'\u012f',   '\xF1\x6F' : u'\u01eb',    | ||||
|      '\xF1\x75' : u'\u0173',   '\xF2\x41' : u'\u1ea0',   '\xF2\x42' : u'\u1e04',    | ||||
|      '\xF2\x44' : u'\u1e0c',   '\xF2\x45' : u'\u1eb8',   '\xF2\x48' : u'\u1e24',    | ||||
|      '\xF2\x49' : u'\u1eca',   '\xF2\x4B' : u'\u1e32',   '\xF2\x4C' : u'\u1e36',    | ||||
|      '\xF2\x4D' : u'\u1e42',   '\xF2\x4E' : u'\u1e46',   '\xF2\x4F' : u'\u1ecc',    | ||||
|      '\xF2\x52' : u'\u1e5a',   '\xF2\x53' : u'\u1e62',   '\xF2\x54' : u'\u1e6c',    | ||||
|      '\xF2\x55' : u'\u1ee4',   '\xF2\x56' : u'\u1e7e',   '\xF2\x57' : u'\u1e88',    | ||||
|      '\xF2\x59' : u'\u1ef4',   '\xF2\x5A' : u'\u1e92',   '\xF2\x61' : u'\u1ea1',    | ||||
|      '\xF2\x62' : u'\u1e05',   '\xF2\x64' : u'\u1e0d',   '\xF2\x65' : u'\u1eb9',    | ||||
|      '\xF2\x68' : u'\u1e25',   '\xF2\x69' : u'\u1ecb',   '\xF2\x6B' : u'\u1e33',    | ||||
|      '\xF2\x6C' : u'\u1e37',   '\xF2\x6D' : u'\u1e43',   '\xF2\x6E' : u'\u1e47',    | ||||
|      '\xF2\x6F' : u'\u1ecd',   '\xF2\x72' : u'\u1e5b',   '\xF2\x73' : u'\u1e63',    | ||||
|      '\xF2\x74' : u'\u1e6d',   '\xF2\x75' : u'\u1ee5',   '\xF2\x76' : u'\u1e7f',    | ||||
|      '\xF2\x77' : u'\u1e89',   '\xF2\x79' : u'\u1ef5',   '\xF2\x7A' : u'\u1e93',    | ||||
|      '\xF3\x55' : u'\u1e72',   '\xF3\x75' : u'\u1e73',   '\xF4\x41' : u'\u1e00',    | ||||
|      '\xF4\x61' : u'\u1e01',   '\xF9\x48' : u'\u1e2a',   '\xF9\x68' : u'\u1e2b',   | ||||
|    } | ||||
|  | ||||
| # mappings of unicode to ANSEL codes | ||||
| # note: a char u'\u00A1' is internally remembered & represented as u'\xA1' | ||||
| #  so do NOT blindly use 4-hexdigit keys for those cases | ||||
| #  or the conversion function will fail | ||||
| _utoa = {  | ||||
|      u'\xa1'   : '\xC6',       u'\xa3'   : '\xB9',       u'\xa9'   : '\xC3',        | ||||
|      u'\xae'   : '\xAA',       u'\xb0'   : '\xC0',       u'\xb1'   : '\xAB',        | ||||
|      u'\xb7'   : '\xA8',       u'\xb8'   : '\xF0\x20',   u'\xbf'   : '\xC5',        | ||||
|      u'\xc0'   : '\xE1\x41',   u'\xc1'   : '\xE2\x41',   u'\xc2'   : '\xE3\x41',    | ||||
|      u'\xc3'   : '\xE4\x41',   u'\xc4'   : '\xE8\x41',   u'\xc5'   : '\xEA\x41',    | ||||
|      u'\xc6'   : '\xA5',       u'\xc7'   : '\xF0\x43',   u'\xc8'   : '\xE1\x45',    | ||||
|      u'\xc9'   : '\xE2\x45',   u'\xca'   : '\xE3\x45',   u'\xcb'   : '\xE8\x45',    | ||||
|      u'\xcc'   : '\xE1\x49',   u'\xcd'   : '\xE2\x49',   u'\xce'   : '\xE3\x49',    | ||||
|      u'\xcf'   : '\xE8\x49',   u'\xd1'   : '\xE4\x4E',   u'\xd2'   : '\xE1\x4F',    | ||||
|      u'\xd3'   : '\xE2\x4F',   u'\xd4'   : '\xE3\x4F',   u'\xd5'   : '\xE4\x4F',    | ||||
|      u'\xd6'   : '\xE8\x4F',   u'\xd8'   : '\xA2',       u'\xd9'   : '\xE1\x55',    | ||||
|      u'\xda'   : '\xE2\x55',   u'\xdb'   : '\xE3\x55',   u'\xdc'   : '\xE8\x55',    | ||||
|      u'\xdd'   : '\xE2\x59',   u'\xde'   : '\xA4',       u'\xdf'   : '\xC7',        | ||||
|      u'\xe0'   : '\xE1\x61',   u'\xe1'   : '\xE2\x61',   u'\xe2'   : '\xE3\x61',    | ||||
|      u'\xe3'   : '\xE4\x61',   u'\xe4'   : '\xE8\x61',   u'\xe5'   : '\xEA\x61',    | ||||
|      u'\xe6'   : '\xB5',       u'\xe7'   : '\xF0\x63',   u'\xe8'   : '\xE1\x65',    | ||||
|      u'\xe9'   : '\xE2\x65',   u'\xea'   : '\xE3\x65',   u'\xeb'   : '\xE8\x65',    | ||||
|      u'\xec'   : '\xE1\x69',   u'\xed'   : '\xE2\x69',   u'\xee'   : '\xE3\x69',    | ||||
|      u'\xef'   : '\xE8\x69',   u'\xf0'   : '\xBA',       u'\xf1'   : '\xE4\x6E',    | ||||
|      u'\xf2'   : '\xE1\x6F',   u'\xf3'   : '\xE2\x6F',   u'\xf4'   : '\xE3\x6F',    | ||||
|      u'\xf5'   : '\xE4\x6F',   u'\xf6'   : '\xE8\x6F',   u'\xf8'   : '\xB2',        | ||||
|      u'\xf9'   : '\xE1\x75',   u'\xfa'   : '\xE2\x75',   u'\xfb'   : '\xE3\x75',    | ||||
|      u'\xfc'   : '\xE8\x75',   u'\xfd'   : '\xE2\x79',   u'\xfe'   : '\xB4',        | ||||
|      u'\xff'   : '\xE8\x79',   u'\u0100' : '\xE5\x41',   u'\u0101' : '\xE5\x61',    | ||||
|      u'\u0102' : '\xE6\x41',   u'\u0103' : '\xE6\x61',   u'\u0104' : '\xF1\x41',    | ||||
|      u'\u0105' : '\xF1\x61',   u'\u0106' : '\xE2\x43',   u'\u0107' : '\xE2\x63',    | ||||
|      u'\u0108' : '\xE3\x43',   u'\u0109' : '\xE3\x63',   u'\u010a' : '\xE7\x43',    | ||||
|      u'\u010b' : '\xE7\x63',   u'\u010c' : '\xE9\x43',   u'\u010d' : '\xE9\x63',    | ||||
|      u'\u010e' : '\xE9\x44',   u'\u010f' : '\xE9\x64',   u'\u0110' : '\xA3',        | ||||
|      u'\u0111' : '\xB3',       u'\u0112' : '\xE5\x45',   u'\u0113' : '\xE5\x65',    | ||||
|      u'\u0114' : '\xE6\x45',   u'\u0115' : '\xE6\x65',   u'\u0116' : '\xE7\x45',    | ||||
|      u'\u0117' : '\xE7\x65',   u'\u0118' : '\xF1\x45',   u'\u0119' : '\xF1\x65',    | ||||
|      u'\u011a' : '\xE9\x45',   u'\u011b' : '\xE9\x65',   u'\u011c' : '\xE3\x47',    | ||||
|      u'\u011d' : '\xE3\x67',   u'\u011e' : '\xE6\x47',   u'\u011f' : '\xE6\x67',    | ||||
|      u'\u0120' : '\xE7\x47',   u'\u0121' : '\xE7\x67',   u'\u0122' : '\xF0\x47',    | ||||
|      u'\u0123' : '\xF0\x67',   u'\u0124' : '\xE3\x48',   u'\u0125' : '\xE3\x68',    | ||||
|      u'\u0128' : '\xE4\x49',   u'\u0129' : '\xE4\x69',   u'\u012a' : '\xE5\x49',    | ||||
|      u'\u012b' : '\xE5\x69',   u'\u012c' : '\xE6\x49',   u'\u012d' : '\xE6\x69',    | ||||
|      u'\u012e' : '\xF1\x49',   u'\u012f' : '\xF1\x69',   u'\u0130' : '\xE7\x49',    | ||||
|      u'\u0131' : '\xB8',       u'\u0134' : '\xE3\x4A',   u'\u0135' : '\xE3\x6A',    | ||||
|      u'\u0136' : '\xF0\x4B',   u'\u0137' : '\xF0\x6B',   u'\u0139' : '\xE2\x4C',    | ||||
|      u'\u013a' : '\xE2\x6C',   u'\u013b' : '\xF0\x4C',   u'\u013c' : '\xF0\x6C',    | ||||
|      u'\u013d' : '\xE9\x4C',   u'\u013e' : '\xE9\x6C',   u'\u0141' : '\xA1',        | ||||
|      u'\u0142' : '\xB1',       u'\u0143' : '\xE2\x4E',   u'\u0144' : '\xE2\x6E',    | ||||
|      u'\u0145' : '\xF0\x4E',   u'\u0146' : '\xF0\x6E',   u'\u0147' : '\xE9\x4E',    | ||||
|      u'\u0148' : '\xE9\x6E',   u'\u014c' : '\xE5\x4F',   u'\u014d' : '\xE5\x6F',    | ||||
|      u'\u014e' : '\xE6\x4F',   u'\u014f' : '\xE6\x6F',   u'\u0150' : '\xEE\x4F',    | ||||
|      u'\u0151' : '\xEE\x6F',   u'\u0152' : '\xA6',       u'\u0153' : '\xB6',        | ||||
|      u'\u0154' : '\xE2\x52',   u'\u0155' : '\xE2\x72',   u'\u0156' : '\xF0\x52',    | ||||
|      u'\u0157' : '\xF0\x72',   u'\u0158' : '\xE9\x52',   u'\u0159' : '\xE9\x72',    | ||||
|      u'\u015a' : '\xE2\x53',   u'\u015b' : '\xE2\x73',   u'\u015c' : '\xE3\x53',    | ||||
|      u'\u015d' : '\xE3\x73',   u'\u015e' : '\xF0\x53',   u'\u015f' : '\xF0\x73',    | ||||
|      u'\u0160' : '\xE9\x53',   u'\u0161' : '\xE9\x73',   u'\u0162' : '\xF0\x54',    | ||||
|      u'\u0163' : '\xF0\x74',   u'\u0164' : '\xE9\x54',   u'\u0165' : '\xE9\x74',    | ||||
|      u'\u0168' : '\xE4\x55',   u'\u0169' : '\xE4\x75',   u'\u016a' : '\xE5\x55',    | ||||
|      u'\u016b' : '\xE5\x75',   u'\u016c' : '\xE6\x55',   u'\u016d' : '\xE6\x75',    | ||||
|      u'\u016e' : '\xEA\xAD',   u'\u016f' : '\xEA\x75',   u'\u0170' : '\xEE\x55',    | ||||
|      u'\u0171' : '\xEE\x75',   u'\u0172' : '\xF1\x55',   u'\u0173' : '\xF1\x75',    | ||||
|      u'\u0174' : '\xE3\x57',   u'\u0175' : '\xE3\x77',   u'\u0176' : '\xE3\x59',    | ||||
|      u'\u0177' : '\xE3\x79',   u'\u0178' : '\xE8\x59',   u'\u0179' : '\xE2\x5A',    | ||||
|      u'\u017a' : '\xE2\x7A',   u'\u017b' : '\xE7\x5A',   u'\u017c' : '\xE7\x7A',    | ||||
|      u'\u017d' : '\xE9\x5A',   u'\u017e' : '\xE9\x7A',   u'\u01a0' : '\xAC',        | ||||
|      u'\u01a1' : '\xBC',       u'\u01af' : '\xAD',       u'\u01b0' : '\xBD',        | ||||
|      u'\u01cd' : '\xE9\x41',   u'\u01ce' : '\xE9\x61',   u'\u01cf' : '\xE9\x49',    | ||||
|      u'\u01d0' : '\xE9\x69',   u'\u01d1' : '\xE9\x4F',   u'\u01d2' : '\xE9\x6F',    | ||||
|      u'\u01d3' : '\xE9\x55',   u'\u01d4' : '\xE9\x75',   u'\u01e2' : '\xE5\xA5',    | ||||
|      u'\u01e3' : '\xE5\xB5',   u'\u01e6' : '\xE9\x47',   u'\u01e7' : '\xE9\x67',    | ||||
|      u'\u01e8' : '\xE9\x4B',   u'\u01e9' : '\xE9\x6B',   u'\u01ea' : '\xF1\x4F',    | ||||
|      u'\u01eb' : '\xF1\x6F',   u'\u01f0' : '\xE9\x6A',   u'\u01f4' : '\xE2\x47',    | ||||
|      u'\u01f5' : '\xE2\x67',   u'\u01fc' : '\xE2\xA5',   u'\u01fd' : '\xE2\xB5',    | ||||
|      u'\u02b9' : '\xA7',       u'\u02ba' : '\xB7',       u'\u02bb' : '\xB0',        | ||||
|      u'\u02bc' : '\xAE',       u'\u1e00' : '\xF4\x41',   u'\u1e01' : '\xF4\x61',    | ||||
|      u'\u1e02' : '\xE7\x42',   u'\u1e03' : '\xE7\x62',   u'\u1e04' : '\xF2\x42',    | ||||
|      u'\u1e05' : '\xF2\x62',   u'\u1e0a' : '\xE7\x44',   u'\u1e0b' : '\xE7\x64',    | ||||
|      u'\u1e0c' : '\xF2\x44',   u'\u1e0d' : '\xF2\x64',   u'\u1e10' : '\xF0\x44',    | ||||
|      u'\u1e11' : '\xF0\x64',   u'\u1e1e' : '\xE7\x46',   u'\u1e1f' : '\xE7\x66',    | ||||
|      u'\u1e20' : '\xE5\x47',   u'\u1e21' : '\xE5\x67',   u'\u1e22' : '\xE7\x48',    | ||||
|      u'\u1e23' : '\xE7\x68',   u'\u1e24' : '\xF2\x48',   u'\u1e25' : '\xF2\x68',    | ||||
|      u'\u1e26' : '\xE8\x48',   u'\u1e27' : '\xE8\x68',   u'\u1e28' : '\xF0\x48',    | ||||
|      u'\u1e29' : '\xF0\x68',   u'\u1e2a' : '\xF9\x48',   u'\u1e2b' : '\xF9\x68',    | ||||
|      u'\u1e30' : '\xE2\x4B',   u'\u1e31' : '\xE2\x6B',   u'\u1e32' : '\xF2\x4B',    | ||||
|      u'\u1e33' : '\xF2\x6B',   u'\u1e36' : '\xF2\x4C',   u'\u1e37' : '\xF2\x6C',    | ||||
|      u'\u1e3e' : '\xE2\x4D',   u'\u1e3f' : '\xE2\x6D',   u'\u1e40' : '\xE7\x4D',    | ||||
|      u'\u1e41' : '\xE7\x6D',   u'\u1e42' : '\xF2\x4D',   u'\u1e43' : '\xF2\x6D',    | ||||
|      u'\u1e44' : '\xE7\x4E',   u'\u1e45' : '\xE7\x6E',   u'\u1e46' : '\xF2\x4E',    | ||||
|      u'\u1e47' : '\xF2\x6E',   u'\u1e54' : '\xE2\x50',   u'\u1e55' : '\xE2\x70',    | ||||
|      u'\u1e56' : '\xE7\x50',   u'\u1e57' : '\xE7\x70',   u'\u1e58' : '\xE7\x52',    | ||||
|      u'\u1e59' : '\xE7\x72',   u'\u1e5a' : '\xF2\x52',   u'\u1e5b' : '\xF2\x72',    | ||||
|      u'\u1e60' : '\xE7\x53',   u'\u1e61' : '\xE7\x73',   u'\u1e62' : '\xF2\x53',    | ||||
|      u'\u1e63' : '\xF2\x73',   u'\u1e6a' : '\xE7\x54',   u'\u1e6b' : '\xE7\x74',    | ||||
|      u'\u1e6c' : '\xF2\x54',   u'\u1e6d' : '\xF2\x74',   u'\u1e72' : '\xF3\x55',    | ||||
|      u'\u1e73' : '\xF3\x75',   u'\u1e7c' : '\xE4\x56',   u'\u1e7d' : '\xE4\x76',    | ||||
|      u'\u1e7e' : '\xF2\x56',   u'\u1e7f' : '\xF2\x76',   u'\u1e80' : '\xE1\x57',    | ||||
|      u'\u1e81' : '\xE1\x77',   u'\u1e82' : '\xE2\x57',   u'\u1e83' : '\xE2\x77',    | ||||
|      u'\u1e84' : '\xE8\x57',   u'\u1e85' : '\xE8\x77',   u'\u1e86' : '\xE7\x57',    | ||||
|      u'\u1e87' : '\xE7\x77',   u'\u1e88' : '\xF2\x57',   u'\u1e89' : '\xF2\x77',    | ||||
|      u'\u1e8a' : '\xE7\x58',   u'\u1e8b' : '\xE7\x78',   u'\u1e8c' : '\xE8\x58',    | ||||
|      u'\u1e8d' : '\xE8\x78',   u'\u1e8e' : '\xE7\x59',   u'\u1e8f' : '\xE7\x79',    | ||||
|      u'\u1e90' : '\xE3\x5A',   u'\u1e91' : '\xE3\x7A',   u'\u1e92' : '\xF2\x5A',    | ||||
|      u'\u1e93' : '\xF2\x7A',   u'\u1e97' : '\xE8\x74',   u'\u1e98' : '\xEA\x77',    | ||||
|      u'\u1e99' : '\xEA\x79',   u'\u1ea0' : '\xF2\x41',   u'\u1ea1' : '\xF2\x61',    | ||||
|      u'\u1ea2' : '\xE0\x41',   u'\u1ea3' : '\xE0\x61',   u'\u1eb8' : '\xF2\x45',    | ||||
|      u'\u1eb9' : '\xF2\x65',   u'\u1eba' : '\xE0\x45',   u'\u1ebb' : '\xE0\x65',    | ||||
|      u'\u1ebc' : '\xE4\x45',   u'\u1ebd' : '\xE4\x65',   u'\u1ec8' : '\xE0\x49',    | ||||
|      u'\u1ec9' : '\xE0\x69',   u'\u1eca' : '\xF2\x49',   u'\u1ecb' : '\xF2\x69',    | ||||
|      u'\u1ecc' : '\xF2\x4F',   u'\u1ecd' : '\xF2\x6F',   u'\u1ece' : '\xE0\x4F',    | ||||
|      u'\u1ecf' : '\xE0\x6F',   u'\u1ee4' : '\xF2\x55',   u'\u1ee5' : '\xF2\x75',    | ||||
|      u'\u1ee6' : '\xE0\x55',   u'\u1ee7' : '\xE0\x75',   u'\u1ef2' : '\xE1\x59',    | ||||
|      u'\u1ef3' : '\xE1\x79',   u'\u1ef4' : '\xF2\x59',   u'\u1ef5' : '\xF2\x79',    | ||||
|      u'\u1ef6' : '\xE0\x59',   u'\u1ef7' : '\xE0\x79',   u'\u1ef8' : '\xE4\x59',    | ||||
|      u'\u1ef9' : '\xE4\x79',   u'\u20ac' : '\xC8',       u'\u2113' : '\xC1',        | ||||
|      u'\u2117' : '\xC2',       u'\u266d' : '\xA9',       u'\u266f' : '\xC4',       | ||||
|    } | ||||
|  | ||||
|  | ||||
| # unicode combining forms mapped to ANSEL  | ||||
| _ucombiners = { | ||||
|      u'\u0300' : '\xE1',       u'\u0301' : '\xE2',       u'\u0302' : '\xE3',        | ||||
|      u'\u0303' : '\xE4',       u'\u0304' : '\xE5',       u'\u0306' : '\xE6',        | ||||
|      u'\u0307' : '\xE7',       u'\u0308' : '\xE8',       u'\u0309' : '\xE0',        | ||||
|      u'\u030a' : '\xEA',       u'\u030b' : '\xEE',       u'\u030c' : '\xE9',        | ||||
|      u'\u0310' : '\xEF',       u'\u0313' : '\xFE',       u'\u0315' : '\xED',        | ||||
|      u'\u031c' : '\xF8',       u'\u0323' : '\xF2',       u'\u0324' : '\xF3',        | ||||
|      u'\u0325' : '\xF4',       u'\u0326' : '\xF7',       u'\u0327' : '\xF0',        | ||||
|      u'\u0328' : '\xF1',       u'\u032e' : '\xF9',       u'\u0332' : '\xF6',        | ||||
|      u'\u0333' : '\xF5',       u'\u200c' : '\x8E',       u'\u200d' : '\x8D',        | ||||
|      u'\ufe20' : '\xEB',       u'\ufe21' : '\xEC',       u'\ufe22' : '\xFA',        | ||||
|      u'\ufe23' : '\xFB',       | ||||
| } | ||||
|  | ||||
# Lookup table used by the pre-port utf8_to_ansel() loop, which tries
# UTOA[inp[0:2]] and then UTOA[inp[0:1]].  Keys are one- or two-character
# unicode strings; values are ANSEL byte strings.
# NOTE(review): the two-character keys look like the high/low bytes of a
# code point rather than real characters -- confirm before relying on them.
UTOA = {
    u'\xfe ': '\xeb',  u'\xcb': '\xe8E',  u'\xdb': '\xe3U',
    u'\xeb': '\xe8e',  u'\xfb': '\xe3u',  u'\x01\x04': '\xf1A',
    u'\xb0': '\xc0',  u'\xc0': '\xe1A',  u'\xd0': '\xa3',
    u'\xe0': '\xe1a',  u'\xf0': '\xba',  u'\x01\x14': '\xe6E',
    u'\xa1': '\xc6',  u'\xdc': '\xe8U',  u'\x01\xaf': '\xad',
    u'\xb1': '\xab',  u'\xc1': '\xe2A',  u'\xd1': '\xe4N',
    u'\x01$': '\xe3H',  u'\xe1': '\xe2a',  u'\xf1': '\xe4n',
    u'\x01': '\xe5A',  u'\x03\t': '\xe0',  u'\x014': '\xe3J',
    u'\xc6': '\xa5',  u'\xd6': '\xe8O',  u'\xe6': '\xb5',
    u'\xfc': '\xe8u',  u'\xf6': '\xe8o',  u'\x1e\xc8': '\xe0I',
    u'\x1e\xc9': '\xe0i',  u'\x1e\xca': '\xf2I',  u'\x1e\xcb': '\xf2i',
    u'\x1e\xcc': '\xf2O',  u'\x1e\xcd': '\xf2o',  u'\x1e\xce': '\xe0O',
    u'\x1e\xcf': '\xe0o',  u'\x1e\xf8': '\xe4Y',  u'\x1e\xf9': '\xe4y',
    u'\x1e\xf2': '\xe1Y',  u'\x1e\xf3': '\xe1y',  u'\x1e\xf4': '\xf2Y',
    u'\x1e\xf5': '\xf2y',  u'\x1e\xf6': '\xe0Y',  u'\x1e\xf7': '\xe0y',
    u'\xb7': '\xa8',  u'\x1e\xe4': '\xf2U',  u'\x1e\xe5': '\xf2u',
    u'\x1e\xe6': '\xe0U',  u'\x1e\xe7': '\xe0u',  u'\x1e\x98': '\xeaw',
    u'\x1e\x99': '\xeay',  u'\xc7': '\xf0C',  u'\x1e\x90': '\xe3Z',
    u'\x1e\x91': '\xe3z',  u'\x1e\x92': '\xf2Z',  u'\x1e\x93': '\xf2z',
    u'\x1e\x97': '\xe8t',  u'\x1e\x88': '\xf2W',  u'\x1e\x89': '\xf2w',
    u'\x1e\x8a': '\xe7X',  u'\x1e\x8b': '\xe7x',  u'\x1e\x8c': '\xe8X',
    u'\x1e\x8d': '\xe8x',  u'\x1e\x8e': '\xe7Y',  u'\x1e\x8f': '\xe7y',
    u'\x1e\x80': '\xe1W',  u'\x1e\x81': '\xe1w',  u'\x1e\x82': '\xe2W',
    u'\x1e\x83': '\xe2w',  u'\x1e\x84': '\xe8W',  u'\x1e\x85': '\xe8w',
    u'\x1e\x86': '\xe7W',  u'\x1e\x87': '\xe7w',  u'\x1e\xb8': '\xf2E',
    u'\x1e\xb9': '\xf2e',  u'\x1e\xba': '\xe0E',  u'\x1e\xbb': '\xe0e',
    u'\x1e\xbc': '\xe4E',  u'\x1e\xbd': '\xe4e',  u'\xe7': '\xf0c',
    u'\x1e\xa0': '\xf2A',  u'\x1e\xa1': '\xf2a',  u'\x1e\xa2': '\xe0A',
    u'\x1e\xa3': '\xe0a',  u'\x1eX': '\xe7R',  u'\x1eY': '\xe7r',
    u'\x1eZ': '\xf2R',  u'\x1e[': '\xf2r',  u'\x1eT': '\xe2P',
    u'\x1eU': '\xe2p',  u'\x1eV': '\xe7P',  u'\x1eW': '\xe7p',
    u'\x1e@': '\xe7M',  u'\x1eA': '\xe7m',  u'\x1eB': '\xf2M',
    u'\x1eC': '\xf2m',  u'\x1eD': '\xe7N',  u'\x1eE': '\xe7n',
    u'\x1eF': '\xf2N',  u'\x1eG': '\xf2n',  u'\x1e|': '\xe4V',
    u'\x1e}': '\xe4v',  u'\x1e~': '\xf2V',  u'\x1e\x7f': '\xf2v',
    u'\x1er': '\xf3U',  u'\x1es': '\xf3u',  u'\x1ej': '\xe7T',
    u'\x1ek': '\xe7t',  u'\x1el': '\xf2T',  u'\x1em': '\xf2t',
    u'\x1e`': '\xe7S',  u'\x1ea': '\xe7s',  u'\x1eb': '\xf2S',
    u'\x1ec': '\xf2s',  u'\x1e\x1e': '\xe7F',  u'\x1e\x1f': '\xe7f',
    u'\x1e\x10': '\xf0D',  u'\x1e\x11': '\xf0d',  u'\xcc': '\xe1I',
    u'\x1e\n': '\xe7D',  u'\x1e\x0b': '\xe7d',  u'\x1e\x0c': '\xf2D',
    u'\x1e\r': '\xf2d',  u'\x1e\x01': '\xf4a',  u'\x1e\x02': '\xe7B',
    u'\x1e\x03': '\xe7b',  u'\x1e\x04': '\xf2B',  u'\x1e\x05': '\xf2b',
    u'\x1e>': '\xe2M',  u'\x1e?': '\xe2m',  u'\x1e0': '\xe2K',
    u'\x1e1': '\xe2k',  u'\x1e2': '\xf2K',  u'\x1e3': '\xf2k',
    u'\xec': '\xe1i',  u'\x1e6': '\xf2L',  u'\x1e7': '\xf2l',
    u'\x1e(': '\xf0H',  u'\x1e)': '\xf0h',  u'\x1e*': '\xf9H',
    u'\x1e+': '\xf9h',  u'\x1e ': '\xe5G',  u'\x1e!': '\xe5g',
    u'\x1e"': '\xe7H',  u'\x1e#': '\xe7h',  u'\x1e$': '\xf2H',
    u'\x1e%': '\xf2h',  u'\x1e&': '\xe8H',  u"\x1e'": '\xe8h',
    u'\xcd': '\xe2I',  u'\xdd': '\xe2Y',  u'\xed': '\xe2i',
    u'\xfd': '\xe2y',  u'\xc2': '\xe3A',  u'\xd2': '\xe1O',
    u'\xe2': '\xe3a',  u'\xf2': '\xe1o',  u'\xa3': '\xb9',
    u'\x03\x0b': '\xee',  u'\x03\n': '\xea',  u'\xc3': '\xe4A',
    u'\x03\x0c': '\xe9',  u'\x03\x03': '\xe4',  u'\x03\x02': '\xe3',
    u'\x03\x01': '\xe2',  u'\x03\x07': '\xe7',  u'\x03\x06': '\xe6',
    u'\x03\x04': '\xe5',  u'\xd3': '\xe2O',  u'\x03\x1c': '\xf8',
    u'\x03\x13': '\xfe',  u'\x03\x10': '\xef',  u'\x03\x15': '\xed',
    u'\xe3': '\xe4a',  u'\x03(': '\xf1',  u'\x03.': '\xf9',
    u'\x03#': '\xf2',  u"\x03'": '\xf0',  u'\x03&': '\xf7',
    u'\x03%': '\xf4',  u'\x03$': '\xf3',  u'\xf3': '\xe2o',
    u'\x033': '\xf5',  u'\x032': '\xf6',  u'\x03': '\xe1',
    u'\xb8': '\xf0 ',  u'\xc8': '\xe1E',  u'\xd8': '\xa2',
    u'\xe8': '\xe1e',  u'\xf8': '\xb2',  u'\x1e': '\xf4A',
    u'\xa9': '\xc3',  u'\x02\xbe': '\xae',  u'\x02\xbf': '\xb0',
    u'\x02\xb9': '\xa7',  u'\x02\xba': '\xb7',  u'\xc9': '\xe2E',
    u'\xd9': '\xe1U',  u'\xfe!': '\xec',  u'\xfe"': '\xfa',
    u'\xfe#': '\xfb',  u'\xe9': '\xe2e',  u'\xf9': '\xe1u',
    u'\xae': '\xaa',  u'\xce': '\xe3I',  u'\xde': '\xa4',
    u'\xee': '\xe3i',  u'\xfe': '\xb4',  u'\x01\xcd': '\xe9A',
    u'\x01\xcf': '\xe9I',  u'\x01\xce': '\xe9a',  u'\x01\xd1': '\xe9O',
    u'\x01\xd0': '\xe9i',  u'\x01\xd3': '\xe9U',  u'\x01\xd2': '\xe9o',
    u'\x01\xd4': '\xe9u',  u'\x01\xe9': '\xe9k',  u'\x01\xe3': '\xe5\xb5',
    u'\x01\xe7': '\xe9g',  u'\x01\xe6': '\xe9G',  u'\x01\xe2': '\xe5\xa5',
    u'\x01\xe8': '\xe9K',  u'\x01\xeb': '\xf1o',  u'\x01\xea': '\xf1O',
    u'\x01\xf0': '\xe9j',  u'\x01\xf5': '\xe2g',  u'\x01\xf4': '\xe2G',
    u'\xbf': '\xc5',  u'\x01\xa1': '\xbc',  u'\x01\xfd': '\xe2\xb5',
    u'\xcf': '\xe8I',  u'\xdf': '\xcf',  u'\x01\xfc': '\xe2\xa5',
    u'\x01\xa0': '\xac',  u'\xef': '\xe8i',  u'\x01\xb0': '\xbd',
    u'\xff': '\xe8y',  u'\x01A': '\xa1',  u'\x01C': '\xe2N',
    u'\x01B': '\xb1',  u'\x01E': '\xf0N',  u'\x01D': '\xe2n',
    u'\x01G': '\xe9N',  u'\x01F': '\xf0n',  u'\x01H': '\xe9n',
    u'\x01M': '\xe5o',  u'\x01L': '\xe5O',  u'\x01O': '\xe6o',
    u'\x01N': '\xe6O',  u'\x01Q': '\xeeo',  u'\x01P': '\xeeO',
    u'\x01S': '\xb6',  u'\x01R': '\xa6',  u'\x01U': '\xe2r',
    u'\x01T': '\xe2R',  u'\x01W': '\xf0r',  u'\x01V': '\xf0R',
    u'\x01Y': '\xe9r',  u'\x01X': '\xe9R',  u'\x01[': '\xe2s',
    u'\x01Z': '\xe2S',  u'\x01]': '\xe3s',  u'\x01\\': '\xe3S',
    u'\x01_': '\xf0s',  u'\x01^': '\xf0S',  u'\x01a': '\xe9s',
    u'\x01`': '\xe9S',  u'\x01c': '\xf0t',  u'\x01b': '\xf0T',
    u'\x01e': '\xe9t',  u'\x01d': '\xe9T',  u'\x01i': '\xe4u',
    u'\x01h': '\xe4U',  u'\x01k': '\xe5u',  u'\x01j': '\xe5U',
    u'\x01m': '\xe6u',  u'\x01l': '\xe6U',  u'\x01o': '\xeau',
    u'\x01n': '\xea\xad',  u'\x01q': '\xeeu',  u'\x01p': '\xeeU',
    u'\x01s': '\xf1u',  u'\x01r': '\xf1U',  u'\x01u': '\xe3w',
    u'\x01t': '\xe3W',  u'\x01w': '\xe3y',  u'\x01v': '\xe3Y',
    u'\x01y': '\xe2Z',  u'\x01x': '\xe8Y',  u'\x01{': '\xe7Z',
    u'\x01z': '\xe2z',  u'\x01}': '\xe9Z',  u'\x01|': '\xe7z',
    u'\x01~': '\xe9z',  u'\x01\x01': '\xe5a',  u'\x01\x03': '\xe6a',
    u'\x01\x02': '\xe6A',  u'\x01\x05': '\xf1a',  u'\xc4': '\xe8A',
    u'\x01\x07': '\xe2c',  u'\x01\x06': '\xe2C',  u'\x01\t': '\xe3c',
    u'\x01\x08': '\xe3C',  u'\x01\x0b': '\xe7c',  u'\x01\n': '\xe7C',
    u'\x01\r': '\xe9c',  u'\x01\x0c': '\xe9C',  u'\x01\x0f': '\xe9d',
    u'\x01\x0e': '\xe9D',  u'\x01\x11': '\xb3',  u'\x01\x10': '\xa3',
    u'\x01\x13': '\xe5e',  u'\x01\x12': '\xe5E',  u'\x01\x15': '\xe6e',
    u'\xd4': '\xe3O',  u'\x01\x17': '\xe7e',  u'\x01\x16': '\xe7E',
    u'\x01\x19': '\xf1e',  u'\x01\x18': '\xf1E',  u'\x01\x1b': '\xe9e',
    u'\x01\x1a': '\xe9E',  u'\x01\x1d': '\xe3g',  u'\x01\x1c': '\xe3G',
    u'\x01\x1f': '\xe6g',  u'\x01\x1e': '\xe6G',  u'\x01!': '\xe7g',
    u'\x01 ': '\xe7G',  u'\x01#': '\xf0g',  u'\x01"': '\xf0G',
    u'\x01%': '\xe3h',  u'\xe4': '\xe8a',  u'\x01)': '\xe4i',
    u'\x01(': '\xe4I',  u'\x01+': '\xe5i',  u'\x01*': '\xe5I',
    u'\x01-': '\xe6i',  u'\x01,': '\xe6I',  u'\x01/': '\xf1i',
    u'\x01.': '\xf1I',  u'\x011': '\xb8',  u'\x010': '\xe7I',
    u'\x015': '\xe3j',  u'\xf4': '\xe3o',  u'\x017': '\xf0k',
    u'\x016': '\xf0K',  u'\x019': '\xe2L',  u'\x01;': '\xf0L',
    u'\x01:': '\xe2l',  u'\x01=': '\xe9L',  u'\x01<': '\xf0l',
    u'\x01>': '\xe9l',  u'\xc5': '\xeaA',  u'\xd5': '\xe4O',
    u'\xe5': '\xeaa',  u'\xf5': '\xe4o',  u'\xca': '\xe3E',
    u'\xda': '\xe2U',  u'\xea': '\xe3e',  u'\xfa': '\xe2u',
}
|  | ||||
| #------------------------------------------------------------------------- | ||||
| # | ||||
| # ansel_to_utf8 | ||||
| # | ||||
| #------------------------------------------------------------------------- | ||||
# TODO: change name to ansel_to_unicode (it does NOT return utf-8)
# ALSO: I think I'd prefer full pass-through of ANSEL's ASCII subset,
#  with substitutions and deletions handled at a higher level
def ansel_to_utf8(s):
    """Convert an ANSEL encoded byte string to a unicode string.

    Bytes below 128 pass through when listed in _use_ASCII and are
    replaced by a space otherwise.  For bytes of 128 and above the
    lookup order is: the two-byte table (_twobyte), the one-byte table
    (_onebyte), then the combiner table (_acombiners).  A combiner that
    is followed by a printable ASCII base character is emitted after
    that base character; a combiner followed by anything else, or by
    nothing at all, is dropped.  Any other byte becomes U+FFFD.

    s -- the ANSEL encoded string
    Returns a unicode string (u"" for empty input).
    """
    out = []
    pos = 0
    end = len(s)
    while pos < end:
        ch = s[pos]
        if ord(ch) < 128:
            if ch in _use_ASCII:
                out.append(ch)
            else:
                # substitute space for disallowed (control) chars
                out.append(' ')
            pos += 1
            continue

        pair = s[pos:pos + 2]
        if pair in _twobyte:
            out.append(_twobyte[pair])
            pos += 2
        elif ch in _onebyte:
            out.append(_onebyte[ch])
            pos += 1
        elif ch in _acombiners:
            # always consume the combiner
            pos += 1
            # A combiner may be the very last byte of the input, so make
            # sure there is a following byte before looking at it.
            if pos < end and s[pos] in _printable_ascii:
                # unicode: combiner follows base-char
                out.append(s[pos] + _acombiners[ch])
                pos += 1
            # else: just drop the unexpected combiner and re-examine
            # whatever follows it on the next pass
        else:
            out.append(u'\ufffd')  # "Replacement Char"
            pos += 1
    # joining onto a unicode separator yields unicode even when every
    # piece is a plain ASCII str
    return u"".join(out)
|  | ||||
| #------------------------------------------------------------------------- | ||||
| # | ||||
| # utf8_to_ansel | ||||
| # | ||||
| #------------------------------------------------------------------------- | ||||
# TODO: change name to unicode_to_ansel (it does NOT process utf-8 input)
def utf8_to_ansel(s):
    """Convert a unicode string to an ANSEL encoded byte string.

    Characters below 128 pass through when listed in _use_ASCII and are
    replaced by a space otherwise.  Other characters are looked up in
    _utoa; characters found only in _ucombiners are treated as combining
    forms; anything else becomes '?'.

    ANSEL places a combiner before its base character, so when a
    combining form follows a single printable ASCII character the chunk
    already emitted for that character is rewritten as combiner + base.
    A combining form in any other position (start of input, after a
    multi-byte chunk, or after another combiner) is dropped.

    s -- the unicode string to convert
    Returns a byte string ("" for empty input).
    """
    out = []
    # Most recent chunk appended to `out`.  It starts empty so that a
    # combining form at the very start of the input is ignored instead
    # of reading an unassigned variable.
    head = ''
    for ch in s:
        if ord(ch) < 128:
            head = ch.encode('ascii')
            if head not in _use_ASCII:
                head = ' '
        elif ch in _utoa:
            head = _utoa[ch]
        elif ch in _ucombiners:
            # head happens to have last conversion to ansel
            if len(head) == 1 and head in _printable_ascii:
                head = _ucombiners[ch] + head
                out[-1] = head
            # else: ignore multiple combiners
            # (the combiner is always consumed either way)
            continue
        else:
            head = '?'
        out.append(head)
    return ''.join(out)
|  | ||||
|   | ||||
							
								
								
									
										257
									
								
								src/test/ansel_utf8_test.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										257
									
								
								src/test/ansel_utf8_test.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,257 @@ | ||||
| #!/usr/bin/python -tt | ||||
|  | ||||
| # Instructions for use | ||||
| # -------------------- | ||||
| # Eventually, this code might use a testing infrastructure (conventions TBD) | ||||
| # but, at present this is intended for use as a manual operation by placing | ||||
| # this file (temporarily) in the same dir as the module it tests. | ||||
| # | ||||
| # Running  | ||||
| #   ./ansel_utf8_test.py [-v] | ||||
| # should report 'OK' | ||||
| #   the -v option shows individual results for each test function | ||||
| # --------------------------------------------------------------------------- | ||||
|  | ||||
| # TODO | ||||
| # --------------------------------------------------------- | ||||
| # make table of test cases for readability | ||||
| # ansel U+xxxx UTF8 char-name string (char where appl) | ||||
| # --------------------------------------------------------- | ||||
|  | ||||
| import ansel_utf8 as A | ||||
| import unittest | ||||
|  | ||||
| # debugging provision to capture some strings for external examination | ||||
| # note that this debug output is ASCII, by virtue of using `` (repr) | ||||
| OUT=0 | ||||
| if OUT > 0: | ||||
|     import sys | ||||
| #  set output levels 1,2,4 (or-ing ok) for string (repr) in tests 1a,1b,2a | ||||
| #  then manipulate that data with separate tools for additional validation | ||||
| # tools refs: | ||||
| #    http://search.cpan.org/~esummers/MARC-Charset-0.98/lib/MARC/Charset.pm | ||||
| #    http://pypi.python.org/pypi/pymarc | ||||
| # ---  | ||||
| # (perl) MARC::Charset | ||||
| # (python) pymarc omits eszett,euro (patchable); only does ansel-to-utf8  | ||||
| # shell: echo -e 'utf-8 encoded chars' works well | ||||
| # ==> NB: when examining unicode characters (rather than hexdump) externally, | ||||
| # it is absolutely essential to use a good unicode terminal for correct | ||||
| # display of combining forms (other than precomposed)     | ||||
| #    (eg: use xterm rather than konsole or gnome-terminal) | ||||
| # ==> and of course, use a locale with the UTF-8 charset | ||||
|  | ||||
|  | ||||
| # test convenience utility extends python by showing got & expected (like perl) | ||||
| #  useful at least for the commonly used assertEquals() | ||||
| # conventions: | ||||
| #  dup the expected and got parms from the assertEquals and add a message | ||||
| #  (and an optional prefix to distinguish sub-tests) | ||||
| # ==> code the assert as assertEquals(got, exp, msg(got,exp,mess,pfx)) | ||||
def msg(got, expect, msgbase, prefix=""):
    """Build a failure message that shows the got and expected values.

    got     -- the value the code under test produced
    expect  -- the value the test wanted
    msgbase -- description of what was being checked
    prefix  -- optional tag to tell sub-tests apart; when non-empty it
               is placed in front of msgbase followed by ": "

    Both values are rendered with repr(), so the message stays ASCII
    even when the values are not.
    """
    if prefix:
        prefix += ": "
    return "%s%s\n .....got %s\n expected %s" % (
        prefix, msgbase, repr(got), repr(expect))
|  | ||||
|  | ||||
class Test1(unittest.TestCase):
    """ test basic ansel_to_unicode and inversion """

    def _check_table(self, table, debug_bit, label, kind):
        """Round-trip every entry of an ansel-to-unicode table.

        table     -- dict mapping ansel codes to unicode characters
        debug_bit -- bit of OUT that enables the debug dump for this test
        label     -- test name printed in the debug dump
        kind      -- table description used in the failure messages
        """
        codes = sorted(table.keys())
        # build both strings in one step each rather than by repeated +=
        sans = "".join(codes)
        suni = u"".join([table[acode] for acode in codes])
        if OUT & debug_bit:
            print("%s: %d codes" % (label, len(codes)))
            print(" ansel:%s" % repr(sans))
            print(" utf-8:%s" % repr(suni.encode("utf-8")))  # U8 for debugging
            sys.stdout.flush()
        a2u = A.ansel_to_utf8(sans)
        self.assertEqual(a2u, suni,
            msg(a2u, suni, "map %s ansel to unicode" % kind))
        u2a = A.utf8_to_ansel(suni)
        self.assertEqual(u2a, sans,
            msg(u2a, sans, "invert %s to unicode mapping" % kind))

    def test_1a(self):
        """ 1a: map ansel onebyte to unicode and inverse """
        # no combining chars here .. see later test for those
        self._check_table(A._onebyte, 1, "test1a", "onebyte")

    def test_1b(self):
        """ 1b: map ansel twobyte to unicode and inverse """
        # these are the precomposed combining forms
        self._check_table(A._twobyte, 2, "test1b", "twobyte")
|          | ||||
class Test2(unittest.TestCase):
    """ test unicode_to_ansel (basic precomposed forms) and inversion """

    def test_2a(self):
        """ 2a: unicode to ansel and inverse """
        ucodes = sorted(A._utoa.keys())
        # build both strings in one step each rather than by repeated +=
        suni = u"".join(ucodes)
        sans = "".join([A._utoa[ucode] for ucode in ucodes])
        if OUT & 4:
            print("test2a: %d codes" % len(ucodes))
            print(" utf-8:%s" % repr(suni.encode("utf-8")))  # U8
            print(" ansel:%s" % repr(sans))
            sys.stdout.flush()
        u2a = A.utf8_to_ansel(suni)
        self.assertEqual(u2a, sans, msg(u2a, sans, "map unicode to ansel"))
        a2u = A.ansel_to_utf8(sans)
        self.assertEqual(a2u, suni,
            msg(a2u, suni, "invert unicode to ansel mapping"))
|  | ||||
class Test3(unittest.TestCase):
    """ test pass-through for matches with ansel ascii-subset """

    def test3a(self):
        """ 3a: ansel to unicode for matches with ascii and inverse """
        ascii_ok = "".join(A._use_ASCII)
        ascii_uni = unicode(ascii_ok)
        a2u = A.ansel_to_utf8(ascii_ok)
        # could match with lengths wrong? can't hurt to test
        la = len(ascii_ok)
        la2u = len(a2u)
        self.assertEqual(la2u, la,
            msg(la2u, la, "ascii subset ansel to unicode lengths match"))
        self.assertEqual(a2u, ascii_uni,
            msg(a2u, ascii_uni, "ascii subset ansel to unicode strings match"))
        a2u2a = A.utf8_to_ansel(a2u)
        self.assertEqual(a2u2a, ascii_ok,
            msg(a2u2a, ascii_ok, "invert ascii subset ansel to unicode"))

    def test3b(self):
        """ 3b: (sample) non-matching ascii control chars map to space """
        uspace = unicode(' ')
        for x in [0, 1, 8, 9, 11, 26, 28, 127]:
            a2u = A.ansel_to_utf8(chr(x))
            self.assertEqual(a2u, uspace,
                msg(a2u, uspace, "map disallowed ASCII to unicode space"))
            u2a = A.utf8_to_ansel(unichr(x))
            self.assertEqual(u2a, ' ',
                msg(u2a, ' ', "map unicode to space for disallowed ASCII"))

    def test3c(self):
        """ 3c: (sample) no-match ansel to unicode cases """
        for x in [0x80, 0x87, 0x9F, 0xFF]:
            a2u = A.ansel_to_utf8(chr(x))
            self.assertEqual(a2u, u'\ufffd',
                msg(a2u, u'\ufffd',
                    "ansel no-match should return unicode Replacement Char"))

    def test3d(self):
        """ 3d: (sample) no-match unicode to ansel cases """
        for x in [1024, 4096, 65535]:
            u2a = A.utf8_to_ansel(unichr(x))
            self.assertEqual(u2a, '?',
                msg(u2a, '?', "unicode no-match should return question mark"))
|  | ||||
class Test4(unittest.TestCase):
    """ test some special cases """

    def test4a(self):
        """ 4a: empty strings should return empty strings """
        self.assertEqual(A.ansel_to_utf8(""), u"", "empty a2u")
        self.assertEqual(A.utf8_to_ansel(u""), "", "empty u2a")

    def test4b_unmapped_combos(self):
        """ 4b: (sample) unmapped (non-precomposed) combinations """
        samples = (
            # ansel, unicode, failure-report-message .. see function msg()
            ("b\xE5Ze", u"bZ\u0304e", "b Z+macron e"),
            ( "\xE5Z",   u"Z\u0304", "Z+macron"),
            ("b\xE5Z\xE9Xe", u"bZ\u0304X\u030ce", "b Z+macron X+caron e"),
            ( "\xE5Z\xE9X",   u"Z\u0304X\u030c", "Z+macron X+caron"),
        )
        for a, u, m in samples:
            # ansel to unicode and inverse
            a2u = A.ansel_to_utf8(a)
            self.assertEqual(a2u, u, msg(a2u, u, m, "a2u"))
            a2u2a = A.utf8_to_ansel(a2u)
            self.assertEqual(a2u2a, a, msg(a2u2a, a, m, "a2u2a"))

            # unicode to ansel and inverse
            u2a = A.utf8_to_ansel(u)
            self.assertEqual(u2a, a, msg(u2a, a, m, "u2a"))
            u2a2u = A.ansel_to_utf8(u2a)
            self.assertEqual(u2a2u, u, msg(u2a2u, u, m, "u2a2u"))

    def test4c_multiple_combos(self):
        """ 4c: (a2u) ignore multiple combinations (include precomposed) """
        samples = (
            ("b\xF0\xE5Ze", u"bZ\u0304e", "b <cedilla> Z+macron e"),
            ( "\xF0\xE5Z",   u"Z\u0304", "<cedilla> Z+macron"),
            ("\xF0\xE5Z\xE9X", u"Z\u0304X\u030c", "<cedilla> Z+macron X+caron"),
            ("\xE5Z\xF0\xE9X", u"Z\u0304X\u030c", "Z+macron <cedilla> X+caron"),
            ('\xF0\xE5A', u'\u0100', "<cedilla> A+macron"),
            ("\xE5Z\xE5\xF0\xE9X", u"Z\u0304X\u030c", "Z+macron <macron> <cedilla> X+caron"),
        )
        for a, u, m in samples:
            a2u = A.ansel_to_utf8(a)
            self.assertEqual(a2u, u, msg(a2u, u, m, "a2u drop extra <combiners>"))

    def test4d_multiple_combos(self):
        """ 4d: (u2a) ignore multiple combinations (include precomposed) """
        samples = (
            ("b\xE5Ze", u"bZ\u0304\u0327e", "b Z+macron <cedilla> e"),
            ("\xE5Z\xE5A", u"Z\u0304\u0327\u0100", "Z+macron <cedilla> A+macron"),
            ("\xE5A\xE5Z", u"\u0100\u0327\u030cZ\u0304", "A+macron <cedilla> <caron> Z+macron"),
        )
        for a, u, m in samples:
            u2a = A.utf8_to_ansel(u)
            self.assertEqual(u2a, a, msg(u2a, a, m, "u2a drop extra <combiners>"))
|  | ||||
| class Test99(unittest.TestCase): | ||||
|     """ test regression cases """ | ||||
|      | ||||
|     def test_99a(s): | ||||
|         """ 99a: sanity check on counts """ | ||||
|         n1B= len(A._onebyte) | ||||
|         n2B= len(A._twobyte) | ||||
|         na = n1B+n2B | ||||
|         nu = len(A._utoa) | ||||
|         s.assertEquals(na, nu, msg(na, nu, "basic counts: a2u=u2a")) | ||||
|         nac = len(A._acombiners) | ||||
|         nuc = len(A._ucombiners) | ||||
|         s.assertEquals(nac, nuc, msg(nac, nuc, "combiner counts: a2u=u2a")) | ||||
|  | ||||
|     def test_99b(s): | ||||
|         """ 99b: fix incorrect mapping for ansel 0xAE | ||||
|          | ||||
|         It used to be U+02be, but was changed in March 2005 to U+02bc. | ||||
|         Note: the other revisions described in the notes turn the double-wide | ||||
|         combining char halves into an ambiguous mess -- let's ignore that! | ||||
|             http://lcweb2.loc.gov/diglib/codetables/45.html | ||||
|         We might as well validate the other additions, though. | ||||
|         """ | ||||
|          | ||||
|         # (ansel, uni, msg) | ||||
|         revs = ( | ||||
|             ('\xAE', u'\u02bc', "modifier right-half ring is now modifier Apostrophe"), | ||||
|             ('\xC7', u'\xdf',   "added code for eszet"), | ||||
|             ('\xC8', u'\u20ac', "added code for euro"), | ||||
|         ) | ||||
|         for a, u, m in revs: | ||||
|             g = A.ansel_to_utf8(a) | ||||
|             s.assertEquals(g,u,  | ||||
|             msg(g, u, m, "spec change")) | ||||
|  | ||||
| if __name__ == '__main__': | ||||
|     unittest.main() | ||||
|  | ||||
| #===eof=== | ||||
| @@ -72,9 +72,8 @@ class Test1(U.TestCase): | ||||
| # path-related features (note use of tu.msg tested above) | ||||
| class Test2(U.TestCase): | ||||
|     def test2a_context_via_traceback(s): | ||||
|         e = __file__.rstrip(".co")   # eg in *.py[co] | ||||
|         g = tu._caller_context()[0] | ||||
|         g.rstrip('c') | ||||
|         e = os.path.basename(__file__).rstrip(".co")   # eg in *.py[co] | ||||
|         g = os.path.basename(tu._caller_context()[0]).rstrip('co') | ||||
|         s.assertEqual(g,e, tu.msg(g,e, "_caller_context")) | ||||
|    | ||||
|     def test2b_absdir(s): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user