"""
File for handling specialized encoding tasks or information.
"""

__all__ = [
    'lookupCodePage',
]


# This adds additional encodings to python.
import ebcdic as _

import codecs

from ..exceptions import UnknownCodepageError, UnsupportedEncodingError


# This is a dictionary matching the code page number to its encoding name.
# The list used to make this can be found here:
# https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
#
# An empty string as the value means the code page is recognized but there is
# no encoding name to hand back for it; `lookupCodePage` raises
# `UnsupportedEncodingError` for those entries.
### TODO:
# Many of these code pages are not supported by Python. As such, we should
# really implement them ourselves to make sure that if someone wants to use an
# MSG file with one of those encodings, they are able to. Perhaps we should
# create a separate module for that?
# Code pages that currently don't have a supported encoding will be preceded by
# `# UNSUPPORTED`.
# For some of these, it is also possible that the name we are trying to find
# them with is not known to Python. I have already confirmed this for a few of
# them, and adjusted their names to ones that python would recognize. It is
# possible I missed a few.
_CODE_PAGES = {
    37: 'IBM037', # IBM EBCDIC US-Canada
    437: 'IBM437', # OEM United States
    500: 'IBM500', # IBM EBCDIC International
    708: 'ASMO-708', # Arabic (ASMO 708)
    # UNSUPPORTED.
    709: '', # Arabic (ASMO-449+, BCON V4)
    # UNSUPPORTED.
    710: '', # Arabic - Transparent Arabic
    # UNSUPPORTED.
    720: 'DOS-720', # Arabic (Transparent ASMO); Arabic (DOS)
    737: 'cp737', # OEM Greek (formerly 437G); Greek (DOS)
    775: 'ibm775', # OEM Baltic; Baltic (DOS)
    850: 'ibm850', # OEM Multilingual Latin 1; Western European (DOS)
    852: 'ibm852', # OEM Latin 2; Central European (DOS)
    855: 'IBM855', # OEM Cyrillic (primarily Russian)
    857: 'ibm857', # OEM Turkish; Turkish (DOS)
    858: 'cp858', # OEM Multilingual Latin 1 + Euro symbol
    860: 'IBM860', # OEM Portuguese; Portuguese (DOS)
    861: 'ibm861', # OEM Icelandic; Icelandic (DOS)
    862: 'cp862', # OEM Hebrew; Hebrew (DOS)
    863: 'IBM863', # OEM French Canadian; French Canadian (DOS)
    864: 'IBM864', # OEM Arabic; Arabic (864)
    865: 'IBM865', # OEM Nordic; Nordic (DOS)
    866: 'cp866', # OEM Russian; Cyrillic (DOS)
    869: 'ibm869', # OEM Modern Greek; Greek, Modern (DOS)
    870: 'cp870', # IBM870 # IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
    874: 'windows-874', # ANSI/OEM Thai (ISO 8859-11); Thai (Windows)
    875: 'cp875', # IBM EBCDIC Greek Modern
    932: 'shift_jis', # ANSI/OEM Japanese; Japanese (Shift-JIS)
    936: 'gb2312', # ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
    949: 'ks_c_5601-1987', # ANSI/OEM Korean (Unified Hangul Code)
    # We *must* use a custom encoding because the Python implementation differs
    # from the Microsoft implementation.
    950: 'windows-950', # ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
    1026: 'IBM1026', # IBM EBCDIC Turkish (Latin 5)
    1047: 'cp1047', # IBM EBCDIC Latin 1/Open System
    1140: 'cp1140', # IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
    1141: 'cp1141', # IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
    1142: 'cp1142', # IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
    1143: 'cp1143', # IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
    1144: 'cp1144', # IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
    1145: 'cp1145', # IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
    1146: 'cp1146', # IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
    1147: 'cp1147', # IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
    1148: 'cp1148ms', # IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
    1149: 'cp1149', # IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
    1200: 'utf-16-le', # Unicode UTF-16, little endian byte order (BMP of ISO 10646);
    1201: 'utf-16-be', # Unicode UTF-16, big endian byte order;
    1250: 'windows-1250', # ANSI Central European; Central European (Windows)
    1251: 'windows-1251', # ANSI Cyrillic; Cyrillic (Windows)
    1252: 'windows-1252', # ANSI Latin 1; Western European (Windows)
    1253: 'windows-1253', # ANSI Greek; Greek (Windows)
    1254: 'windows-1254', # ANSI Turkish; Turkish (Windows)
    1255: 'windows-1255', # ANSI Hebrew; Hebrew (Windows)
    1256: 'windows-1256', # ANSI Arabic; Arabic (Windows)
    1257: 'windows-1257', # ANSI Baltic; Baltic (Windows)
    1258: 'windows-1258', # ANSI/OEM Vietnamese; Vietnamese (Windows)
    1361: 'Johab', # Korean (Johab)
    10000: 'macintosh', # MAC Roman; Western European (Mac)
    10001: 'x-mac-japanese', # Japanese (Mac)
    # UNSUPPORTED.
    10002: 'x-mac-chinesetrad', # MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
    10003: 'x-mac-korean', # Korean (Mac)
    # UNSUPPORTED.
    10004: 'x-mac-arabic', # Arabic (Mac)
    # UNSUPPORTED.
    10005: 'x-mac-hebrew', # Hebrew (Mac)
    10006: 'x-mac-greek', # Greek (Mac)
    10007: 'x-mac-cyrillic', # Cyrillic (Mac)
    # UNSUPPORTED.
    10008: 'x-mac-chinesesimp', # MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
    # UNSUPPORTED.
    10010: 'x-mac-romanian', # Romanian (Mac)
    # UNSUPPORTED.
    10017: 'x-mac-ukrainian', # Ukrainian (Mac)
    # UNSUPPORTED.
    10021: 'x-mac-thai', # Thai (Mac)
    10029: 'x-mac-ce', # MAC Latin 2; Central European (Mac)
    10079: 'x-mac-icelandic', # Icelandic (Mac)
    10081: 'x-mac-turkish', # Turkish (Mac)
    # UNSUPPORTED.
    10082: 'x-mac-croatian', # Croatian (Mac)
    # The byte order is spelled out explicitly: Python's plain `utf-32` codec
    # falls back to the host's native byte order when the data has no BOM and
    # prepends a BOM when encoding, whereas this code page is always little
    # endian (compare 1200 and 12001).
    12000: 'utf-32-le', # Unicode UTF-32, little endian byte order
    12001: 'utf-32BE', # Unicode UTF-32, big endian byte order
    # UNSUPPORTED.
    20000: 'x-Chinese_CNS', # CNS Taiwan; Chinese Traditional (CNS)
    # UNSUPPORTED.
    20001: 'x-cp20001', # TCA Taiwan
    # UNSUPPORTED.
    20002: 'x_Chinese-Eten', # Eten Taiwan; Chinese Traditional (Eten)
    # UNSUPPORTED.
    20003: 'x-cp20003', # IBM5550 Taiwan
    # UNSUPPORTED.
    20004: 'x-cp20004', # TeleText Taiwan
    # UNSUPPORTED.
    20005: 'x-cp20005', # Wang Taiwan
    # UNSUPPORTED.
    20105: 'x-IA5', # IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
    # UNSUPPORTED.
    20106: 'x-IA5-German', # IA5 German (7-bit)
    # UNSUPPORTED.
    20107: 'x-IA5-Swedish', # IA5 Swedish (7-bit)
    # UNSUPPORTED.
    20108: 'x-IA5-Norwegian', # IA5 Norwegian (7-bit)
    20127: 'us-ascii', # US-ASCII (7-bit)
    # UNSUPPORTED.
    20261: 'x-cp20261', # T.61
    # UNSUPPORTED.
    20269: 'x-cp20269', # ISO 6937 Non-Spacing Accent
    20273: 'IBM273', # IBM EBCDIC Germany
    20277: 'cp277', # IBM EBCDIC Denmark-Norway
    20278: 'cp278', # IBM EBCDIC Finland-Sweden
    20280: 'cp280', # IBM EBCDIC Italy
    20284: 'cp284', # IBM EBCDIC Latin America-Spain
    20285: 'cp285', # IBM EBCDIC United Kingdom
    20290: 'cp290', # IBM EBCDIC Japanese Katakana Extended
    20297: 'cp297', # IBM EBCDIC France
    20420: 'cp420', # IBM EBCDIC Arabic
    # UNSUPPORTED.
    20423: 'IBM423', # IBM EBCDIC Greek
    20424: 'IBM424', # IBM EBCDIC Hebrew
    20833: 'cp833', # IBM EBCDIC Korean Extended
    20838: 'cp838', # IBM EBCDIC Thai
    20866: 'koi8-r', # Russian (KOI8-R); Cyrillic (KOI8-R)
    20871: 'cp871', # IBM EBCDIC Icelandic
    # UNSUPPORTED.
    20880: 'IBM880', # IBM EBCDIC Cyrillic Russian
    # UNSUPPORTED.
    20905: 'IBM905', # IBM EBCDIC Turkish
    # UNSUPPORTED.
    20924: 'IBM00924', # IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
    20932: 'EUC-JP', # Japanese (JIS 0208-1990 and 0212-1990)
    # UNSUPPORTED.
    20936: 'x-cp20936', # Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
    # UNSUPPORTED.
    20949: 'x-cp20949', # Korean Wansung
    21025: 'cp1025', # IBM EBCDIC Cyrillic Serbian-Bulgarian
    # UNSUPPORTED.
    21027: '', # (deprecated)
    21866: 'koi8-u', # Ukrainian (KOI8-U); Cyrillic (KOI8-U)
    28591: 'iso-8859-1', # ISO 8859-1 Latin 1; Western European (ISO)
    28592: 'iso-8859-2', # ISO 8859-2 Central European; Central European (ISO)
    28593: 'iso-8859-3', # ISO 8859-3 Latin 3
    28594: 'iso-8859-4', # ISO 8859-4 Baltic
    28595: 'iso-8859-5', # ISO 8859-5 Cyrillic
    28596: 'iso-8859-6', # ISO 8859-6 Arabic
    28597: 'iso-8859-7', # ISO 8859-7 Greek
    28598: 'iso-8859-8', # ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
    28599: 'iso-8859-9', # ISO 8859-9 Turkish
    28603: 'iso-8859-13', # ISO 8859-13 Estonian
    28605: 'iso-8859-15', # ISO 8859-15 Latin 9
    # UNSUPPORTED.
    29001: 'x-Europa', # Europa 3
    # UNSUPPORTED.
    38598: 'iso-8859-8-i', # ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
    50220: 'iso-2022-jp', # ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
    50221: 'csISO2022JP', # ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
    50222: 'iso-2022-jp', # ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
    50225: 'iso-2022-kr', # ISO 2022 Korean
    # UNSUPPORTED.
    50227: 'x-cp50227', # ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
    # UNSUPPORTED.
    50229: '', # ISO 2022 Traditional Chinese
    # UNSUPPORTED.
    50930: '', # EBCDIC Japanese (Katakana) Extended
    # UNSUPPORTED.
    50931: '', # EBCDIC US-Canada and Japanese
    # UNSUPPORTED.
    50933: '', # EBCDIC Korean Extended and Korean
    # UNSUPPORTED.
    50935: '', # EBCDIC Simplified Chinese Extended and Simplified Chinese
    # UNSUPPORTED.
    50936: '', # EBCDIC Simplified Chinese
    # UNSUPPORTED.
    50937: '', # EBCDIC US-Canada and Traditional Chinese
    # UNSUPPORTED.
    50939: '', # EBCDIC Japanese (Latin) Extended and Japanese
    51932: 'euc-jp', # EUC Japanese
    51936: 'EUC-CN', # EUC Simplified Chinese; Chinese Simplified (EUC)
    51949: 'euc-kr', # EUC Korean
    # UNSUPPORTED.
    51950: '', # EUC Traditional Chinese
    52936: 'hz-gb-2312', # HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
    54936: 'GB18030', # Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
    # UNSUPPORTED.
    57002: 'x-iscii-de', # ISCII Devanagari
    # UNSUPPORTED.
    57003: 'x-iscii-be', # ISCII Bangla
    # UNSUPPORTED.
    57004: 'x-iscii-ta', # ISCII Tamil
    # UNSUPPORTED.
    57005: 'x-iscii-te', # ISCII Telugu
    # UNSUPPORTED.
    57006: 'x-iscii-as', # ISCII Assamese
    # UNSUPPORTED.
    57007: 'x-iscii-or', # ISCII Odia
    # UNSUPPORTED.
    57008: 'x-iscii-ka', # ISCII Kannada
    # UNSUPPORTED.
    57009: 'x-iscii-ma', # ISCII Malayalam
    # UNSUPPORTED.
    57010: 'x-iscii-gu', # ISCII Gujarati
    # UNSUPPORTED.
    57011: 'x-iscii-pa', # ISCII Punjabi
    65000: 'utf-7', # Unicode (UTF-7)
    65001: 'utf-8', # Unicode (UTF-8)
}


# Register new encodings.
def lookupCodePage(id_: int) -> str:
    """
    Converts an encoding id into its name.

    :param id_: The numeric code page identifier to look up.

    :returns: The encoding name stored for the code page in ``_CODE_PAGES``.

    :raises UnknownCodepageError: The code page was not recognized.
    :raises UnsupportedEncodingError: The code page was recognized, but no
        encoding exists in the environment with support for it.
    """
    if id_ not in _CODE_PAGES:
        raise UnknownCodepageError(f'Unknown code page {id_}.')

    # An empty name marks a code page that is known but has no usable encoding.
    if (page := _CODE_PAGES[id_]):
        return page

    raise UnsupportedEncodingError(f'Code page {id_} is unsupported.')


def _lookupEncoding(name: str):
    """
    Codec search function for the custom encodings defined by this module.

    The keys of ``_codecsInfo`` use underscores. The codec machinery only
    converts hyphens to underscores before calling search functions on
    Python 3.9 and later, so the name is normalized here as well. That way a
    lookup such as ``windows-874`` also resolves on older interpreters. The
    normalization is a no-op for names that are already normalized.

    :param name: The encoding name handed over by the codec registry.

    :returns: The matching entry from ``_codecsInfo``, or ``None`` if this
        module does not provide the requested encoding.
    """
    normalized = name.lower().replace('-', '_').replace(' ', '_')
    return _codecsInfo.get(normalized)


# Register new encodings.
# NOTE(review): these imports sit below the function definitions in the
# original as well; presumably this avoids an import cycle -- confirm before
# moving them to the top of the module.
from .utils import createSBEncoding as _sb, createVBEncoding as _vb
from ._dt import (
    _mac_ce,
    _mac_cyrillic,
    _mac_greek,
    _mac_iceland,
    _mac_turkish,
    _win874_dec,
    _win950_dec,
)


# Maps the normalized (underscore) codec name to the codec entry built from
# its decoding table. `_sb` is used for every entry except `windows_950`,
# which uses `_vb`.
_codecsInfo = {
    'x_mac_ce': _sb('x-mac-ce', _mac_ce.decodingTable),
    'x_mac_cyrillic': _sb('x-mac-cyrillic', _mac_cyrillic.decodingTable),
    'x_mac_greek': _sb('x-mac-greek', _mac_greek.decodingTable),
    'x_mac_icelandic': _sb('x-mac-icelandic', _mac_iceland.decodingTable),
    'x_mac_turkish': _sb('x-mac-turkish', _mac_turkish.decodingTable),
    'windows_950': _vb('windows-950', _win950_dec.decodingTable),
    'windows_874': _sb('windows-874', _win874_dec.decodingTable),
}

codecs.register(_lookupEncoding)