from binascii import unhexlify
from math import ceil
from typing import Any, Dict, List, Tuple, Union, cast

from ._codecs import adobe_glyphs, charset_encoding
from ._utils import b_, logger_error, logger_warning
from .generic import (
    DecodedStreamObject,
    DictionaryObject,
    IndirectObject,
    NullObject,
    StreamObject,
)


# code freely inspired from @twiggy ; see #711
def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any], DictionaryObject]:
    """
    Determine information about a font.

    Args:
        font_name: font name as a string
        space_width: default space width if no data is found.
        obj: XObject or Page where you can find a /Resources dictionary

    Returns:
        Font sub-type, space_width criteria (50% of width), encoding,
        character map, font dictionary.
        The font dictionary itself is returned for the curious.

    """
    ft: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
    font_subtype, font_halfspace, font_encoding, font_map = build_char_map_from_dict(
        space_width, ft
    )
    return font_subtype, font_halfspace, font_encoding, font_map, ft


def build_char_map_from_dict(
    space_width: float, ft: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Determine information about a font.

    Args:
        space_width: default space width if no data is found
            (normally half the width of a character).
        ft: Font Dictionary

    Returns:
        Font sub-type, space_width criteria (50% of width), encoding,
        character map.

    """
    font_type: str = cast(str, ft["/Subtype"])

    space_code = 32
    encoding, space_code = parse_encoding(ft, space_code)
    map_dict, space_code, int_entry = parse_to_unicode(ft, space_code)

    # encoding can be either a string for decode
    # (on 1, 2 or a variable number of bytes) or a char table (1 byte only).
    # An empty string means that the /Encoding field is not present and
    # we have to select the right encoding from the cmap input data.
    if encoding == "":
        if -1 not in map_dict or map_dict[-1] == 1:
            # I have not been able to find any rule for no /Encoding nor /ToUnicode
            # One example shows /Symbol,bold I consider 8 bits encoding default
            encoding = "charmap"
        else:
            encoding = "utf-16-be"
    # apply rule from PDF ref 1.7 §5.9.1, 1st bullet :
    #   if cmap not empty encoding should be discarded
    #   (here transformed into identity for those characters)
    # if encoding is a str it is expected to be an identity translation
    elif isinstance(encoding, dict):
        for code in int_entry:
            if code <= 255:
                encoding[code] = chr(code)

    try:
        # override space_width with the known width of the base font
        space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
    except Exception:
        # best effort: no /BaseFont, or not one of the fonts in the table
        pass

    # parse_to_unicode may hand back the space code as a cmap key (str);
    # compute_space_width needs the integer character code.
    if isinstance(space_code, str):
        try:  # one byte
            sp = space_code.encode("charmap")[0]
        except Exception:
            # Multi-byte code: cmap keys are decoded with "utf-16-be", i.e.
            # most significant byte first, so rebuild the integer big-endian.
            sp = int.from_bytes(
                space_code.encode("utf-16-be", "surrogatepass"), "big"
            )
    else:
        sp = space_code
    sp_width = compute_space_width(ft, sp, space_width)

    return (
        font_type,
        float(sp_width / 2),
        encoding,
        # https://github.com/python/mypy/issues/4374
        map_dict,
    )


# used when missing data, e.g. font def missing
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
    "Unknown",
    9999,
    dict(zip(range(256), ["�"] * 256)),
    {},
)


# maps a predefined CMap name to the Python codec used to decode its strings
_predefined_cmap: Dict[str, str] = {
    "/Identity-H": "utf-16-be",
    "/Identity-V": "utf-16-be",
    "/GB-EUC-H": "gbk",
    "/GB-EUC-V": "gbk",
    "/GBpc-EUC-H": "gb2312",
    "/GBpc-EUC-V": "gb2312",
    "/GBK-EUC-H": "gbk",
    "/GBK-EUC-V": "gbk",
    "/GBK2K-H": "gb18030",
    "/GBK2K-V": "gb18030",
    "/ETen-B5-H": "cp950",
    "/ETen-B5-V": "cp950",
    "/ETenms-B5-H": "cp950",
    "/ETenms-B5-V": "cp950",
    "/UniCNS-UTF16-H": "utf-16-be",
    "/UniCNS-UTF16-V": "utf-16-be",
    # UCS2 in code
}

# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
_default_fonts_space_width: Dict[str, int] = {
    "/Courier": 600,
    "/Courier-Bold": 600,
    "/Courier-BoldOblique": 600,
    "/Courier-Oblique": 600,
    "/Helvetica": 278,
    "/Helvetica-Bold": 278,
    "/Helvetica-BoldOblique": 278,
    "/Helvetica-Oblique": 278,
    "/Helvetica-Narrow": 228,
    "/Helvetica-NarrowBold": 228,
    "/Helvetica-NarrowBoldOblique": 228,
    "/Helvetica-NarrowOblique": 228,
    "/Times-Roman": 250,
    "/Times-Bold": 250,
    "/Times-BoldItalic": 250,
    "/Times-Italic": 250,
    "/Symbol": 250,
    "/ZapfDingbats": 278,
}


def parse_encoding(
    ft: DictionaryObject, space_code: int
) -> Tuple[Union[str, Dict[int, str]], int]:
    """
    Work out the encoding of a font from its /Encoding entry.

    Args:
        ft: Font Dictionary
        space_code: character code currently assumed to be the space.

    Returns:
        A tuple ``(encoding, space_code)``. ``encoding`` is either a codec
        name (``""`` when nothing could be determined) or a table mapping a
        character code to its string. ``space_code`` is the input value,
        replaced when a /Differences entry maps another code to a space.

    """
    encoding: Union[str, List[str], Dict[int, str]] = []
    if "/Encoding" not in ft:
        try:
            base_font = cast(str, ft["/BaseFont"])
            is_standard_font = base_font in _default_fonts_space_width
        except Exception:
            is_standard_font = False
        if is_standard_font:
            # The second element is the space *code*; the default width of
            # these fonts is applied by build_char_map_from_dict, so it must
            # not be returned here in place of the character code.
            if base_font in charset_encoding:
                return (
                    dict(zip(range(256), charset_encoding[base_font])),
                    space_code,
                )
            return "charmap", space_code
        if cast(str, ft["/Subtype"]) == "/Type1":
            return "charmap", space_code
        return "", space_code

    enc: Union[str, DictionaryObject] = ft["/Encoding"].get_object()  # type: ignore
    if isinstance(enc, str):
        try:
            # already done : enc = NameObject.unnumber(enc.encode()).decode()
            # for #xx decoding
            if enc in charset_encoding:
                encoding = charset_encoding[enc].copy()
            elif enc in _predefined_cmap:
                encoding = _predefined_cmap[enc]
            elif "-UCS2-" in enc:
                encoding = "utf-16-be"
            else:
                raise Exception("not found")
        except Exception:
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding: Any = None
        try:
            base_encoding = enc["/BaseEncoding"]
            encoding = charset_encoding[cast(str, base_encoding)].copy()
        except Exception:
            # report the name that could not be resolved
            logger_error(
                f"Advanced encoding {base_encoding} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardCoding"].copy()
    else:
        encoding = charset_encoding["/StandardCoding"].copy()

    if "/Differences" in enc:
        # /Differences is a flat list: an int sets the current code, each
        # following name is assigned to the current code, which then advances.
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o,str):
                try:
                    encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    encoding[x] = o  # type: ignore
                # compare the resolved character, not the glyph name
                if encoding[x] == " ":  # type: ignore
                    space_code = x
                x += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding, space_code


def parse_to_unicode(
    ft: DictionaryObject, space_code: int
) -> Tuple[Dict[Any, Any], int, List[int]]:
    """
    Build the character map of a font from its /ToUnicode entry.

    Args:
        ft: Font Dictionary
        space_code: character code currently assumed to be the space.

    Returns:
        A tuple ``(map_dict, space_code, int_entry)``. ``map_dict`` maps each
        source code (as a decoded string) to its text; key ``-1`` holds the
        number of bytes per code. ``space_code`` is replaced by the map key
        (a str) of an entry whose text is a single space, if there is one.
        ``int_entry`` lists the mapped codes as integers.

    """
    # will store all translation code
    # and map_dict[-1] we will have the number of bytes to convert
    map_dict: Dict[Any, Any] = {}

    # will provide the list of cmap keys as int to correct encoding
    int_entry: List[int] = []

    if "/ToUnicode" not in ft:
        if ft.get("/Subtype", "") == "/Type1":
            return type1_alternative(ft, map_dict, space_code, int_entry)
        return {}, space_code, []

    process_rg: bool = False
    process_char: bool = False
    # tuple = (current_char, remaining size) ; cf #1285 for example of file
    multiline_rg: Union[None, Tuple[int, int]] = None
    cm = prepare_cm(ft)
    for line in cm.split(b"\n"):
        process_rg, process_char, multiline_rg = process_cm_line(
            line.strip(b" \t"),
            process_rg,
            process_char,
            multiline_rg,
            map_dict,
            int_entry,
        )

    for key, value in map_dict.items():
        if value == " ":
            space_code = key
    return map_dict, space_code, int_entry


def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the /ToUnicode CMap of a font normalised for line-based parsing.

    Section keywords are isolated on their own line, ``<<``/``>>`` are
    replaced by braces, the ``<`` ``>`` hex delimiters are removed (an empty
    ``<>`` becomes the placeholder ``.``) and brackets are surrounded by
    spaces.

    Args:
        ft: Font Dictionary containing a /ToUnicode entry

    Returns:
        The normalised CMap content; empty when /ToUnicode is neither a
        stream nor an ``/Identity*`` name.

    """
    tu = ft["/ToUnicode"]
    # default covers a /ToUnicode that is neither a stream nor /Identity*,
    # which previously left ``cm`` unbound
    cm: bytes = b""
    if isinstance(tu, StreamObject):
        cm = b_(cast(DecodedStreamObject, ft["/ToUnicode"]).get_data())
    elif isinstance(tu, str) and tu.startswith("/Identity"):
        # synthesise a minimal bfrange section with 2-byte codes
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()
    # we need to prepare cm before due to missing return line in pdf printed
    # to pdf from word
    cm = (
        cm.strip()
        .replace(b"beginbfchar", b"\nbeginbfchar\n")
        .replace(b"endbfchar", b"\nendbfchar\n")
        .replace(b"beginbfrange", b"\nbeginbfrange\n")
        .replace(b"endbfrange", b"\nendbfrange\n")
        .replace(b"<<", b"\n{\n")  # text between << and >> not used but
        .replace(b">>", b"\n}\n")  # some solution to find it back
    )
    ll = cm.split(b"<")
    for i, chunk in enumerate(ll):
        j = chunk.find(b">")
        if j < 0:
            continue
        if j == 0:
            # string is empty: stash a placeholder here (see parse_bfchar)
            # see https://github.com/py-pdf/pypdf/issues/1111
            content = b"."
        else:
            content = chunk[:j].replace(b" ", b"")
        ll[i] = content + b" " + chunk[j + 1 :]
    return (
        b" ".join(ll)
        .replace(b"[", b" [ ")
        .replace(b"]", b" ]\n ")
        .replace(b"\r", b"\n")
    )


def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, Tuple[int, int]],
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
    """
    Process one line of a prepared CMap and update the parser state.

    Args:
        line: one line of the content returned by prepare_cm
        process_rg: True while inside a bfrange section
        process_char: True while inside a bfchar section
        multiline_rg: state of a bfrange list continued across lines, or None
        map_dict: character map, updated in place
        int_entry: list of mapped codes, updated in place

    Returns:
        The updated ``(process_rg, process_char, multiline_rg)``.

    """
    if line == b"" or line[0] == 37:  # 37 = %
        return process_rg, process_char, multiline_rg
    line = line.replace(b"\t", b" ")
    if b"beginbfrange" in line:
        process_rg = True
    elif b"endbfrange" in line:
        process_rg = False
    elif b"beginbfchar" in line:
        process_char = True
    elif b"endbfchar" in line:
        process_char = False
    elif process_rg:
        multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg


def _decode_cmap_key(code: int, code_size: int) -> str:
    """Return the map_dict key of an integer source code stored on code_size bytes."""
    fmt = b"%%0%dX" % (code_size * 2)
    return unhexlify(fmt % code).decode(
        "charmap" if code_size == 1 else "utf-16-be",
        "surrogatepass",
    )


def parse_bfrange(
    line: bytes,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
    multiline_rg: Union[None, Tuple[int, int]],
) -> Union[None, Tuple[int, int]]:
    """
    Parse one line of a bfrange section.

    Args:
        line: prepared line (hex strings without ``<`` ``>``)
        map_dict: character map, updated in place; ``map_dict[-1]`` receives
            the number of bytes of the source codes
        int_entry: list of mapped codes, updated in place
        multiline_rg: ``(next code, last code)`` when the line continues a
            bracketed list opened on a previous line, else None

    Returns:
        None when the range is complete, else ``(next code, last code)`` to
        be passed back with the following line.

    """
    lst = [x for x in line.split(b" ") if x]
    if multiline_rg is not None:
        # a, b are not in the current line
        a, b = multiline_rg
        code_size = map_dict[-1]
        targets = lst
    else:
        a = int(lst[0], 16)
        b = int(lst[1], 16)
        nbi = max(len(lst[0]), len(lst[1]))
        code_size = map_dict[-1] = ceil(nbi / 2)
        if lst[2] != b"[":
            # case without list: consecutive codes map to consecutive values
            c = int(lst[2], 16)
            fmt2 = b"%%0%dX" % max(4, len(lst[2]))
            while a <= b:
                map_dict[_decode_cmap_key(a, code_size)] = unhexlify(
                    fmt2 % c
                ).decode("utf-16-be", "surrogatepass")
                int_entry.append(a)
                a += 1
                c += 1
            return None
        targets = lst[3:]

    # bracketed list: one destination per code until the closing bracket
    for sq in targets:
        if sq == b"]":
            return None
        map_dict[_decode_cmap_key(a, code_size)] = unhexlify(sq).decode(
            "utf-16-be", "surrogatepass"
        )
        int_entry.append(a)
        a += 1
    # no closing bracket on this line: the list continues on the next one
    return (a, b)


def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
    """
    Parse one line of a bfchar section (pairs of source code / destination).

    Args:
        line: prepared line (hex strings without ``<`` ``>``)
        map_dict: character map, updated in place; ``map_dict[-1]`` receives
            the number of bytes of the first source code
        int_entry: list of mapped codes, updated in place

    """
    lst = [x for x in line.split(b" ") if x]
    code_size = map_dict[-1] = len(lst[0]) // 2
    key_codec = "charmap" if code_size == 1 else "utf-16-be"
    # a trailing token without partner is ignored
    for src, dst in zip(lst[0::2], lst[1::2]):
        map_to = ""
        # placeholder (see prepare_cm) means empty string
        if dst != b".":
            map_to = unhexlify(dst).decode(
                "charmap" if len(dst) < 4 else "utf-16-be", "surrogatepass"
            )
        map_dict[unhexlify(src).decode(key_codec, "surrogatepass")] = map_to
        int_entry.append(int(src, 16))


def compute_space_width(
    ft: DictionaryObject, space_code: int, space_width: float
) -> float:
    """
    Compute the width of the space character of a font.

    Args:
        ft: Font Dictionary
        space_code: character code of the space
        space_width: default value; twice this value is returned when the
            font has neither /DescendantFonts nor /Widths

    Returns:
        The width found for ``space_code``, or a fallback derived from /DW,
        /MissingWidth or the average of the positive widths.

    """
    sp_width: float = space_width * 2.0  # default value
    w = []
    w1 = {}
    st: int = 0
    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
        try:
            w1[-1] = cast(float, ft1["/DW"])
        except Exception:
            w1[-1] = 1000.0
        w = list(ft1["/W"]) if "/W" in ft1 else []
        pos = 0
        while pos < len(w):
            first = w[pos]
            st = first if isinstance(first, int) else first.get_object()
            second = w[pos + 1].get_object() if pos + 1 < len(w) else None
            if isinstance(second, int) and pos + 2 < len(w):
                # "c_first c_last w": same width for c_first..c_last inclusive
                for x in range(st, second + 1):
                    w1[x] = w[pos + 2]
                pos += 3
            elif isinstance(second, list):
                # "c [w1 w2 ...]": consecutive codes starting at c
                for y in second:
                    w1[st] = y
                    st += 1
                pos += 2
            else:
                # unsupported or truncated entry
                logger_warning(
                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
                    __name__,
                )
                break
        try:
            sp_width = w1[space_code]
        except Exception:
            # if using default we consider space will be only half size
            sp_width = w1[-1] / 2.0
    elif "/Widths" in ft:
        w = list(ft["/Widths"])  # type: ignore
        try:
            st = cast(int, ft["/FirstChar"])
            en: int = cast(int, ft["/LastChar"])
            if st > space_code or en < space_code:
                raise Exception("Not in range")
            if w[space_code - st] == 0:
                raise Exception("null width")
            sp_width = w[space_code - st]
        except Exception:
            if "/FontDescriptor" in ft and "/MissingWidth" in cast(
                DictionaryObject, ft["/FontDescriptor"]
            ):
                sp_width = ft["/FontDescriptor"]["/MissingWidth"]  # type: ignore
            else:
                # will consider width of char as avg(width)/2
                positive = [x for x in w if x > 0]
                sp_width = sum(positive) / max(1, len(positive)) / 2

    if isinstance(sp_width, IndirectObject):
        # According to
        # 'Table 122 - Entries common to all font descriptors (continued)'
        # the MissingWidth should be a number, but according to #2286 it can
        # be an indirect object
        obj = sp_width.get_object()
        if obj is None or isinstance(obj, NullObject):
            return 0.0
        return obj  # type: ignore

    return sp_width


def type1_alternative(
    ft: DictionaryObject,
    map_dict: Dict[Any, Any],
    space_code: int,
    int_entry: List[int],
) -> Tuple[Dict[Any, Any], int, List[int]]:
    """
    Build a character map from the encoding stored in a Type1 font program.

    Reads the ``dup <code> <glyph name> put`` lines that follow ``/Encoding``
    in the clear-text part of the /FontFile stream.

    Args:
        ft: Font Dictionary
        map_dict: character map, updated in place
        space_code: character code currently assumed to be the space
        int_entry: list of mapped codes, updated in place

    Returns:
        ``(map_dict, space_code, int_entry)``; the inputs are returned
        untouched when the font has no /FontDescriptor, no /FontFile or no
        /Encoding section.

    """
    if "/FontDescriptor" not in ft:
        return map_dict, space_code, int_entry
    font_file = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if font_file is None:
        return map_dict, space_code, int_entry
    txt = font_file.get_object().get_data()
    clear_text = txt.split(b"eexec\n")[0]  # only clear part
    sections = clear_text.split(b"/Encoding")
    if len(sections) < 2:
        # no encoding section in the font program
        return map_dict, space_code, int_entry
    lines = sections[1].replace(b"\r", b"\n").split(b"\n")  # the encoding part
    for li in lines:
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        if len(words) < 3:
            # not a complete "dup <code> <name>" statement
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
        except ValueError:  # pragma: no cover
            continue
        try:
            v = adobe_glyphs[words[2].decode()]
        except KeyError:
            if not words[2].startswith(b"/uni"):
                continue
            try:
                v = chr(int(words[2][4:], 16))
            except ValueError:  # pragma: no cover
                continue
        # test the resolved character (the glyph name is never " ")
        if v == " ":
            space_code = i
        map_dict[chr(i)] = v
        int_entry.append(i)
    return map_dict, space_code, int_entry