from binascii import unhexlify
from math import ceil
from typing import Any, Dict, List, Tuple, Union, cast
from ._codecs import adobe_glyphs, charset_encoding
from ._utils import b_, logger_error, logger_warning
from .generic import (
DecodedStreamObject,
DictionaryObject,
IndirectObject,
NullObject,
StreamObject,
)
# code freely inspired from @twiggy ; see #711
def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any], DictionaryObject]:
    """
    Look up a font by name and describe how to decode its text.

    Args:
        font_name: key of the font inside ``/Resources`` -> ``/Font``.
        space_width: default space width if no data is found.
        obj: XObject or Page holding the ``/Resources`` dictionary.

    Returns:
        Font sub-type, space_width criteria (50% of width), encoding,
        character map, and the font dictionary the data was read from.
    """
    font_dict: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
    # The per-font analysis lives in build_char_map_from_dict; this wrapper
    # only resolves the font and appends it to the 4-item result.
    subtype, halfspace, encoding, char_map = build_char_map_from_dict(
        space_width, font_dict
    )
    return subtype, halfspace, encoding, char_map, font_dict
def build_char_map_from_dict(
    space_width: float, ft: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Determine information about a font.

    Args:
        space_width: default space width if no data is found
            (normally half the width of a character).
        ft: Font Dictionary

    Returns:
        Font sub-type, space_width criteria (50% of width), encoding,
        character map.
    """
    font_type: str = cast(str, ft["/Subtype"])

    space_code = 32
    encoding, space_code = parse_encoding(ft, space_code)
    map_dict, space_code, int_entry = parse_to_unicode(ft, space_code)

    # encoding can be either a string for decode
    # (on 1, 2 or a variable number of bytes) or a char table (1 byte only).
    # An empty string means that the /Encoding field is not present and
    # the encoding has to be selected from the cmap input data.
    if encoding == "":
        if -1 not in map_dict or map_dict[-1] == 1:
            # No rule was found for fonts with neither /Encoding nor
            # /ToUnicode; 8-bit encoding is used as the default.
            encoding = "charmap"
        else:
            encoding = "utf-16-be"
    # apply rule from PDF ref 1.7 §5.9.1, 1st bullet:
    #   if cmap not empty encoding should be discarded
    #   (here transformed into identity for those characters)
    # if encoding is a str it is expected to be an identity translation
    elif isinstance(encoding, dict):
        for x in int_entry:
            if x <= 255:
                encoding[x] = chr(x)
    try:
        # override space_width with the known value for this base font
        space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
    except Exception:
        # best effort: no /BaseFont, or not a font with a known width
        pass
    # parse_to_unicode may hand back the space code as a map key (str);
    # convert it back to the integer character code.
    if isinstance(space_code, str):
        try:  # one byte
            sp = space_code.encode("charmap")[0]
        except UnicodeEncodeError:
            # Two-byte code: the bytes are big-endian, so the first byte is
            # the most significant one. "surrogatepass" mirrors the way the
            # map keys were decoded.
            raw = space_code.encode("utf-16-be", "surrogatepass")
            sp = 256 * raw[0] + raw[1]
    else:
        sp = space_code
    sp_width = compute_space_width(ft, sp, space_width)

    return (
        font_type,
        float(sp_width / 2),
        encoding,
        # https://github.com/python/mypy/issues/4374
        map_dict,
    )
# Fallback font description used when data is missing (e.g. the font
# definition is absent): each of the 256 one-byte codes maps to the same
# replacement character and the character map is empty.
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
    "Unknown",
    9999,
    dict.fromkeys(range(256), "�"),
    {},
)
_predefined_cmap: Dict[str, str] = {
"/Identity-H": "utf-16-be",
"/Identity-V": "utf-16-be",
"/GB-EUC-H": "gbk",
"/GB-EUC-V": "gbk",
"/GBpc-EUC-H": "gb2312",
"/GBpc-EUC-V": "gb2312",
"/GBK-EUC-H": "gbk",
"/GBK-EUC-V": "gbk",
"/GBK2K-H": "gb18030",
"/GBK2K-V": "gb18030",
"/ETen-B5-H": "cp950",
"/ETen-B5-V": "cp950",
"/ETenms-B5-H": "cp950",
"/ETenms-B5-V": "cp950",
"/UniCNS-UTF16-H": "utf-16-be",
"/UniCNS-UTF16-V": "utf-16-be",
# UCS2 in code
}
# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
_default_fonts_space_width: Dict[str, int] = {
"/Courier": 600,
"/Courier-Bold": 600,
"/Courier-BoldOblique": 600,
"/Courier-Oblique": 600,
"/Helvetica": 278,
"/Helvetica-Bold": 278,
"/Helvetica-BoldOblique": 278,
"/Helvetica-Oblique": 278,
"/Helvetica-Narrow": 228,
"/Helvetica-NarrowBold": 228,
"/Helvetica-NarrowBoldOblique": 228,
"/Helvetica-NarrowOblique": 228,
"/Times-Roman": 250,
"/Times-Bold": 250,
"/Times-BoldItalic": 250,
"/Times-Italic": 250,
"/Symbol": 250,
"/ZapfDingbats": 278,
}
def parse_encoding(
    ft: DictionaryObject, space_code: int
) -> Tuple[Union[str, Dict[int, str]], int]:
    """
    Derive the text encoding of a font from its ``/Encoding`` entry.

    Args:
        ft: font dictionary.
        space_code: character code currently assumed for the space.

    Returns:
        A tuple ``(encoding, space_code)``. ``encoding`` is either a codec
        name (or the raw, unsupported encoding name), an empty string when
        nothing could be determined, or a table mapping each one-byte code
        to its character. ``space_code`` is the received value unless a
        ``/Differences`` entry redefines it.
    """
    encoding: Union[str, List[str], Dict[int, str]] = []
    if "/Encoding" not in ft:
        # Without /Encoding only the fonts with a known default width are
        # recognised; for them the built-in table (when one exists) or a
        # plain 8-bit decoding is used.
        try:
            base_font = cast(str, ft["/BaseFont"])
            is_known_font = base_font in _default_fonts_space_width
        except Exception:
            is_known_font = False
        if is_known_font:
            if base_font in charset_encoding:
                encoding = dict(zip(range(256), charset_encoding[base_font]))
            else:
                encoding = "charmap"
            # The second item is the space *code*, so the received value is
            # passed through (not the default width of the font).
            return encoding, space_code
        if cast(str, ft["/Subtype"]) == "/Type1":
            return "charmap", space_code
        return "", space_code

    enc: Union[str, DictionaryObject] = ft["/Encoding"].get_object()  # type: ignore
    if isinstance(enc, str):
        # already done : enc = NameObject.unnumber(enc.encode()).decode()
        # for #xx decoding
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        elif "-UCS2-" in enc:
            encoding = "utf-16-be"
        else:
            logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        try:
            encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
        except Exception:
            # report the unsupported base encoding itself
            logger_error(
                f"Advanced encoding {enc['/BaseEncoding']} not implemented yet",
                __name__,
            )
            encoding = charset_encoding["/StandardCoding"].copy()
    else:
        encoding = charset_encoding["/StandardCoding"].copy()

    # /Differences is a dictionary entry; on a name string the "in" test
    # would only be a substring check, hence the isinstance guard.
    if isinstance(enc, DictionaryObject) and "/Differences" in enc:
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, enc["/Differences"]):
            if isinstance(o, int):
                # an integer sets the code of the glyph names that follow
                x = o
                continue
            # glyph name: stored at the current code, then the code advances
            if 0 <= x < len(encoding):
                try:
                    encoding[x] = adobe_glyphs[o]  # type: ignore
                except (KeyError, TypeError):
                    # unknown glyph name: keep the name itself
                    encoding[x] = o  # type: ignore
                    if o == " ":
                        space_code = x
            # codes outside the table are skipped instead of raising
            x += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding, space_code
def parse_to_unicode(
    ft: DictionaryObject, space_code: int
) -> Tuple[Dict[Any, Any], int, List[int]]:
    """
    Build the character map described by the ``/ToUnicode`` entry of a font.

    Args:
        ft: font dictionary.
        space_code: character code currently assumed for the space.

    Returns:
        A tuple ``(map_dict, space_code, int_entry)``: the translation
        table (its ``-1`` key holds the number of bytes per code), the
        space code (replaced by the map key whose value is a space, when
        there is one) and the list of mapped codes as integers.
    """
    # translation table; map_dict[-1] is the number of bytes to convert
    map_dict: Dict[Any, Any] = {}
    # cmap keys as int, used by the caller to correct the encoding
    int_entry: List[int] = []

    if "/ToUnicode" not in ft:
        if ft.get("/Subtype", "") != "/Type1":
            return {}, space_code, []
        return type1_alternative(ft, map_dict, space_code, int_entry)

    in_bfrange: bool = False
    in_bfchar: bool = False
    # state of a bfrange list continued on the following line(s), as
    # returned by process_cm_line ; cf #1285 for example of file
    pending_rg: Union[None, Tuple[int, int]] = None
    for raw_line in prepare_cm(ft).split(b"\n"):
        in_bfrange, in_bfchar, pending_rg = process_cm_line(
            raw_line.strip(b" \t"),
            in_bfrange,
            in_bfchar,
            pending_rg,
            map_dict,
            int_entry,
        )

    # the last key mapped to a space becomes the space code
    for key, mapped in map_dict.items():
        if mapped == " ":
            space_code = key
    return map_dict, space_code, int_entry
def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the ``/ToUnicode`` CMap of a font as normalised bytes.

    The result is reformatted so that it can be processed line by line:
    the ``beginbf*``/``endbf*`` keywords stand on their own lines, hex
    strings lose their ``<``/``>`` delimiters and inner spaces, an empty
    hex string becomes the placeholder ``.`` and brackets are surrounded
    by spaces.

    Args:
        ft: font dictionary containing a ``/ToUnicode`` entry.

    Returns:
        The normalised CMap; empty when the ``/ToUnicode`` entry is neither
        a stream nor an ``/Identity*`` name.
    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = b_(cast(DecodedStreamObject, tu).get_data())
    elif isinstance(tu, str) and tu.startswith("/Identity"):
        # minimal identity range; its 4-digit codes mark the map as 2-byte
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    else:
        # Unsupported entry: report it and continue with an empty map
        # instead of failing on an unbound variable below.
        logger_warning(
            f"Unsupported /ToUnicode entry of type {type(tu).__name__} ignored",
            __name__,
        )
        cm = b""
    if isinstance(cm, str):
        cm = cm.encode()
    # we need to prepare cm before due to missing return line in pdf printed
    # to pdf from word
    cm = (
        cm.strip()
        .replace(b"beginbfchar", b"\nbeginbfchar\n")
        .replace(b"endbfchar", b"\nendbfchar\n")
        .replace(b"beginbfrange", b"\nbeginbfrange\n")
        .replace(b"endbfrange", b"\nendbfrange\n")
        .replace(b"<<", b"\n{\n")  # text between << and >> not used but
        .replace(b">>", b"\n}\n")  # some solution to find it back
    )
    # every chunk after a "<" starts with the content of a hex string
    ll = cm.split(b"<")
    for i, chunk in enumerate(ll):
        j = chunk.find(b">")
        if j < 0:
            continue
        if j == 0:
            # string is empty: stash a placeholder here (read back as an
            # empty mapping by parse_bfchar)
            # see https://github.com/py-pdf/pypdf/issues/1111
            content = b"."
        else:
            content = chunk[:j].replace(b" ", b"")
        ll[i] = content + b" " + chunk[j + 1 :]
    cm = (
        (b" ".join(ll))
        .replace(b"[", b" [ ")
        .replace(b"]", b" ]\n ")
        .replace(b"\r", b"\n")
    )
    return cm
def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, Tuple[int, int]],
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
    """
    Process one line of a prepared CMap and return the updated parser state.

    Args:
        line: the line to process.
        process_rg: True while inside a ``beginbfrange``/``endbfrange`` section.
        process_char: True while inside a ``beginbfchar``/``endbfchar`` section.
        multiline_rg: state of a bfrange list continued from a previous line.
        map_dict: translation table, filled in place.
        int_entry: list of mapped codes, filled in place.

    Returns:
        The updated ``(process_rg, process_char, multiline_rg)``.
    """
    # empty lines and comments leave the state untouched
    if not line or line.startswith(b"%"):
        return process_rg, process_char, multiline_rg

    text = line.replace(b"\t", b" ")
    # section keywords only toggle the matching flag
    if b"beginbfrange" in text:
        return True, process_char, multiline_rg
    if b"endbfrange" in text:
        return False, process_char, multiline_rg
    if b"beginbfchar" in text:
        return process_rg, True, multiline_rg
    if b"endbfchar" in text:
        return process_rg, False, multiline_rg

    # data line: a range section takes precedence over a char section
    if process_rg:
        multiline_rg = parse_bfrange(text, map_dict, int_entry, multiline_rg)
    elif process_char:
        parse_bfchar(text, map_dict, int_entry)
    return process_rg, process_char, multiline_rg
def parse_bfrange(
    line: bytes,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
    multiline_rg: Union[None, Tuple[int, int]],
) -> Union[None, Tuple[int, int]]:
    """
    Parse one line of a ``bfrange`` section into ``map_dict``.

    A line either starts a range (``first last target`` or
    ``first last [ t1 t2 ...``) or, when ``multiline_rg`` is given,
    continues the bracketed list of the previous line.

    Args:
        line: space separated hex tokens (delimiters already removed).
        map_dict: translation table, filled in place; ``map_dict[-1]``
            holds the number of bytes of a source code.
        int_entry: list of mapped source codes, filled in place.
        multiline_rg: ``(next source code, last source code)`` of a list
            left open by the previous line, or None.

    Returns:
        None when the range is complete, else the ``(next source code,
        last source code)`` pair to pass with the following line.
    """
    tokens = [tok for tok in line.split(b" ") if tok]

    if multiline_rg is not None:
        # continuation: the code size was stored when the range started
        src_fmt = b"%%0%dX" % (map_dict[-1] * 2)
        code, last = multiline_rg  # not in the current line
        targets = tokens
    else:
        code = int(tokens[0], 16)
        last = int(tokens[1], 16)
        map_dict[-1] = ceil(max(len(tokens[0]), len(tokens[1])) / 2)
        src_fmt = b"%%0%dX" % (map_dict[-1] * 2)
        if tokens[2] != b"[":
            # case without list: consecutive targets starting at tokens[2]
            target = int(tokens[2], 16)
            dst_fmt = b"%%0%dX" % max(4, len(tokens[2]))
            while code <= last:
                mapped = unhexlify(dst_fmt % target).decode(
                    "utf-16-be", "surrogatepass"
                )
                source = unhexlify(src_fmt % code).decode(
                    "charmap" if map_dict[-1] == 1 else "utf-16-be",
                    "surrogatepass",
                )
                map_dict[source] = mapped
                int_entry.append(code)
                code += 1
                target += 1
            return None
        targets = tokens[3:]

    # explicit list of targets, one per source code, closed by "]"
    for tok in targets:
        if tok == b"]":
            return None
        mapped = unhexlify(tok).decode("utf-16-be", "surrogatepass")
        source = unhexlify(src_fmt % code).decode(
            "charmap" if map_dict[-1] == 1 else "utf-16-be",
            "surrogatepass",
        )
        map_dict[source] = mapped
        int_entry.append(code)
        code += 1
    # list still open: hand the position over to the next line
    return (code, last)
def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
    """
    Parse one line of a ``bfchar`` section into ``map_dict``.

    The line holds ``source target`` pairs of hex tokens; a trailing
    unpaired token is ignored.

    Args:
        line: space separated hex tokens (delimiters already removed).
        map_dict: translation table, filled in place; ``map_dict[-1]`` is
            set to the number of bytes of the first source code.
        int_entry: list of mapped source codes, filled in place.
    """
    tokens = [tok for tok in line.split(b" ") if tok]
    map_dict[-1] = len(tokens[0]) // 2
    for source, target in zip(tokens[0::2], tokens[1::2]):
        if target == b".":
            # placeholder for an empty hex string: maps to an empty string
            mapped = ""
        else:
            # targets shorter than 4 hex digits are single-byte values
            mapped = unhexlify(target).decode(
                "charmap" if len(target) < 4 else "utf-16-be", "surrogatepass"
            )
        key = unhexlify(source).decode(
            "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
        )
        map_dict[key] = mapped
        int_entry.append(int(source, 16))
def compute_space_width(
    ft: DictionaryObject, space_code: int, space_width: float
) -> float:
    """
    Determine the width of the space character of a font.

    Args:
        ft: font dictionary.
        space_code: character code of the space (looked up in ``/W`` of
            the first descendant font, or in ``/Widths``).
        space_width: fallback; twice this value is returned when the font
            has neither ``/DescendantFonts`` nor ``/Widths``.

    Returns:
        The width recorded for ``space_code`` when available. Otherwise a
        fallback: half of ``/DW`` (1000 when absent) for fonts with
        descendants; ``/MissingWidth`` or half the average non-zero width
        for fonts with ``/Widths``.
    """
    sp_width: float = space_width * 2.0  # default value
    w = []
    w1 = {}  # code -> width ; key -1 holds the default width
    st: int = 0
    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
        try:
            w1[-1] = cast(float, ft1["/DW"])
        except Exception:
            w1[-1] = 1000.0
        w = list(ft1["/W"]) if "/W" in ft1 else []
        # /W is a sequence of "c_first c_last w" and "c_first [w1 w2 ...]"
        while len(w) > 0:
            st = w[0] if isinstance(w[0], int) else w[0].get_object()
            second = w[1].get_object()
            if isinstance(second, int):
                # same width for every code from st to second, both
                # included (hence the + 1)
                for x in range(st, second + 1):
                    w1[x] = w[2]
                w = w[3:]
            elif isinstance(second, list):
                # consecutive codes starting at st, one width each
                for y in second:
                    w1[st] = y
                    st += 1
                w = w[2:]
            else:
                logger_warning(
                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
                    __name__,
                )
                break
        try:
            sp_width = w1[space_code]
        except Exception:
            sp_width = (
                w1[-1] / 2.0
            )  # if using default we consider space will be only half size
    elif "/Widths" in ft:
        w = list(ft["/Widths"])  # type: ignore
        try:
            st = cast(int, ft["/FirstChar"])
            en: int = cast(int, ft["/LastChar"])
            # the exceptions raised here only select the fallback below
            if st > space_code or en < space_code:
                raise Exception("Not in range")
            if w[space_code - st] == 0:
                raise Exception("null width")
            sp_width = w[space_code - st]
        except Exception:
            if "/FontDescriptor" in ft and "/MissingWidth" in cast(
                DictionaryObject, ft["/FontDescriptor"]
            ):
                sp_width = ft["/FontDescriptor"]["/MissingWidth"]  # type: ignore
            else:
                # will consider width of char as avg(width)/2
                m = 0
                cpt = 0
                for x in w:
                    if x > 0:
                        m += x
                        cpt += 1
                sp_width = m / max(1, cpt) / 2

    if isinstance(sp_width, IndirectObject):
        # According to
        # 'Table 122 - Entries common to all font descriptors (continued)'
        # the MissingWidth should be a number, but according to #2286 it can
        # be an indirect object
        obj = sp_width.get_object()
        if obj is None or isinstance(obj, NullObject):
            return 0.0
        return obj  # type: ignore

    return sp_width
def type1_alternative(
    ft: DictionaryObject,
    map_dict: Dict[Any, Any],
    space_code: int,
    int_entry: List[int],
) -> Tuple[Dict[Any, Any], int, List[int]]:
    """
    Build the character map from the embedded font program (``/FontFile``).

    The clear-text part of the program is scanned for the
    ``dup <code> <glyph name> put`` lines that follow ``/Encoding``.

    Args:
        ft: font dictionary.
        map_dict: translation table, filled in place.
        space_code: character code currently assumed for the space.
        int_entry: list of mapped codes, filled in place.

    Returns:
        ``(map_dict, space_code, int_entry)``; ``space_code`` is replaced
        by the code of a glyph that maps to a space, when one is found.
        The arguments are returned unchanged when the font has no
        ``/FontDescriptor``, no ``/FontFile`` or no ``/Encoding`` section.
    """
    if "/FontDescriptor" not in ft:
        return map_dict, space_code, int_entry
    ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
    if ft_desc is None:
        return map_dict, space_code, int_entry
    txt = ft_desc.get_object().get_data()
    txt = txt.split(b"eexec\n")[0]  # only clear part
    sections = txt.split(b"/Encoding")
    if len(sections) < 2:
        # no encoding in the clear part: nothing to extract
        return map_dict, space_code, int_entry
    txt = sections[1]  # to get the encoding part
    lines = txt.replace(b"\r", b"\n").split(b"\n")
    for li in lines:
        if not li.startswith(b"dup"):
            continue
        words = [_w for _w in li.split(b" ") if _w != b""]
        # expected shape: dup <code> <glyph name> [put]
        if len(words) < 3:
            continue
        if len(words) > 3 and words[3] != b"put":
            continue
        try:
            i = int(words[1])
        except ValueError:  # pragma: no cover
            continue
        try:
            v = adobe_glyphs[words[2].decode()]
        except KeyError:
            if words[2].startswith(b"/uni"):
                # /uniXXXX glyph names carry the code point in hex
                try:
                    v = chr(int(words[2][4:], 16))
                except ValueError:  # pragma: no cover
                    continue
            else:
                continue
        if v == " ":
            # this code is the space of the font
            space_code = i
        map_dict[chr(i)] = v
        int_entry.append(i)
    return map_dict, space_code, int_entry