# pylint: disable=fixme,protected-access
"""
This module is a work in progress.
Hint tables / hint streams have not been implemented yet,
and there are a few "TODO" comments remaining.
cf. https://github.com/py-pdf/fpdf2/issues/62
"""
from .output import ContentWithoutID, OutputProducer, PDFHeader
from .sign import sign_content
from .syntax import PDFArray, PDFContentStream, PDFObject
from .syntax import iobj_ref as pdf_ref
from .util import buffer_subst
try:
from endesive import signer
except ImportError:
signer = None
# Fixed-width placeholders, written into the file while its final layout is still unknown.
# LinearizedOutputProducer.bufferize() later swaps each of them, through buffer_subst(),
# for a value formatted to the same number of characters.
# 27 characters, replaced by "[<offset> <length>]" where both numbers are 12 characters wide:
HINT_STREAM_OFFSET_LENGTH_PLACEHOLDER = "0%1%2%3%4%5%6%7%8%9%a%b%c%d"
# 12 characters each, replaced by a single number formatted on 12 characters:
FIRST_PAGE_END_OFFSET_PLACEHOLDER = "1%2%3%4%5%6%"
MAIN_XREF_1ST_ENTRY_OFFSET_PLACEHOLDER = "2%3%4%5%6%7%"
FILE_LENGTH_PLACEHOLDER = "3%4%5%6%7%8%"
class PDFLinearization(PDFObject):
    """
    Linearization parameter dictionary (part 2 of a linearized file).

    Apart from the version and the page count, no value is known when the object
    is built: .h, .e, .t and .l start as module-level placeholders
    and .o starts unset.
    """

    def __init__(self, pages_count):
        super().__init__()
        # Version:
        self.linearized = "1"
        # Number of pages, as provided by the caller:
        self.n = pages_count
        # Offset and length of the primary hint stream (part 5):
        self.h = HINT_STREAM_OFFSET_LENGTH_PLACEHOLDER
        # Object number of the first page's page object (part 6):
        self.o = None
        # Offset of the end of the first page:
        self.e = FIRST_PAGE_END_OFFSET_PLACEHOLDER
        # Offset of the first entry in the main cross-reference table (part 11):
        self.t = MAIN_XREF_1ST_ENTRY_OFFSET_PLACEHOLDER
        # Length of the entire file, in bytes:
        self.l = FILE_LENGTH_PLACEHOLDER
class PDFXrefAndTrailer(ContentWithoutID):
    """
    A cross-reference table followed by its trailer, "startxref" and "%%EOF".

    Two instances can be linked to each other through .first_xref / .main_xref:
    the one whose .main_xref is set is the first-page table,
    the one whose .first_xref is set is the main table.
    An instance with neither link set is serialized as a stand-alone table.
    """

    # 12 characters, emitted as /Prev by the first-page trailer and substituted
    # with the right-justified offset of the main table once it is serialized:
    PREV_MAIN_XREF_START_PLACEHOLDER = "0%1*2+3-2/1^"

    def __init__(self, output_builder):
        self.output_builder = output_builder
        # Number of entries, and ID of the first object listed in the table:
        self.count = output_builder.obj_id + 1
        self.start_obj_id = 1
        # Must be set before the call to serialize():
        self.catalog_obj = None
        self.info_obj = None
        self.first_xref = None
        self.main_xref = None
        # Computed at serialize() time based on output_builder.buffer size:
        self.startxref = None

    @property
    def is_first_xref(self):
        "True when this table has been linked to a main table"
        return bool(self.main_xref)

    @property
    def is_main_xref(self):
        "True when this table has been linked to a first-page table"
        return bool(self.first_xref)

    def serialize(self, _security_handler=None):
        """
        Render the table and its trailer as a single newline-joined string.

        Side effects: records the current buffer length in .startxref and,
        for the main table, substitutes the /Prev placeholder in the builder buffer.
        """
        builder = self.output_builder
        is_main, is_first = self.is_main_xref, self.is_first_xref
        self.startxref = str(len(builder.buffer))
        if is_main:
            # The offset of this table is now known: patch the /Prev placeholder.
            builder.buffer = buffer_subst(
                builder.buffer,
                self.PREV_MAIN_XREF_START_PLACEHOLDER,
                self.startxref.rjust(12, " "),
            )

        # Subsection header: a table starting at object 1 is announced as starting at 0
        first_listed_id = 0 if self.start_obj_id == 1 else self.start_obj_id
        lines = ["xref", f"{first_listed_id} {self.count}"]
        if not is_first:
            lines.append("0000000000 65535 f ")
        assert (
            len(builder.offsets) > 1
        ), "TODO: how to know the offsets in the 1st xref at this stage?"
        end_obj_id = self.start_obj_id + self.count
        lines.extend(
            f"{builder.offsets[obj_id]:010} 00000 n "
            for obj_id in range(self.start_obj_id, end_obj_id)
        )

        lines += ["trailer", "<<"]
        if is_main:
            # The main trailer only carries /Size
            lines.append(f"/Size {self.count - self.first_xref.count}")
        else:
            if is_first:
                lines += [
                    f"/Size {self.main_xref.count}",
                    f"/Prev {self.PREV_MAIN_XREF_START_PLACEHOLDER}",
                ]
            else:
                lines.append(f"/Size {self.count}")
            lines += [
                f"/Root {pdf_ref(self.catalog_obj.id)}",
                f"/Info {pdf_ref(self.info_obj.id)}",
            ]
            fpdf = builder.fpdf
            file_id = fpdf.file_id()
            if file_id == -1:
                # -1 requests an ID derived from the buffer content
                file_id = fpdf._default_file_id(builder.buffer)
            if file_id:
                lines.append(f"/ID [{file_id}]")
        lines += [">>", "startxref"]

        if is_first:
            xref_offset = "0"
        elif is_main:
            xref_offset = self.first_xref.startxref
        else:
            xref_offset = self.startxref
        lines += [xref_offset, "%%EOF"]
        return "\n".join(lines)
class PDFHintStream(PDFContentStream):
    """
    Hint stream, whose hint table entries all start unset:

    * s: (Required) Shared object hint table
    * t: (Present only if thumbnail images exist) Thumbnail hint table
    * o: (Present only if a document outline exists) Outline hint table
    * a: (Present only if article threads exist) Thread information hint table
    * e: (Present only if named destinations exist) Named destination hint table
    * v: (Present only if an interactive form dictionary exists) Interactive form hint table
    * i: (Present only if a document information dictionary exists) Information dictionary hint table
    * c: (Present only if a logical structure hierarchy exists; PDF 1.3) Logical structure hint table
    * l: (PDF 1.3) Page label hint table
    * r: (Present only if a renditions name tree exists; PDF 1.5) Renditions name tree hint table
    * b: (Present only if embedded file streams exist; PDF 1.5) Embedded file stream hint table
    """

    def __init__(self, contents, compress=False):
        super().__init__(contents=contents, compress=compress)
        # Same attributes, created in the same order, as listed in the class docstring:
        for hint_table_key in ("s", "t", "o", "a", "e", "v", "i", "c", "l", "r", "b"):
            setattr(self, hint_table_key, None)
class LinearizedOutputProducer(OutputProducer):
    """
    Output producer laying out the PDF objects in the order of a linearized file.

    Work in progress: the hint stream is inserted with empty contents.
    """

    def bufferize(self):
        """
        Build the whole document into self.buffer and return it.

        Three stages are performed in sequence:
        1. all PDF objects are created, in linearized order, and given numeric IDs:
           the objects of the first-page section are numbered after all the others,
           and the hint stream gets the very last ID;
        2. the references between objects are injected;
        3. every object is serialized into the buffer, and the placeholders of the
           linearization dictionary are substituted with the final offsets & lengths.

        If fpdf._sign_key is set, the buffer is passed through sign_content()
        before being returned.
        """
        fpdf = self.fpdf

        # 1. Setup - Insert all PDF objects
        #    (in the order required to build a linearized PDF),
        #    and assign unique consecutive numeric IDs to all of them

        # Part 1: Header
        self.pdf_objs.append(PDFHeader(fpdf.pdf_version))
        # Part 2: Linearization parameter dictionary
        linearization_obj = PDFLinearization(fpdf.pages_count)
        self._add_pdf_obj(linearization_obj)
        # Part 3: First-page cross-reference table and trailer
        first_xref = PDFXrefAndTrailer(self)
        self.pdf_objs.append(first_xref)
        # Part 4: Document catalogue and other required document-level objects
        catalog_obj = self._add_catalog()
        # Part 5: Primary hint stream (may precede or follow part 6)
        hint_stream_obj = PDFHintStream("")  # TODO
        # Appended directly: its ID is only assigned once all other objects are numbered
        self.pdf_objs.append(hint_stream_obj)
        # Part 6: First-page section (may precede or follow part 5)
        page_objs = self._add_pages(slice(0, 1))
        # The following objects shall be contained in the first-page section:
        # + This page object shall explicitly specify all required attributes, e.g. Resources, MediaBox
        # + The entire outline hierarchy, if the PageMode entry in the catalogue is UseOutlines
        # + All objects that the page object refers to [including] Contents, Resources, Annots
        # TODO
        first_xref.count = self.obj_id + 1
        # Snapshot of the objects inserted so far, whose IDs are re-assigned below:
        first_xref_pdf_objs = list(self.pdf_objs)
        # Restart the numbering, so that the remaining objects get the lowest IDs:
        self.obj_id = 0
        # Part 7: Remaining pages
        page_objs.extend(self._add_pages(slice(1, None)))
        # Part 8: Shared objects for all pages except the first
        # = resources, that are referenced from more than one page but [not] from the first page
        pages_root_obj = self._add_pages_root()
        sig_annotation_obj = self._add_annotations_as_objects()
        font_objs_per_index = self._add_fonts()
        img_objs_per_index = self._add_images()
        gfxstate_objs_per_name = self._add_gfxstates()
        resources_dict_obj = self._add_resources_dict(
            font_objs_per_index, img_objs_per_index, gfxstate_objs_per_name
        )
        # Part 9: Objects not associated with pages, if any
        for embedded_file in fpdf.embedded_files:
            self._add_pdf_obj(embedded_file, "embedded_files")
        struct_tree_root_obj = self._add_structure_tree()
        outline_dict_obj, outline_items = self._add_document_outline()
        xmp_metadata_obj = self._add_xmp_metadata()
        info_obj = self._add_info()
        # Part 11: Main cross-reference table and trailer
        main_xref = PDFXrefAndTrailer(self)
        self.pdf_objs.append(main_xref)

        # Re-assigning IDs of all PDF objects in the 1st xref table:
        first_xref.start_obj_id = self.obj_id + 1
        for pdf_obj in first_xref_pdf_objs:
            if (
                not isinstance(pdf_obj, ContentWithoutID)
                and pdf_obj is not hint_stream_obj
            ):
                self.obj_id += 1
                # The new number must be stored in .id: it is the attribute read
                # everywhere else (offsets, references, linearization dictionary).
                pdf_obj.id = self.obj_id
        # The hint streams shall be assigned the last object numbers in the file:
        self.obj_id += 1
        hint_stream_obj.id = self.obj_id

        # 2. Plumbing - Inject all PDF object references required:
        linearization_obj.o = page_objs[0].id
        pages_root_obj.kids = PDFArray(page_objs)
        self._finalize_catalog(
            catalog_obj,
            pages_root_obj=pages_root_obj,
            first_page_obj=page_objs[0],
            sig_annotation_obj=sig_annotation_obj,
            xmp_metadata_obj=xmp_metadata_obj,
            struct_tree_root_obj=struct_tree_root_obj,
            outline_dict_obj=outline_dict_obj,
        )
        dests = []
        for page_obj in page_objs:
            page_obj.parent = pages_root_obj
            page_obj.resources = resources_dict_obj
            for annot in page_obj.annots:
                if annot.dest:
                    dests.append(annot.dest)
                if annot.a and hasattr(annot.a, "dest"):
                    dests.append(annot.a.dest)
            if not page_obj.annots:
                # Avoid serializing an empty PDFArray:
                page_obj.annots = None
        for outline_item in outline_items:
            dests.append(outline_item.dest)
        # Assigning the .page_ref property of all Destination objects:
        for dest in dests:
            dest.page_ref = pdf_ref(page_objs[dest.page_number - 1].id)
        for struct_elem in fpdf.struct_builder.doc_struct_elem.k:
            struct_elem.pg = page_objs[struct_elem.page_number() - 1]
        # Each xref table needs to know the other one in order to serialize itself:
        main_xref.first_xref = first_xref
        first_xref.main_xref = main_xref
        for xref in [main_xref, first_xref]:
            xref.catalog_obj = catalog_obj
            xref.info_obj = info_obj

        # 3. Serializing - Append all PDF objects to the buffer:
        assert (
            not self.buffer
        ), f"Nothing should have been appended to the .buffer at this stage: {self.buffer}"
        assert (
            not self.offsets
        ), f"No offset should have been set at this stage: {len(self.offsets)}"
        for pdf_obj in self.pdf_objs:
            if isinstance(pdf_obj, ContentWithoutID):
                # top header, xref table & trailer:
                trace_label = None
            else:
                # Record where the object starts, for the xref tables:
                self.offsets[pdf_obj.id] = len(self.buffer)
                trace_label = self.trace_labels_per_obj_id.get(pdf_obj.id)
            if trace_label:
                with self._trace_size(trace_label):
                    self._out(pdf_obj.serialize())
            else:
                self._out(pdf_obj.serialize())
        self._log_final_sections_sizes()

        # Now that the file size & all the offsets are known,
        # substitute the values of the Linearization properties:
        hs1_offset = self.offsets[hint_stream_obj.id]
        hs1_length = len(hint_stream_obj.serialize())
        # Every replacement value has the same width as its placeholder:
        self.buffer = buffer_subst(
            self.buffer,
            HINT_STREAM_OFFSET_LENGTH_PLACEHOLDER,
            f"[{hs1_offset: 12d} {hs1_length: 12d}]",
        )
        self.buffer = buffer_subst(
            self.buffer,
            FIRST_PAGE_END_OFFSET_PLACEHOLDER,
            f"{self.offsets[page_objs[0].id + 1]: 12d}",
        )
        self.buffer = buffer_subst(
            self.buffer,
            MAIN_XREF_1ST_ENTRY_OFFSET_PLACEHOLDER,
            f"{self.offsets[main_xref.start_obj_id]: 12d}",
        )
        self.buffer = buffer_subst(
            self.buffer,
            FILE_LENGTH_PLACEHOLDER,
            f"{len(self.buffer): 12d}",
        )

        if fpdf._sign_key:
            self.buffer = sign_content(
                signer,
                self.buffer,
                fpdf._sign_key,
                fpdf._sign_cert,
                fpdf._sign_extra_certs,
                fpdf._sign_hashalgo,
                fpdf._sign_time,
            )
        return self.buffer