
| Current Path : /var/www/wsgi/www/api/venv/lib64/python3.12/site-packages/pyhanko/pdf_utils/ |
Linux ift1.ift-informatik.de 5.4.0-216-generic #236-Ubuntu SMP Fri Apr 11 19:53:21 UTC 2025 x86_64 |
| Current File : /var/www/wsgi/www/api/venv/lib64/python3.12/site-packages/pyhanko/pdf_utils/rw_common.py |
"""Utilities common to reading and writing PDF files."""
from typing import Tuple
from . import generic, misc
from .metadata.model import DocumentMetadata
from .misc import PdfError
__all__ = ['PdfHandler']
class PdfHandler:
    """Abstract class providing a general interface for querying objects
    in PDF readers and writers alike."""

    def get_object(
        self, ref: generic.Reference, as_metadata_stream: bool = False
    ):
        """
        Retrieve the object associated with the provided reference from
        this PDF handler.

        :param ref:
            An instance of :class:`.generic.Reference`.
        :param as_metadata_stream:
            Whether to dereference the object as an XMP metadata stream.
        :return:
            A PDF object.
        """
        raise NotImplementedError

    @property
    def trailer_view(self) -> generic.DictionaryObject:
        """
        Returns a view of the document trailer of the document represented
        by this :class:`.PdfHandler` instance.

        The view is effectively read-only, in the sense that any writes
        will not be reflected in the actual trailer (if the handler supports
        writing, that is).

        :return:
            A :class:`.generic.DictionaryObject` representing the current state
            of the document trailer.
        """
        raise NotImplementedError

    @property
    def document_meta_view(self) -> DocumentMetadata:
        """
        Abstract property; subclasses must provide an implementation.

        :return:
            A :class:`.DocumentMetadata` instance describing the document
            handled by this :class:`.PdfHandler`.
        """
        raise NotImplementedError

    @property
    def root_ref(self) -> generic.Reference:
        """
        :return: A reference to the document catalog of this PDF handler.
        """
        raise NotImplementedError

    @property
    def root(self) -> generic.DictionaryObject:
        """
        :return: The document catalog of this PDF handler.
        """
        root = self.root_ref.get_object()
        # Sanity check: the document catalog has to be a dictionary.
        assert isinstance(root, generic.DictionaryObject)
        return root

    @property
    def document_id(self) -> Tuple[bytes, bytes]:
        """
        Abstract property; subclasses must provide an implementation.

        :return:
            A pair of byte strings identifying the document.
        """
        raise NotImplementedError

    # TODO write tests specifically for this helper function
    def _walk_page_tree(self, page_ix, retrieve_parent):
        """
        Descend the page tree to locate the page with index ``page_ix``.

        :param page_ix:
            The (zero-indexed) number of the page to look up.
            A negative number counts pages from the back of the document,
            with index ``-1`` referring to the last page.
        :param retrieve_parent:
            If ``True``, return information about the ``/Pages`` node that
            contains the page; otherwise, return the page itself.
        :return:
            If ``retrieve_parent`` is ``True``, a triple consisting of the
            reference to the containing ``/Pages`` node, the index of the
            page in that node's ``/Kids`` array and the most recently
            encountered ``/Resources`` entry on the path to that node.
            Otherwise, a pair consisting of the reference to the page and
            the most recently encountered ``/Resources`` entry on the path
            to the page (the page's own entry included).
        :raises PdfError:
            If ``page_ix`` is out of range, or if the page could not be
            found in the tree.
        :raises misc.PdfReadError:
            If the page tree is malformed (``/Pages`` is not an indirect
            reference to a dictionary, a child is not an indirect object,
            or the tree contains a circular reference).
        """
        # the spec says that this will always be an indirect reference
        page_tree_root_ref = self.root.raw_get('/Pages')
        # Raise explicitly instead of asserting: the page tree comes from
        # the document being processed, and asserts vanish under ``-O``.
        if not isinstance(page_tree_root_ref, generic.IndirectObject):
            raise misc.PdfReadError(
                "The /Pages entry in the document catalog must be "
                "an indirect reference"
            )
        page_tree_root = page_tree_root_ref.get_object()
        if not isinstance(page_tree_root, generic.DictionaryObject):
            raise misc.PdfReadError("Page tree root must be a dictionary")

        try:
            root_resources = page_tree_root['/Resources']
        except KeyError:
            root_resources = generic.DictionaryObject()

        page_count = page_tree_root['/Count']
        if page_ix < 0:
            # translate an index relative to the end into an absolute one
            page_ix = page_count + page_ix
        if not (0 <= page_ix < page_count):
            raise PdfError('Page index out of range')

        def _recurse(first_page_ix, pages_obj_ref, last_rsrc_dict, refs_seen):
            # first_page_ix: index of the first page below this /Pages node
            # refs_seen: references of the /Pages nodes on the current path
            pages_obj = pages_obj_ref.get_object()
            kids = pages_obj['/Kids']

            # a /Resources entry on this node overrides the inherited one
            try:
                last_rsrc_dict = pages_obj.raw_get('/Resources')
            except KeyError:
                pass

            cur_page_ix = first_page_ix
            for kid_index, kid_ref in enumerate(kids):
                if not isinstance(kid_ref, generic.IndirectObject):
                    raise misc.PdfReadError(
                        "Page tree node children must be indirect objects"
                    )
                if kid_ref.reference in refs_seen:
                    raise misc.PdfReadError("Circular reference in page tree")
                kid = kid_ref.get_object()
                node_type = kid['/Type']
                if node_type == '/Pages':
                    # recurse into this branch if the page we need
                    # is part of it
                    desc_count = kid['/Count']
                    if cur_page_ix <= page_ix < cur_page_ix + desc_count:
                        return _recurse(
                            cur_page_ix,
                            kid_ref,
                            last_rsrc_dict,
                            refs_seen | {kid_ref.reference},
                        )
                    # otherwise, skip over all pages in this subtree
                    cur_page_ix += desc_count
                elif node_type == '/Page':
                    if cur_page_ix == page_ix:
                        if retrieve_parent:
                            return (pages_obj_ref, kid_index, last_rsrc_dict)
                        else:
                            # the page's own /Resources entry takes
                            # precedence over anything inherited
                            try:
                                last_rsrc_dict = kid.raw_get('/Resources')
                            except KeyError:
                                pass
                            return kid_ref, last_rsrc_dict
                    else:
                        cur_page_ix += 1
            # This means the PDF is not standards-compliant
            raise PdfError('Page not found')

        # Seed the set of visited nodes with the root itself, so that a
        # /Kids entry pointing back at the root is reported as a cycle
        # right away instead of being descended into once more (which
        # could resolve page_ix to the wrong page).
        return _recurse(
            0,
            page_tree_root_ref,
            root_resources,
            {page_tree_root_ref.reference},
        )

    def find_page_container(self, page_ix):
        """
        Retrieve the node in the page tree containing the
        page with index ``page_ix``, along with the necessary objects
        to modify it in an incremental update scenario.

        :param page_ix:
            The (zero-indexed) number of the page for which we want to
            retrieve the parent.
            A negative number counts pages from the back of the document,
            with index ``-1`` referring to the last page.
        :return:
            A triple with the ``/Pages`` object (or a reference to it),
            the index of the target page in said ``/Pages`` object, and a
            (possibly inherited) resource dictionary.
        """
        return self._walk_page_tree(page_ix, retrieve_parent=True)

    def find_page_for_modification(self, page_ix):
        """
        Retrieve the page with index ``page_ix`` from the page tree, along with
        the necessary objects to modify it in an incremental update scenario.

        :param page_ix:
            The (zero-indexed) number of the page to retrieve.
            A negative number counts pages from the back of the document,
            with index ``-1`` referring to the last page.
        :return:
            A tuple with a reference to the page object and a
            (possibly inherited) resource dictionary.
        """
        return self._walk_page_tree(page_ix, retrieve_parent=False)