Skip to content

Commit 79345ed

Browse files
authored
ROB: Repair PDF with invalid Root object (#2880)
Closes #2875.
1 parent c8220c6 commit 79345ed

File tree

4 files changed

+111
-18
lines changed

4 files changed

+111
-18
lines changed

pypdf/_doc_common.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1148,8 +1148,9 @@ def _flatten(
11481148
# Fix issue 327: set flattened_pages attribute only for
11491149
# decrypted file
11501150
catalog = self.root_object
1151-
pages = catalog["/Pages"].get_object() # type: ignore
1152-
assert isinstance(pages, DictionaryObject)
1151+
pages = catalog.get("/Pages").get_object() # type: ignore
1152+
if not isinstance(pages, DictionaryObject):
1153+
raise PdfReadError("Invalid object in /Pages")
11531154
self.flattened_pages = []
11541155

11551156
if PA.TYPE in pages:

pypdf/_reader.py

+33-6
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,8 @@ def __init__(
127127
# map page indirect_reference number to page number
128128
self._page_id2num: Optional[Dict[Any, Any]] = None
129129

130+
self._validated_root: Optional[DictionaryObject] = None
131+
130132
self._initialize_stream(stream)
131133

132134
self._override_encryption = False
@@ -197,10 +199,35 @@ def close(self) -> None:
197199
@property
198200
def root_object(self) -> DictionaryObject:
199201
"""Provide access to "/Root". Standardized with PdfWriter."""
200-
root = self.trailer[TK.ROOT]
201-
if root is None:
202-
raise PdfReadError('Cannot find "/Root" key in trailer')
203-
return cast(DictionaryObject, root.get_object())
202+
if self._validated_root:
203+
return self._validated_root
204+
root = self.trailer.get(TK.ROOT)
205+
if is_null_or_none(root):
206+
logger_warning('Cannot find "/Root" key in trailer', __name__)
207+
elif (
208+
cast(DictionaryObject, cast(PdfObject, root).get_object()).get("/Type")
209+
== "/Catalog"
210+
):
211+
self._validated_root = cast(
212+
DictionaryObject, cast(PdfObject, root).get_object()
213+
)
214+
else:
215+
logger_warning("Invalid Root object in trailer", __name__)
216+
if self._validated_root is None:
217+
logger_warning('Searching object with "/Catalog" key', __name__)
218+
nb = cast(int, self.trailer.get("/Size", 0))
219+
for i in range(nb):
220+
try:
221+
o = self.get_object(i + 1)
222+
except Exception: # to be sure to capture all errors
223+
o = None
224+
if isinstance(o, DictionaryObject) and o.get("/Type") == "/Catalog":
225+
self._validated_root = o
226+
logger_warning(f"Root found at {o.indirect_reference!r}", __name__)
227+
break
228+
if self._validated_root is None:
229+
raise PdfReadError("Cannot find Root object in pdf")
230+
return self._validated_root
204231

205232
@property
206233
def _info(self) -> Optional[DictionaryObject]:
@@ -215,11 +242,11 @@ def _info(self) -> Optional[DictionaryObject]:
215242
return None
216243
else:
217244
info = info.get_object()
218-
if info == None: # noqa: E711
245+
if not isinstance(info, DictionaryObject):
219246
raise PdfReadError(
220247
"Trailer not found or does not point to document information directory"
221248
)
222-
return cast(DictionaryObject, info)
249+
return info
223250

224251
@property
225252
def _ID(self) -> Optional[ArrayObject]:

pypdf/generic/_base.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -879,5 +879,6 @@ def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]
879879
True if x is None or a NullObject, or if x.get_object() resolves to None or a NullObject.
880880
"""
881881
return x is None or (
882-
isinstance(x, PdfObject) and isinstance(x.get_object(), NullObject)
882+
isinstance(x, PdfObject)
883+
and (x.get_object() is None or isinstance(x.get_object(), NullObject))
883884
)

tests/test_reader.py

+73-9
Original file line numberDiff line numberDiff line change
@@ -137,14 +137,14 @@ def test_iss1943():
137137
def test_broken_meta_data(pdf_path):
138138
with open(pdf_path, "rb") as f:
139139
reader = PdfReader(f)
140-
with pytest.raises(
141-
PdfReadError,
142-
match=(
143-
"Trailer not found or does not point to document "
144-
"information directory"
145-
),
146-
):
147-
reader.metadata
140+
assert reader.metadata is None
141+
142+
with open(RESOURCE_ROOT / "crazyones.pdf", "rb") as f:
143+
b = f.read(-1)
144+
reader = PdfReader(BytesIO(b.replace(b"/Info 2 0 R", b"/Info 2 ")))
145+
with pytest.raises(PdfReadError) as exc:
146+
reader.metadata
147+
assert "does not point to document information directory" in repr(exc)
148148

149149

150150
@pytest.mark.parametrize(
@@ -621,7 +621,7 @@ def test_read_unknown_zero_pages(caplog):
621621
assert normalize_warnings(caplog.text) == warnings
622622
with pytest.raises(PdfReadError) as exc:
623623
len(reader.pages)
624-
assert exc.value.args[0] == 'Cannot find "/Root" key in trailer'
624+
assert exc.value.args[0] == "Invalid object in /Pages"
625625

626626

627627
def test_read_encrypted_without_decryption():
@@ -1712,3 +1712,67 @@ def test_unbalanced_brackets_in_dictionary_object(caplog):
17121712
name = "iss2877.pdf" # reused
17131713
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
17141714
assert len(reader.pages) == 43 # note: /Count = 46 but 3 kids are None
1715+
1716+
1717+
@pytest.mark.enable_socket()
1718+
def test_repair_root(caplog):
1719+
"""Cf #2877"""
1720+
url = "https://github.com/user-attachments/files/17162216/crash-6620e8b1abfe3da639b654595da859b87f985748.pdf"
1721+
name = "iss2875.pdf"
1722+
1723+
b = get_data_from_url(url, name=name)
1724+
reader = PdfReader(BytesIO(b))
1725+
assert len(reader.pages) == 1
1726+
assert all(
1727+
msg in caplog.text
1728+
for msg in (
1729+
"Invalid Root object",
1730+
'Searching object with "/Catalog" key',
1731+
"Root found at IndirectObject(2, 0,",
1732+
)
1733+
)
1734+
1735+
# no /Root Entry
1736+
reader = PdfReader(BytesIO(b.replace(b"/Root", b"/Roo ")))
1737+
caplog.clear()
1738+
assert len(reader.pages) == 1
1739+
assert all(
1740+
msg in caplog.text
1741+
for msg in (
1742+
'Cannot find "/Root" key in trailer',
1743+
'Searching object with "/Catalog" key',
1744+
"Root found at IndirectObject(2, 0,",
1745+
)
1746+
)
1747+
1748+
# Invalid /Root Entry
1749+
caplog.clear()
1750+
reader = PdfReader(
1751+
BytesIO(
1752+
b.replace(b"/Root 1 0 R", b"/Root 2 0 R").replace(b"/Catalog", b"/Catalo ")
1753+
)
1754+
)
1755+
with pytest.raises(PdfReadError):
1756+
len(reader.pages)
1757+
assert all(
1758+
msg in caplog.text
1759+
for msg in (
1760+
"Invalid Root object in trailer",
1761+
'Searching object with "/Catalog" key',
1762+
)
1763+
)
1764+
1765+
# Invalid /Root Entry + error in get_object
1766+
caplog.clear()
1767+
b = b.replace(b"/Root 1 0 R", b"/Root 2 0 R").replace(b"/Catalog", b"/Catalo ")
1768+
b = b[:5124] + b"A" + b[5125:]
1769+
reader = PdfReader(BytesIO(b))
1770+
with pytest.raises(PdfReadError):
1771+
len(reader.pages)
1772+
assert all(
1773+
msg in caplog.text
1774+
for msg in (
1775+
"Invalid Root object in trailer",
1776+
'Searching object with "/Catalog" key',
1777+
)
1778+
)

0 commit comments

Comments
 (0)