Skip to content

Commit

Permalink
ROB: Gracefully handle some text operators when the operands are missing (#3006)
Browse files Browse the repository at this point in the history

* ROB: Gracefully handle some text operators when the operands are missing

Closes #2975.

* ignore complexity issue for now

* keep coverage for now

Co-authored-by: pubpub-zz <[email protected]>

* fix indentation

* noqa seems to be superfluous now

---------

Co-authored-by: pubpub-zz <[email protected]>
  • Loading branch information
stefan6419846 and pubpub-zz authored Dec 23, 2024
1 parent d24bce5 commit 07f68b4
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 3 deletions.
6 changes: 3 additions & 3 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1980,12 +1980,12 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
memo_tm = tm_matrix.copy()
# Table 5.2 page 398
elif operator == b"Tz":
char_scale = float(operands[0]) / 100.0
char_scale = float(operands[0]) / 100.0 if operands else 1.0
elif operator == b"Tw":
space_scale = 1.0 + float(operands[0])
space_scale = 1.0 + float(operands[0] if operands else 0.0)
elif operator == b"TL":
scale_x = math.sqrt(tm_matrix[0]**2 + tm_matrix[2]**2)
TL = float(operands[0]) * font_size * scale_x
TL = float(operands[0] if operands else 0.0) * font_size * scale_x
elif operator == b"Tf":
if text != "":
output += text # .translate(cmap)
Expand Down
12 changes: 12 additions & 0 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,3 +272,15 @@ def test_infinite_loop_arrays():
page = reader.pages[0]
extracted = page.extract_text()
assert "RNA structure comparison" in extracted


@pytest.mark.enable_socket
def test_tz_with_no_operands():
    """Regression test for issue #2975.

    Downloads the sample PDF attached to the issue and checks that the
    expected sentence appears in the text extracted from the page at
    index 1.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/17974120/9E5E080E-C8DB-4A6B-822B-9A67DC04E526-120438.pdf",
        name="iss2975.pdf",
    )

    # Index 1 selects the second entry of ``reader.pages``.
    target_page = PdfReader(BytesIO(pdf_bytes)).pages[1]
    extracted = target_page.extract_text()
    assert "\nThankyouforyourattentiontothismatter.\n" in extracted

0 comments on commit 07f68b4

Please sign in to comment.