Skip to content

Commit

Permalink
Caret annotations: initial support
Browse files Browse the repository at this point in the history
 * extract Caret annotations in PDF
 * handle IRT (in reply to) property, and expose as inter-Annotation links
 * capture (but don't yet use) the optional NM (annotation name) property
 * when rendering the specific case of a Caret annotation with a single
   StrikeOut annotation as a "reply" (which is how Acrobat seems to render
   replace+insert edits), render this as a "suggested replacement"

Based on the work of Suyash Mahar in #96
  • Loading branch information
0xabu committed Dec 29, 2024
1 parent 50fbc70 commit cf75b79
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 37 deletions.
22 changes: 16 additions & 6 deletions pdfannots/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ def _mkannotation(
"""
Given a PDF annotation, capture relevant fields and construct an Annotation object.
Refer to Section 8.4 of the PDF spec:
https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
Refer to Section 8.4 of the PDF reference (version 1.7).
"""

subtype = pa.get('Subtype')
Expand Down Expand Up @@ -85,13 +84,17 @@ def _mkannotation(
rect = pdftypes.resolve1(pa.get('Rect'))

# QuadPoints are defined only for "markup" annotations (Highlight, Underline, StrikeOut,
# Squiggly), where they specify the quadrilaterals (boxes) covered by the annotation.
# Squiggly, Caret), where they specify the quadrilaterals (boxes) covered by the annotation.
quadpoints = pdftypes.resolve1(pa.get('QuadPoints'))

author = pdftypes.resolve1(pa.get('T'))
if author is not None:
author = pdfminer.utils.decode_text(author)

name = pdftypes.resolve1(pa.get('NM'))
if name is not None:
name = pdfminer.utils.decode_text(name)

created = None
dobj = pa.get('CreationDate')
# some pdf apps set modification date, but not creation date
Expand All @@ -103,8 +106,9 @@ def _mkannotation(
createds = pdfminer.utils.decode_text(createds)
created = decode_datetime(createds)

return Annotation(page, annot_type, quadpoints, rect,
contents, author=author, created=created, color=rgb)
return Annotation(page, annot_type, quadpoints=quadpoints, rect=rect, name=name,
contents=contents, author=author, created=created, color=rgb,
in_reply_to_ref=pa.get('IRT'))


def _get_outlines(doc: PDFDocument) -> typ.Iterator[Outline]:
Expand Down Expand Up @@ -383,6 +387,10 @@ def emit_progress(msg: str) -> None:
o.resolve(page)
page.outlines.append(o)

# Dict from object ID (in the ObjRef) to Annotation object
# This is used while post-processing to resolve inter-annotation references
annots_by_objid: typ.Dict[int, Annotation] = {}

# Construct Annotation objects, and append them to the page.
for pa in pdftypes.resolve1(pdfpage.annots) if pdfpage.annots else []:
if isinstance(pa, pdftypes.PDFObjRef):
Expand All @@ -391,6 +399,8 @@ def emit_progress(msg: str) -> None:
annot = _mkannotation(annot_dict, page)
if annot is not None:
page.annots.append(annot)
assert pa.objid not in annots_by_objid
annots_by_objid[pa.objid] = annot
else:
logger.warning("Unknown annotation: %s", pa)

Expand All @@ -410,7 +420,7 @@ def emit_progress(msg: str) -> None:

# Give the annotations a chance to update their internals
for a in page.annots:
a.postprocess()
a.postprocess(annots_by_objid)

emit_progress("\n")

Expand Down
34 changes: 24 additions & 10 deletions pdfannots/printer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,11 +216,17 @@ def format_annot(
document: Document,
extra: typ.Optional[str] = None
) -> str:
# Limited support for Caret annotations with a single "reply" of type StrikeOut
contents = annot.contents
if (annot.subtype == AnnotationType.Caret and annot.replies and
annot.replies[0].subtype == AnnotationType.StrikeOut):
annot = annot.replies[0]
if annot.contents:
logger.warning("Ignored StrikeOut comment: %s", annot.contents)

# capture item text and contents (i.e. the comment), and split the latter into paragraphs
text = annot.gettext(self.remove_hyphens) or ''
comment = ([l for l in annot.contents.splitlines() if l]
if annot.contents else [])
comment = [l for l in contents.splitlines() if l] if contents else []

if annot.has_context():
assert annot.subtype == AnnotationType.StrikeOut
Expand Down Expand Up @@ -270,13 +276,13 @@ def emit_body(
self,
document: Document
) -> typ.Iterator[str]:
for a in document.iter_annots():
for a in document.iter_annots(include_replies=False):
yield self.format_annot(a, document, a.subtype.name)


class GroupedMarkdownPrinter(MarkdownPrinter):
ANNOT_NITS = frozenset({
AnnotationType.Squiggly, AnnotationType.StrikeOut, AnnotationType.Underline})
ANNOT_NITS = frozenset({AnnotationType.Caret, AnnotationType.Squiggly,
AnnotationType.StrikeOut, AnnotationType.Underline})
ALL_SECTIONS = ["highlights", "comments", "nits"]

def __init__(
Expand Down Expand Up @@ -316,12 +322,12 @@ def fmt_header(name: str, level: int = 2) -> str:
return prefix + header + " " + name + "\n"

# Partition annotations into nits, comments, and highlights.
nits = []
comments = []
highlights = [] # When grouping by color, this holds only the undefined annotations
nits: typ.List[Annotation] = []
comments: typ.List[Annotation] = []
highlights: typ.List[Annotation] = [] # When grouping by color holds only undefined annots
highlights_by_color: typ.DefaultDict[RGB, typ.List[Annotation]] = defaultdict(list)

for a in document.iter_annots():
for a in document.iter_annots(include_replies=False):
if a.subtype in self.ANNOT_NITS:
nits.append(a)
elif a.contents:
Expand Down Expand Up @@ -355,5 +361,13 @@ def fmt_header(name: str, level: int = 2) -> str:
if nits and secname == 'nits':
yield fmt_header("Nits")
for a in nits:
extra = "suggested deletion" if a.subtype == AnnotationType.StrikeOut else None
extra = None
if a.subtype == AnnotationType.Caret:
if a.replies and a.replies[0].subtype == AnnotationType.StrikeOut:
extra = "suggested replacement"
else:
extra = "suggested insertion"
elif a.subtype == AnnotationType.StrikeOut:
extra = "suggested deletion"

yield self.format_annot(a, document, extra)
70 changes: 49 additions & 21 deletions pdfannots/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ def __init__(self, x0: float, y0: float, x1: float, y1: float):
self.y0 = y0
self.y1 = y1

def __repr__(self) -> str:
    """Return a debug string listing the (x0, y0) and (x1, y1) coordinates."""
    # Gather the coordinates in the order the format string expects them.
    coords = (self.x0, self.y0, self.x1, self.y1)
    return '<Box (%f,%f) (%f,%f)>' % coords

@staticmethod
def from_item(item: LTComponent) -> Box:
"""Construct a Box from the bounding box of a given PDF component."""
Expand Down Expand Up @@ -261,6 +264,8 @@ class AnnotationType(enum.Enum):
StrikeOut = enum.auto()
Underline = enum.auto()

Caret = enum.auto()

# A single rectangle, that is abused by some Apple tools to render custom
# highlights. We do not attempt to capture the affected text.
Square = enum.auto()
Expand All @@ -274,35 +279,43 @@ class Annotation(ObjectWithPos):
A PDF annotation, and its extracted text.
Attributes:
subtype PDF annotation type
contents Contents of the annotation in the PDF (e.g. comment/description)
text Text in the order captured (use gettext() for a cleaner form)
author Author of the annotation
created Timestamp the annotation was created
color RGB color of the annotation
contents Contents of the annotation in the PDF (e.g. comment/description)
created Timestamp the annotation was created
in_reply_to Reference to another annotation on the page that this is "in reply to"
last_charseq Sequence number of the most recent character in text
name If present, uniquely identifies this annotation among others on the page
replies Annotations replying to this one (reverse of in_reply_to)
subtype PDF annotation type
text Text in the order captured (use gettext() for a cleaner form)
Attributes updated only for StrikeOut annotations:
Attributes updated for StrikeOut and Caret annotations:
pre_context Text captured just prior to the beginning of 'text'
post_context Text captured just after the end of 'text'
"""

contents: typ.Optional[str]
boxes: typ.List[Box]
text: typ.List[str]
contents: typ.Optional[str]
in_reply_to: typ.Optional[Annotation]
pre_context: typ.Optional[str]
post_context: typ.Optional[str]
replies: typ.List[Annotation]
text: typ.List[str]

def __init__(
self,
page: Page,
subtype: AnnotationType,
quadpoints: typ.Optional[typ.Sequence[float]] = None,
rect: typ.Optional[BoxCoords] = None,
contents: typ.Optional[str] = None,
*,
author: typ.Optional[str] = None,
created: typ.Optional[datetime.datetime] = None,
color: typ.Optional[RGB] = None):
color: typ.Optional[RGB] = None,
contents: typ.Optional[str] = None,
in_reply_to_ref: typ.Optional[PDFObjRef] = None,
name: typ.Optional[str] = None,
quadpoints: typ.Optional[typ.Sequence[float]] = None,
rect: typ.Optional[BoxCoords] = None):

# Construct boxes from quadpoints
boxes = []
Expand All @@ -324,16 +337,22 @@ def __init__(
super().__init__(pos)

# Initialise the attributes
self.subtype = subtype
self.contents = contents if contents else None
self.author = author
self.created = created
self.text = []
self.color = color
self.pre_context = None
self.post_context = None
self.boxes = boxes
self.color = color
self.contents = contents if contents else None
self.created = created
self.name = name
self.last_charseq = 0
self.post_context = None
self.pre_context = None
self.replies = []
self.subtype = subtype
self.text = []

# The in_reply_to reference will be resolved in postprocess()
self._in_reply_to_ref = in_reply_to_ref
self.in_reply_to = None

def __repr__(self) -> str:
return ('<Annotation %s %r%s%s>' %
Expand Down Expand Up @@ -394,8 +413,15 @@ def get_context(self, remove_hyphens: bool = False) -> typ.Tuple[str, str]:
return (merge_lines(self.pre_context or '', remove_hyphens, strip_space=False),
merge_lines(self.post_context or '', remove_hyphens, strip_space=False))

def postprocess(self) -> None:
def postprocess(self, annots_by_objid: typ.Dict[int, Annotation]) -> None:
"""Update internal state once all text and context has been captured."""
# Resolve the in_reply_to object reference to its annotation
if self._in_reply_to_ref is not None:
assert self.in_reply_to is None # This should be called once only
self.in_reply_to = annots_by_objid.get(self._in_reply_to_ref.objid)
if self.in_reply_to is not None:
self.in_reply_to.replies.append(self)

# The Skim PDF reader (https://skim-app.sourceforge.io/) creates annotations whose
# default initial contents are a copy of the selected text. Unless the user goes to
# the trouble of editing each annotation, this goes badly for us because we have
Expand Down Expand Up @@ -466,10 +492,12 @@ class Document:
def __init__(self) -> None:
self.pages = []

def iter_annots(self) -> typ.Iterator[Annotation]:
def iter_annots(self, *, include_replies: bool = True) -> typ.Iterator[Annotation]:
"""Iterate over all the annotations in the document."""
for p in self.pages:
yield from p.annots
for a in p.annots:
if include_replies or not a.in_reply_to:
yield a

def nearest_outline(
self,
Expand Down
15 changes: 15 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,21 @@ def test(self) -> None:
self.assertEqual(self.annots[0].gettext(), None)


class CaretAnnotations(ExtractionTestBase):
    """
    Check the Caret annotation and its StrikeOut reply among the extracted annotations.

    NOTE(review): self.annots is presumably populated by ExtractionTestBase from the
    file named by `filename` -- confirm against the base class.
    """

    filename = 'caret.pdf'

    def test(self) -> None:
        # Five annotations are expected in total.
        self.assertEqual(len(self.annots), 5)

        # Index 1 is the StrikeOut covering the text being replaced.
        strikeout = self.annots[1]
        self.assertEqual(strikeout.subtype, AnnotationType.StrikeOut)
        self.assertEqual(strikeout.gettext(), 'Adobe Acrobat Reader')

        # Index 4 is the Caret whose contents hold the replacement text.
        caret = self.annots[4]
        self.assertEqual(caret.subtype, AnnotationType.Caret)
        self.assertEqual(caret.contents, 'Google Chrome')

        # The StrikeOut replies to the Caret, and the link is visible from both ends.
        self.assertEqual(strikeout.in_reply_to, caret)
        self.assertEqual(caret.replies, [strikeout])

        # Neither annotation has any further reply relationship.
        self.assertEqual(strikeout.replies, [])
        self.assertEqual(caret.in_reply_to, None)


class PrinterTestBase(unittest.TestCase):
filename = 'hotos17.pdf'

Expand Down
Binary file added tests/caret.pdf
Binary file not shown.

0 comments on commit cf75b79

Please sign in to comment.