Caret annotations: initial support

* extract Caret annotations in PDF * handle IRT (in reply to) property, and expose as inter-Annotation links * capture (but don't yet use) the optional NM (name) property * when rendering the specific case of a Caret annotation with a single StrikeOut annotation as a "reply" (which is how Acrobat seems to render replace+insert edits), render this as a "suggested replacement". Based on the work of Suyash Mahar in #96
0xabu · Dec 29, 2024 · cf75b79 · cf75b79
1 parent 50fbc70
commit cf75b79
Show file tree

Hide file tree

Showing 5 changed files with 104 additions and 37 deletions.
diff --git a/pdfannots/__init__.py b/pdfannots/__init__.py
@@ -49,8 +49,7 @@ def _mkannotation(
     """
     Given a PDF annotation, capture relevant fields and construct an Annotation object.
 
-    Refer to Section 8.4 of the PDF spec:
-    https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
+    Refer to Section 8.4 of the PDF reference (version 1.7).
     """
 
     subtype = pa.get('Subtype')
@@ -85,13 +84,17 @@ def _mkannotation(
     rect = pdftypes.resolve1(pa.get('Rect'))
 
     # QuadPoints are defined only for "markup" annotations (Highlight, Underline, StrikeOut,
-    # Squiggly), where they specify the quadrilaterals (boxes) covered by the annotation.
+    # Squiggly, Caret), where they specify the quadrilaterals (boxes) covered by the annotation.
     quadpoints = pdftypes.resolve1(pa.get('QuadPoints'))
 
     author = pdftypes.resolve1(pa.get('T'))
     if author is not None:
         author = pdfminer.utils.decode_text(author)
 
+    name = pdftypes.resolve1(pa.get('NM'))
+    if name is not None:
+        name = pdfminer.utils.decode_text(name)
+
     created = None
     dobj = pa.get('CreationDate')
     # some pdf apps set modification date, but not creation date
@@ -103,8 +106,9 @@ def _mkannotation(
         createds = pdfminer.utils.decode_text(createds)
         created = decode_datetime(createds)
 
-    return Annotation(page, annot_type, quadpoints, rect,
-                      contents, author=author, created=created, color=rgb)
+    return Annotation(page, annot_type, quadpoints=quadpoints, rect=rect, name=name,
+                      contents=contents, author=author, created=created, color=rgb,
+                      in_reply_to_ref=pa.get('IRT'))
 
 
 def _get_outlines(doc: PDFDocument) -> typ.Iterator[Outline]:
@@ -383,6 +387,10 @@ def emit_progress(msg: str) -> None:
             o.resolve(page)
             page.outlines.append(o)
 
+        # Dict from object ID (in the ObjRef) to Annotation object
+        # This is used while post-processing to resolve inter-annotation references
+        annots_by_objid: typ.Dict[int, Annotation] = {}
+
         # Construct Annotation objects, and append them to the page.
         for pa in pdftypes.resolve1(pdfpage.annots) if pdfpage.annots else []:
             if isinstance(pa, pdftypes.PDFObjRef):
@@ -391,6 +399,8 @@ def emit_progress(msg: str) -> None:
                     annot = _mkannotation(annot_dict, page)
                     if annot is not None:
                         page.annots.append(annot)
+                        assert pa.objid not in annots_by_objid
+                        annots_by_objid[pa.objid] = annot
             else:
                 logger.warning("Unknown annotation: %s", pa)
 
@@ -410,7 +420,7 @@ def emit_progress(msg: str) -> None:
 
         # Give the annotations a chance to update their internals
         for a in page.annots:
-            a.postprocess()
+            a.postprocess(annots_by_objid)
 
     emit_progress("\n")
 

diff --git a/pdfannots/printer/markdown.py b/pdfannots/printer/markdown.py
@@ -216,11 +216,17 @@ def format_annot(
         document: Document,
         extra: typ.Optional[str] = None
     ) -> str:
+        # Limited support for Caret annotations with a single "reply" of type StrikeOut
+        contents = annot.contents
+        if (annot.subtype == AnnotationType.Caret and annot.replies and
+            annot.replies[0].subtype == AnnotationType.StrikeOut):
+            annot = annot.replies[0]
+            if annot.contents:
+                logger.warning("Ignored StrikeOut comment: %s", annot.contents)
 
         # capture item text and contents (i.e. the comment), and split the latter into paragraphs
         text = annot.gettext(self.remove_hyphens) or ''
-        comment = ([l for l in annot.contents.splitlines() if l]
-                   if annot.contents else [])
+        comment = [l for l in contents.splitlines() if l] if contents else []
 
         if annot.has_context():
             assert annot.subtype == AnnotationType.StrikeOut
@@ -270,13 +276,13 @@ def emit_body(
         self,
         document: Document
     ) -> typ.Iterator[str]:
-        for a in document.iter_annots():
+        for a in document.iter_annots(include_replies=False):
             yield self.format_annot(a, document, a.subtype.name)
 
 
 class GroupedMarkdownPrinter(MarkdownPrinter):
-    ANNOT_NITS = frozenset({
-        AnnotationType.Squiggly, AnnotationType.StrikeOut, AnnotationType.Underline})
+    ANNOT_NITS = frozenset({AnnotationType.Caret, AnnotationType.Squiggly,
+                            AnnotationType.StrikeOut, AnnotationType.Underline})
     ALL_SECTIONS = ["highlights", "comments", "nits"]
 
     def __init__(
@@ -316,12 +322,12 @@ def fmt_header(name: str, level: int = 2) -> str:
             return prefix + header + " " + name + "\n"
 
         # Partition annotations into nits, comments, and highlights.
-        nits = []
-        comments = []
-        highlights = []  # When grouping by color, this holds only the undefined annotations
+        nits: typ.List[Annotation] = []
+        comments: typ.List[Annotation] = []
+        highlights: typ.List[Annotation] = []  # When grouping by color holds only undefined annots
         highlights_by_color: typ.DefaultDict[RGB, typ.List[Annotation]] = defaultdict(list)
 
-        for a in document.iter_annots():
+        for a in document.iter_annots(include_replies=False):
             if a.subtype in self.ANNOT_NITS:
                 nits.append(a)
             elif a.contents:
@@ -355,5 +361,13 @@ def fmt_header(name: str, level: int = 2) -> str:
             if nits and secname == 'nits':
                 yield fmt_header("Nits")
                 for a in nits:
-                    extra = "suggested deletion" if a.subtype == AnnotationType.StrikeOut else None
+                    extra = None
+                    if a.subtype == AnnotationType.Caret:
+                        if a.replies and a.replies[0].subtype == AnnotationType.StrikeOut:
+                            extra = "suggested replacement"
+                        else:
+                            extra = "suggested insertion"
+                    elif a.subtype == AnnotationType.StrikeOut:
+                        extra = "suggested deletion"
+
                     yield self.format_annot(a, document, extra)
diff --git a/pdfannots/types.py b/pdfannots/types.py
@@ -33,6 +33,9 @@ def __init__(self, x0: float, y0: float, x1: float, y1: float):
         self.y0 = y0
         self.y1 = y1
 
+    def __repr__(self) -> str:
+        return '<Box (%f,%f) (%f,%f)>' % (self.x0, self.y0, self.x1, self.y1)
+
     @staticmethod
     def from_item(item: LTComponent) -> Box:
         """Construct a Box from the bounding box of a given PDF component."""
@@ -261,6 +264,8 @@ class AnnotationType(enum.Enum):
     StrikeOut = enum.auto()
     Underline = enum.auto()
 
+    Caret = enum.auto()
+
     # A single rectangle, that is abused by some Apple tools to render custom
     # highlights. We do not attempt to capture the affected text.
     Square = enum.auto()
@@ -274,35 +279,43 @@ class Annotation(ObjectWithPos):
     A PDF annotation, and its extracted text.
 
     Attributes:
-        subtype      PDF annotation type
-        contents     Contents of the annotation in the PDF (e.g. comment/description)
-        text         Text in the order captured (use gettext() for a cleaner form)
         author       Author of the annotation
-        created      Timestamp the annotation was created
         color        RGB color of the annotation
+        contents     Contents of the annotation in the PDF (e.g. comment/description)
+        created      Timestamp the annotation was created
+        in_reply_to  Reference to another annotation on the page that this is "in reply to"
         last_charseq Sequence number of the most recent character in text
+        name         If present, uniquely identifies this annotation among others on the page
+        replies      Annotations replying to this one (reverse of in_reply_to)
+        subtype      PDF annotation type
+        text         Text in the order captured (use gettext() for a cleaner form)
 
-    Attributes updated only for StrikeOut annotations:
+    Attributes updated for StrikeOut and Caret annotations:
         pre_context  Text captured just prior to the beginning of 'text'
         post_context Text captured just after the end of 'text'
     """
 
-    contents: typ.Optional[str]
     boxes: typ.List[Box]
-    text: typ.List[str]
+    contents: typ.Optional[str]
+    in_reply_to: typ.Optional[Annotation]
     pre_context: typ.Optional[str]
     post_context: typ.Optional[str]
+    replies: typ.List[Annotation]
+    text: typ.List[str]
 
     def __init__(
             self,
             page: Page,
             subtype: AnnotationType,
-            quadpoints: typ.Optional[typ.Sequence[float]] = None,
-            rect: typ.Optional[BoxCoords] = None,
-            contents: typ.Optional[str] = None,
+            *,
             author: typ.Optional[str] = None,
             created: typ.Optional[datetime.datetime] = None,
-            color: typ.Optional[RGB] = None):
+            color: typ.Optional[RGB] = None,
+            contents: typ.Optional[str] = None,
+            in_reply_to_ref: typ.Optional[PDFObjRef] = None,
+            name: type.Optional[str] = None,
+            quadpoints: typ.Optional[typ.Sequence[float]] = None,
+            rect: typ.Optional[BoxCoords] = None):
 
         # Construct boxes from quadpoints
         boxes = []
@@ -324,16 +337,22 @@ def __init__(
         super().__init__(pos)
 
         # Initialise the attributes
-        self.subtype = subtype
-        self.contents = contents if contents else None
         self.author = author
-        self.created = created
-        self.text = []
-        self.color = color
-        self.pre_context = None
-        self.post_context = None
         self.boxes = boxes
+        self.color = color
+        self.contents = contents if contents else None
+        self.created = created
+        self.name = name
         self.last_charseq = 0
+        self.post_context = None
+        self.pre_context = None
+        self.replies = []
+        self.subtype = subtype
+        self.text = []
+
+        # The in_reply_to reference will be resolved in postprocess()
+        self._in_reply_to_ref = in_reply_to_ref
+        self.in_reply_to = None
 
     def __repr__(self) -> str:
         return ('<Annotation %s %r%s%s>' %
@@ -394,8 +413,15 @@ def get_context(self, remove_hyphens: bool = False) -> typ.Tuple[str, str]:
         return (merge_lines(self.pre_context or '', remove_hyphens, strip_space=False),
                 merge_lines(self.post_context or '', remove_hyphens, strip_space=False))
 
-    def postprocess(self) -> None:
+    def postprocess(self, annots_by_objid: typ.Dict[int, Annotation]) -> None:
         """Update internal state once all text and context has been captured."""
+        # Resolve the in_reply_to object reference to its annotation
+        if self._in_reply_to_ref is not None:
+            assert self.in_reply_to is None # This should be called once only
+            self.in_reply_to = annots_by_objid.get(self._in_reply_to_ref.objid)
+            if self.in_reply_to is not None:
+                self.in_reply_to.replies.append(self)
+
         # The Skim PDF reader (https://skim-app.sourceforge.io/) creates annotations whose
         # default initial contents are a copy of the selected text. Unless the user goes to
         # the trouble of editing each annotation, this goes badly for us because we have
@@ -466,10 +492,12 @@ class Document:
     def __init__(self) -> None:
         self.pages = []
 
-    def iter_annots(self) -> typ.Iterator[Annotation]:
+    def iter_annots(self, *, include_replies: bool = True) -> typ.Iterator[Annotation]:
         """Iterate over all the annotations in the document."""
         for p in self.pages:
-            yield from p.annots
+            for a in p.annots:
+                if include_replies or not a.in_reply_to:
+                    yield a
 
     def nearest_outline(
         self,

diff --git a/tests.py b/tests.py
@@ -265,6 +265,21 @@ def test(self) -> None:
         self.assertEqual(self.annots[0].gettext(), None)
 
 
+class CaretAnnotations(ExtractionTestBase):
+    filename = 'caret.pdf'
+
+    def test(self) -> None:
+        self.assertEqual(len(self.annots), 5)
+        self.assertEqual(self.annots[1].subtype, AnnotationType.StrikeOut)
+        self.assertEqual(self.annots[1].gettext(), 'Adobe Acrobat Reader')
+        self.assertEqual(self.annots[4].subtype, AnnotationType.Caret)
+        self.assertEqual(self.annots[4].contents, 'Google Chrome')
+        self.assertEqual(self.annots[1].in_reply_to, self.annots[4])
+        self.assertEqual(self.annots[4].replies, [self.annots[1]])
+        self.assertEqual(self.annots[1].replies, [])
+        self.assertEqual(self.annots[4].in_reply_to, None)
+
+
 class PrinterTestBase(unittest.TestCase):
     filename = 'hotos17.pdf'
 

diff --git a/tests/caret.pdf b/tests/caret.pdf