diff --git a/pdfannots/__init__.py b/pdfannots/__init__.py index c693a9b..575b1b6 100644 --- a/pdfannots/__init__.py +++ b/pdfannots/__init__.py @@ -49,8 +49,7 @@ def _mkannotation( """ Given a PDF annotation, capture relevant fields and construct an Annotation object. - Refer to Section 8.4 of the PDF spec: - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf + Refer to Section 8.4 of the PDF reference (version 1.7). """ subtype = pa.get('Subtype') @@ -85,13 +84,17 @@ def _mkannotation( rect = pdftypes.resolve1(pa.get('Rect')) # QuadPoints are defined only for "markup" annotations (Highlight, Underline, StrikeOut, - # Squiggly), where they specify the quadrilaterals (boxes) covered by the annotation. + # Squiggly, Caret), where they specify the quadrilaterals (boxes) covered by the annotation. quadpoints = pdftypes.resolve1(pa.get('QuadPoints')) author = pdftypes.resolve1(pa.get('T')) if author is not None: author = pdfminer.utils.decode_text(author) + name = pdftypes.resolve1(pa.get('NM')) + if name is not None: + name = pdfminer.utils.decode_text(name) + created = None dobj = pa.get('CreationDate') # some pdf apps set modification date, but not creation date @@ -103,8 +106,9 @@ def _mkannotation( createds = pdfminer.utils.decode_text(createds) created = decode_datetime(createds) - return Annotation(page, annot_type, quadpoints, rect, - contents, author=author, created=created, color=rgb) + return Annotation(page, annot_type, quadpoints=quadpoints, rect=rect, name=name, + contents=contents, author=author, created=created, color=rgb, + in_reply_to_ref=pa.get('IRT')) def _get_outlines(doc: PDFDocument) -> typ.Iterator[Outline]: @@ -383,6 +387,10 @@ def emit_progress(msg: str) -> None: o.resolve(page) page.outlines.append(o) + # Dict from object ID (in the ObjRef) to Annotation object + # This is used while post-processing to resolve inter-annotation references + annots_by_objid: typ.Dict[int, Annotation] = {} + # 
Construct Annotation objects, and append them to the page. for pa in pdftypes.resolve1(pdfpage.annots) if pdfpage.annots else []: if isinstance(pa, pdftypes.PDFObjRef): @@ -391,6 +399,8 @@ def emit_progress(msg: str) -> None: annot = _mkannotation(annot_dict, page) if annot is not None: page.annots.append(annot) + assert pa.objid not in annots_by_objid + annots_by_objid[pa.objid] = annot else: logger.warning("Unknown annotation: %s", pa) @@ -410,7 +420,7 @@ def emit_progress(msg: str) -> None: # Give the annotations a chance to update their internals for a in page.annots: - a.postprocess() + a.postprocess(annots_by_objid) emit_progress("\n") diff --git a/pdfannots/printer/markdown.py b/pdfannots/printer/markdown.py index 27c8889..bf1c78d 100644 --- a/pdfannots/printer/markdown.py +++ b/pdfannots/printer/markdown.py @@ -216,11 +216,17 @@ def format_annot( document: Document, extra: typ.Optional[str] = None ) -> str: + # Limited support for Caret annotations with a single "reply" of type StrikeOut + contents = annot.contents + if (annot.subtype == AnnotationType.Caret and annot.replies and + annot.replies[0].subtype == AnnotationType.StrikeOut): + annot = annot.replies[0] + if annot.contents: + logger.warning("Ignored StrikeOut comment: %s", annot.contents) # capture item text and contents (i.e. 
the comment), and split the latter into paragraphs text = annot.gettext(self.remove_hyphens) or '' - comment = ([l for l in annot.contents.splitlines() if l] - if annot.contents else []) + comment = [l for l in contents.splitlines() if l] if contents else [] if annot.has_context(): assert annot.subtype == AnnotationType.StrikeOut @@ -270,13 +276,13 @@ def emit_body( self, document: Document ) -> typ.Iterator[str]: - for a in document.iter_annots(): + for a in document.iter_annots(include_replies=False): yield self.format_annot(a, document, a.subtype.name) class GroupedMarkdownPrinter(MarkdownPrinter): - ANNOT_NITS = frozenset({ - AnnotationType.Squiggly, AnnotationType.StrikeOut, AnnotationType.Underline}) + ANNOT_NITS = frozenset({AnnotationType.Caret, AnnotationType.Squiggly, + AnnotationType.StrikeOut, AnnotationType.Underline}) ALL_SECTIONS = ["highlights", "comments", "nits"] def __init__( @@ -316,12 +322,12 @@ def fmt_header(name: str, level: int = 2) -> str: return prefix + header + " " + name + "\n" # Partition annotations into nits, comments, and highlights. 
- nits = [] - comments = [] - highlights = [] # When grouping by color, this holds only the undefined annotations + nits: typ.List[Annotation] = [] + comments: typ.List[Annotation] = [] + highlights: typ.List[Annotation] = [] # When grouping by color holds only undefined annots highlights_by_color: typ.DefaultDict[RGB, typ.List[Annotation]] = defaultdict(list) - for a in document.iter_annots(): + for a in document.iter_annots(include_replies=False): if a.subtype in self.ANNOT_NITS: nits.append(a) elif a.contents: @@ -355,5 +361,13 @@ def fmt_header(name: str, level: int = 2) -> str: if nits and secname == 'nits': yield fmt_header("Nits") for a in nits: - extra = "suggested deletion" if a.subtype == AnnotationType.StrikeOut else None + extra = None + if a.subtype == AnnotationType.Caret: + if a.replies and a.replies[0].subtype == AnnotationType.StrikeOut: + extra = "suggested replacement" + else: + extra = "suggested insertion" + elif a.subtype == AnnotationType.StrikeOut: + extra = "suggested deletion" + yield self.format_annot(a, document, extra) diff --git a/pdfannots/types.py b/pdfannots/types.py index 0a67671..63d8fa9 100644 --- a/pdfannots/types.py +++ b/pdfannots/types.py @@ -33,6 +33,9 @@ def __init__(self, x0: float, y0: float, x1: float, y1: float): self.y0 = y0 self.y1 = y1 + def __repr__(self) -> str: + return '' % (self.x0, self.y0, self.x1, self.y1) + @staticmethod def from_item(item: LTComponent) -> Box: """Construct a Box from the bounding box of a given PDF component.""" @@ -261,6 +264,8 @@ class AnnotationType(enum.Enum): StrikeOut = enum.auto() Underline = enum.auto() + Caret = enum.auto() + # A single rectangle, that is abused by some Apple tools to render custom # highlights. We do not attempt to capture the affected text. Square = enum.auto() @@ -274,35 +279,43 @@ class Annotation(ObjectWithPos): A PDF annotation, and its extracted text. Attributes: - subtype PDF annotation type - contents Contents of the annotation in the PDF (e.g. 
comment/description) - text Text in the order captured (use gettext() for a cleaner form) author Author of the annotation - created Timestamp the annotation was created color RGB color of the annotation + contents Contents of the annotation in the PDF (e.g. comment/description) + created Timestamp the annotation was created + in_reply_to Reference to another annotation on the page that this is "in reply to" last_charseq Sequence number of the most recent character in text + name If present, uniquely identifies this annotation among others on the page + replies Annotations replying to this one (reverse of in_reply_to) + subtype PDF annotation type + text Text in the order captured (use gettext() for a cleaner form) - Attributes updated only for StrikeOut annotations: + Attributes updated for StrikeOut and Caret annotations: pre_context Text captured just prior to the beginning of 'text' post_context Text captured just after the end of 'text' """ - contents: typ.Optional[str] boxes: typ.List[Box] - text: typ.List[str] + contents: typ.Optional[str] + in_reply_to: typ.Optional[Annotation] pre_context: typ.Optional[str] post_context: typ.Optional[str] + replies: typ.List[Annotation] + text: typ.List[str] def __init__( self, page: Page, subtype: AnnotationType, - quadpoints: typ.Optional[typ.Sequence[float]] = None, - rect: typ.Optional[BoxCoords] = None, - contents: typ.Optional[str] = None, + *, author: typ.Optional[str] = None, created: typ.Optional[datetime.datetime] = None, - color: typ.Optional[RGB] = None): + color: typ.Optional[RGB] = None, + contents: typ.Optional[str] = None, + in_reply_to_ref: typ.Optional[PDFObjRef] = None, + name: typ.Optional[str] = None, + quadpoints: typ.Optional[typ.Sequence[float]] = None, + rect: typ.Optional[BoxCoords] = None): # Construct boxes from quadpoints boxes = [] @@ -324,16 +337,22 @@ def __init__( super().__init__(pos) # Initialise the attributes - self.subtype = subtype - self.contents = contents if contents else None 
self.author = author - self.created = created - self.text = [] - self.color = color - self.pre_context = None - self.post_context = None self.boxes = boxes + self.color = color + self.contents = contents if contents else None + self.created = created + self.name = name self.last_charseq = 0 + self.post_context = None + self.pre_context = None + self.replies = [] + self.subtype = subtype + self.text = [] + + # The in_reply_to reference will be resolved in postprocess() + self._in_reply_to_ref = in_reply_to_ref + self.in_reply_to = None def __repr__(self) -> str: return ('' % @@ -394,8 +413,15 @@ def get_context(self, remove_hyphens: bool = False) -> typ.Tuple[str, str]: return (merge_lines(self.pre_context or '', remove_hyphens, strip_space=False), merge_lines(self.post_context or '', remove_hyphens, strip_space=False)) - def postprocess(self) -> None: + def postprocess(self, annots_by_objid: typ.Dict[int, Annotation]) -> None: """Update internal state once all text and context has been captured.""" + # Resolve the in_reply_to object reference to its annotation + if self._in_reply_to_ref is not None: + assert self.in_reply_to is None # This should be called once only + self.in_reply_to = annots_by_objid.get(self._in_reply_to_ref.objid) + if self.in_reply_to is not None: + self.in_reply_to.replies.append(self) + # The Skim PDF reader (https://skim-app.sourceforge.io/) creates annotations whose # default initial contents are a copy of the selected text. 
Unless the user goes to # the trouble of editing each annotation, this goes badly for us because we have @@ -466,10 +492,12 @@ class Document: def __init__(self) -> None: self.pages = [] - def iter_annots(self) -> typ.Iterator[Annotation]: + def iter_annots(self, *, include_replies: bool = True) -> typ.Iterator[Annotation]: """Iterate over all the annotations in the document.""" for p in self.pages: - yield from p.annots + for a in p.annots: + if include_replies or not a.in_reply_to: + yield a def nearest_outline( self, diff --git a/tests.py b/tests.py index ec88103..2ac981f 100755 --- a/tests.py +++ b/tests.py @@ -265,6 +265,21 @@ def test(self) -> None: self.assertEqual(self.annots[0].gettext(), None) +class CaretAnnotations(ExtractionTestBase): + filename = 'caret.pdf' + + def test(self) -> None: + self.assertEqual(len(self.annots), 5) + self.assertEqual(self.annots[1].subtype, AnnotationType.StrikeOut) + self.assertEqual(self.annots[1].gettext(), 'Adobe Acrobat Reader') + self.assertEqual(self.annots[4].subtype, AnnotationType.Caret) + self.assertEqual(self.annots[4].contents, 'Google Chrome') + self.assertEqual(self.annots[1].in_reply_to, self.annots[4]) + self.assertEqual(self.annots[4].replies, [self.annots[1]]) + self.assertEqual(self.annots[1].replies, []) + self.assertEqual(self.annots[4].in_reply_to, None) + + class PrinterTestBase(unittest.TestCase): filename = 'hotos17.pdf' diff --git a/tests/caret.pdf b/tests/caret.pdf new file mode 100644 index 0000000..89c57b7 Binary files /dev/null and b/tests/caret.pdf differ