release pText 2.0.0

jorisschellekens · Jun 28, 2021 · a32918e · a32918e
1 parent 6f4970b
commit a32918e
Show file tree

Hide file tree

Showing 473 changed files with 8,570 additions and 2,370 deletions.
diff --git a/EXAMPLES.md b/EXAMPLES.md
@@ -1106,6 +1106,45 @@ The constructor of `PDFToMP3` has some arguments that allow us to tweak the expo
 - `language` : This is the 2-letter abbreviation of the language you expect the text to be in. Default is `en`
 - `slow`: This indicates whether you want the speaking-voice to go (extra) slow, or not
 
+#### 1.9.3 Exporting Markdown to PDF
+
+First we open the markdown file and read its contents:
+
+    markdown_txt: str = ""
+    with open("readme.md", "r") as markdown_file_handle:
+        markdown_txt = markdown_file_handle.read()
+
+Then we call the `convert_markdown_to_pdf` method of the `MarkdownToPDF` class:
+
+    # convert
+    document: Document = MarkdownToPDF.convert_markdown_to_pdf(markdown_txt)
+
+Finally, we store the resulting `Document`:
+
+    # store
+    with open("output.pdf", "wb") as pdf_file_handle:
+        PDF.dumps(pdf_file_handle, document)
+        
+        
+#### 1.9.4 Exporting HTML to PDF
+
+First we open the HTML file and read its contents:
+
+    html_txt: str = ""
+    with open("readme.html", "r") as html_file_handle:
+        html_txt = html_file_handle.read()
+
+Then we call the `convert_html_to_pdf` method of the `HTMLToPDF` class:
+
+    # convert
+    document: Document = HTMLToPDF.convert_html_to_pdf(html_txt)
+
+Finally, we store the resulting `Document`:
+
+    # store
+    with open("output.pdf", "wb") as pdf_file_handle:
+        PDF.dumps(pdf_file_handle, document)
+
 ### 1.10 Concatenating PDFs, and other page-manipulations
 
 A common scenario, when working with existing PDF `Document` objects is concatenation.

diff --git a/README.md b/README.md
@@ -38,7 +38,7 @@ To give you an immediate idea of the way `pText` works, this is the classic `Hel
     from pathlib import Path
 
     from ptext.pdf.canvas.layout.page_layout import SingleColumnLayout
-    from ptext.pdf.canvas.layout.paragraph import Paragraph,
+    from ptext.pdf.canvas.layout.text.paragraph import Paragraph
     from ptext.pdf.document import Document
     from ptext.pdf.page.page import Page
     from ptext.pdf.pdf import PDF

diff --git a/ptext/io/filter/flate_decode.py b/ptext/io/filter/flate_decode.py
@@ -2,9 +2,9 @@
 # -*- coding: utf-8 -*-
 
 """
-    (PDF 1.2) Decompresses data encoded using the zlib/deflate
-    compression method, reproducing the original text or binary
-    data.
+(PDF 1.2) Decompresses data encoded using the zlib/deflate
+compression method, reproducing the original text or binary
+data.
 """
 import copy
 import zlib

diff --git a/ptext/io/filter/lzw_decode.py b/ptext/io/filter/lzw_decode.py
@@ -2,9 +2,9 @@
 # -*- coding: utf-8 -*-
 
 """
-    Decompresses data encoded using the LZW (Lempel-Ziv-
-    Welch) adaptive compression method, reproducing the original
-    text or binary data.
+Decompresses data encoded using the LZW (Lempel-Ziv-Welch)
+adaptive compression method, reproducing the original
+text or binary data.
 """
 import copy
 
@@ -48,7 +48,7 @@ def decode(bytes_in: bytes) -> bytes:
                 entry = copy.deepcopy(w)
                 entry.append(w[0])
             else:
-                assert False
+                assert False, "Unexpected error while performing LZW decode."
             bytes_out.extend(entry)
 
             # Add w+entry[0] to the dictionary.

diff --git a/ptext/io/filter/run_length_decode.py b/ptext/io/filter/run_length_decode.py
@@ -2,10 +2,10 @@
 # -*- coding: utf-8 -*-
 
 """
-    Decompresses data encoded using a byte-oriented run-length
-    encoding algorithm, reproducing the original text or binary data
-    (typically monochrome image data, or any data that contains
-    frequent long runs of a single byte value).
+Decompresses data encoded using a byte-oriented run-length
+encoding algorithm, reproducing the original text or binary data
+(typically monochrome image data, or any data that contains
+frequent long runs of a single byte value).
 """
 
 

diff --git a/ptext/io/filter/stream_decode_util.py b/ptext/io/filter/stream_decode_util.py
@@ -2,8 +2,8 @@
 # -*- coding: utf-8 -*-
 
 """
-    This function decodes a Stream, applying the filters specified in the Filter entry
-    of its stream dictionary
+This function decodes a Stream, applying the filters specified in the Filter entry
+of its stream dictionary
 """
 import typing
 
@@ -19,8 +19,10 @@ def decode_stream(s: Stream) -> Stream:
     This function decodes a Stream, applying the filters specified in the Filter entry
     of its stream dictionary
     """
-    assert isinstance(s, Stream)
-    assert "Bytes" in s
+    assert isinstance(s, Stream), "decode_stream only works on Stream objects"
+    assert (
+        "Bytes" in s
+    ), "decode_stream only works on Stream objects with a `Bytes` key."
 
     # determine filter(s) to apply
     filters: typing.List[str] = []

diff --git a/ptext/io/read/font/read_font_dictionary_transformer.py b/ptext/io/read/font/read_font_dictionary_transformer.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 """
-    This implementation of ReadBaseTransformer is responsible for reading a Font object
+This implementation of ReadBaseTransformer is responsible for reading a Font object
 """
 import io
 import typing

diff --git a/ptext/io/read/function/read_function_dictionary_transformer.py b/ptext/io/read/function/read_function_dictionary_transformer.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 """
-    This implementation of ReadBaseTransformer is responsible for reading a Function Dictionary
+This implementation of ReadBaseTransformer is responsible for reading a Function Dictionary
 """
 import io
 import typing
@@ -82,7 +82,7 @@ def transform(
         xref = parent_object.get_root().get("XRef")
         for k, v in object_to_transform.items():
             if isinstance(v, Reference):
-                v = xref.get_object(v, context.tokenizer.io_source, context.tokenizer)
+                v = xref.get_object(v, context.source, context.tokenizer)
                 transformed_object[k] = v
 
         # convert (remainder of) stream dictionary

diff --git a/ptext/io/read/image/read_ccitt_fax_image_transformer.py b/ptext/io/read/image/read_ccitt_fax_image_transformer.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 """
-    This implementation of ReadBaseTransformer is responsible for reading CCITT fax images
+This implementation of ReadBaseTransformer is responsible for reading CCITT fax images
 """
 import io
 import logging

diff --git a/ptext/io/read/image/read_compressed_jpeg_image_transformer.py b/ptext/io/read/image/read_compressed_jpeg_image_transformer.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 """
-    This implementation of ReadBaseTransformer is responsible for reading a jpeg image object
+This implementation of ReadBaseTransformer is responsible for reading a jpeg image object
 """
 import io
 import logging

diff --git a/ptext/io/read/image/read_grayscale_image_transformer.py b/ptext/io/read/image/read_grayscale_image_transformer.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 """
-    This implementation of ReadBaseTransformer is responsible for reading a grayscale image object
+This implementation of ReadBaseTransformer is responsible for reading a grayscale image object
 """
 import io
 import logging

diff --git a/ptext/io/read/image/read_jbig2_image_transformer.py b/ptext/io/read/image/read_jbig2_image_transformer.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 """
-    This implementation of ReadBaseTransformer is responsible for reading a jbig2 image object
+This implementation of ReadBaseTransformer is responsible for reading a jbig2 image object
 """
 import io
 import logging

diff --git a/ptext/io/read/image/read_jpeg_2000_image_transformer.py b/ptext/io/read/image/read_jpeg_2000_image_transformer.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 """
-    This implementation of ReadBaseTransformer is responsible for reading a jpeg2000 image object
+This implementation of ReadBaseTransformer is responsible for reading a jpeg2000 image object
 """
 import io
 import logging

diff --git a/ptext/io/read/image/read_jpeg_image_transformer.py b/ptext/io/read/image/read_jpeg_image_transformer.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 """
-    This implementation of ReadBaseTransformer is responsible for reading a jpeg image object
+This implementation of ReadBaseTransformer is responsible for reading a jpeg image object
 """
 import io
 import logging

diff --git a/ptext/io/read/metadata/read_xmp_metadata_transformer.py b/ptext/io/read/metadata/read_xmp_metadata_transformer.py
@@ -2,15 +2,15 @@
 # -*- coding: utf-8 -*-
 
 """
-    A metadata stream may be attached to a document through the Metadata entry in the document catalogue
-    (see 7.7.2, “Document Catalog”). The metadata framework provides a date stamp for metadata expressed in
-    the framework. If this date stamp is equal to or later than the document modification date recorded in the
-    document information dictionary, the metadata stream shall be taken as authoritative. If, however, the
-    document modification date recorded in the document information dictionary is later than the metadata
-    stream’s date stamp, the document has likely been saved by a writer that is not aware of metadata streams. In
-    this case, information stored in the document information dictionary shall be taken to override any semantically
-    equivalent items in the metadata stream. In addition, PDF document components represented as a stream or
-    dictionary may have a Metadata entry (see Table 316).
+A metadata stream may be attached to a document through the Metadata entry in the document catalogue
+(see 7.7.2, “Document Catalog”). The metadata framework provides a date stamp for metadata expressed in
+the framework. If this date stamp is equal to or later than the document modification date recorded in the
+document information dictionary, the metadata stream shall be taken as authoritative. If, however, the
+document modification date recorded in the document information dictionary is later than the metadata
+stream’s date stamp, the document has likely been saved by a writer that is not aware of metadata streams. In
+this case, information stored in the document information dictionary shall be taken to override any semantically
+equivalent items in the metadata stream. In addition, PDF document components represented as a stream or
+dictionary may have a Metadata entry (see Table 316).
 """
 import io
 import logging
@@ -80,7 +80,7 @@ def transform(
         try:
             xml_root_orig = ET.fromstring(out_value["DecodedBytes"].decode("latin1"))
 
-            # make copy so that we can add attributes like _parent and _listeners
+            # make copy so that we can add attributes like parent and listeners
             xml_root_out = Element(xml_root_orig.tag)
             xml_root_out.set_parent(parent_object)  # type: ignore [attr-defined]
             for e in xml_root_orig:

diff --git a/ptext/io/read/object/read_array_transformer.py b/ptext/io/read/object/read_array_transformer.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 """
-    This implementation of BaseTransformer converts a PDFArray to a List
+This implementation of BaseTransformer converts a PDFArray to a List
 """
 import io
 import typing

diff --git a/ptext/io/read/object/read_dictionary_transformer.py b/ptext/io/read/object/read_dictionary_transformer.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 """
-    This implementation of ReadBaseTransformer is responsible for reading a Dictionary object
+This implementation of ReadBaseTransformer is responsible for reading a Dictionary object
 """
 import io
 import typing

diff --git a/ptext/io/read/object/read_stream_transformer.py b/ptext/io/read/object/read_stream_transformer.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 """
-    This implementation of ReadBaseTransformer is responsible for reading Stream objects
+This implementation of ReadBaseTransformer is responsible for reading Stream objects
 """
 import io
 import typing
@@ -53,7 +53,7 @@ def transform(
         xref = parent_object.get_root().get("XRef")
         for k, v in object_to_transform.items():
             if isinstance(v, Reference):
-                v = xref.get_object(v, context.tokenizer.io_source, context.tokenizer)
+                v = xref.get_object(v, context.source, context.tokenizer)
                 object_to_transform[k] = v
 
         # apply filter(s)

diff --git a/ptext/io/read/page/read_page_dictionary_transformer.py b/ptext/io/read/page/read_page_dictionary_transformer.py
@@ -2,17 +2,19 @@
 # -*- coding: utf-8 -*-
 
 """
-    This implementation of ReadBaseTransformer is responsible for reading Page objects
+This implementation of ReadBaseTransformer is responsible for reading Page objects
 """
 import io
 import typing
+import zlib
 from typing import Any, Dict, Optional, Union
 
 from ptext.io.read.read_base_transformer import (
     ReadBaseTransformer,
     ReadTransformerContext,
 )
-from ptext.io.read.types import AnyPDFType, Dictionary, List, Stream
+from ptext.io.read.types import AnyPDFType, Dictionary, List, Stream, Name
+from ptext.io.read.types import Decimal as pDecimal
 from ptext.pdf.canvas.canvas import Canvas
 from ptext.pdf.canvas.canvas_stream_processor import CanvasStreamProcessor
 from ptext.pdf.canvas.event.begin_page_event import BeginPageEvent
@@ -70,28 +72,34 @@ def transform(
         # send out BeginPageEvent
         page_out._event_occurred(BeginPageEvent(page_out))
 
-        # set up canvas
+        # check whether `Contents` exists
         if "Contents" not in page_out:
             return
-        if not (
-            isinstance(page_out["Contents"], List)
-            or isinstance(page_out["Contents"], Stream)
+        if not isinstance(page_out["Contents"], List) and not isinstance(
+            page_out["Contents"], Stream
         ):
             return
+
+        # Force content to be Stream (rather than List)
         contents = page_out["Contents"]
-        if contents is not None:
-            canvas = Canvas().set_parent(page_out)  # type: ignore [attr-defined]
-
-            # process bytes in stream
-            if isinstance(contents, Stream):
-                CanvasStreamProcessor(page_out, canvas, []).read(
-                    io.BytesIO(contents["DecodedBytes"])
-                )
-
-            # process bytes in array
-            if isinstance(contents, List):
-                bts = b"".join([x["DecodedBytes"] + b" " for x in contents])
-                CanvasStreamProcessor(page_out, canvas, []).read(io.BytesIO(bts))
+        if isinstance(contents, List):
+            bts = b"".join([x["DecodedBytes"] + b" " for x in contents])
+            page_out[Name("Contents")] = Stream()
+            assert isinstance(page_out["Contents"], Stream)
+            page_out["Contents"][Name("DecodedBytes")] = bts
+            page_out["Contents"][Name("Bytes")] = zlib.compress(bts, 9)
+            page_out["Contents"][Name("Filter")] = Name("FlateDecode")
+            page_out["Contents"][Name("Length")] = pDecimal(len(bts))
+            contents = page_out["Contents"]
+            contents.set_parent(page_out)  # type: ignore [attr-defined]
+
+        # create Canvas
+        canvas = Canvas().set_parent(page_out)  # type: ignore [attr-defined]
+
+        # create CanvasStreamProcessor
+        CanvasStreamProcessor(page_out, canvas, []).read(
+            io.BytesIO(contents["DecodedBytes"])
+        )
 
         # send out EndPageEvent
         page_out._event_occurred(EndPageEvent(page_out))

diff --git a/ptext/io/read/page/read_root_dictionary_transformer.py b/ptext/io/read/page/read_root_dictionary_transformer.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 """
-    This implementation of ReadBaseTransformer is responsible for reading the \Catalog object
+This implementation of ReadBaseTransformer is responsible for reading the /Catalog object
 """
 import io
 import typing
@@ -55,7 +55,7 @@ def transform(
 
         # convert using Dictionary transformer
         transformed_root_dictionary: Optional[Dictionary] = None
-        for t in self.get_root_transformer().children:
+        for t in self.get_root_transformer().get_children():
             if isinstance(t, ReadDictionaryTransformer):
                 transformed_root_dictionary = t.transform(
                     object_to_transform, parent_object, context, []

diff --git a/ptext/io/read/postfix/postfix_eval.py b/ptext/io/read/postfix/postfix_eval.py
@@ -2,12 +2,12 @@
 # -*- coding: utf-8 -*-
 
 """
-    The language that shall be used in a type 4 function contains expressions involving integers, real numbers, and
-    boolean values only. There shall be no composite data structures such as strings or arrays, no procedures, and
-    no variables or names. Table 42 lists the operators that can be used in this type of function. (For more
-    information on these operators, see Appendix B of the PostScript Language Reference, Third Edition.)
-    Although the semantics are those of the corresponding PostScript operators, a full PostScript interpreter is not
-    required.
+The language that shall be used in a type 4 function contains expressions involving integers, real numbers, and
+boolean values only. There shall be no composite data structures such as strings or arrays, no procedures, and
+no variables or names. Table 42 lists the operators that can be used in this type of function. (For more
+information on these operators, see Appendix B of the PostScript Language Reference, Third Edition.)
+Although the semantics are those of the corresponding PostScript operators, a full PostScript interpreter is not
+required.
 """
 import typing
 from decimal import Decimal

diff --git a/ptext/io/read/primitive/read_number_transformer.py b/ptext/io/read/primitive/read_number_transformer.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 """
-    This implementation of ReadBaseTransformer is responsible for reading Decimal objects
+This implementation of ReadBaseTransformer is responsible for reading Decimal objects
 """
 import io
 import typing