Skip to content

Commit

Permalink
update master (#424)
Browse files Browse the repository at this point in the history
* Improve speed of partial PDF extraction (#418)

* Some attachment refactoring (#420)

* Add PDF performance script (#419)

* Fix infinite loop in PdfTabbyReader (#421)

* Added article type using grobid (#422)
  • Loading branch information
NastyBoget authored Apr 17, 2024
1 parent 56b44dd commit 16747b0
Show file tree
Hide file tree
Showing 75 changed files with 10,053 additions and 88 deletions.
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ ignore =
ANN101
per-file-ignores =
scripts/*:T201
scripts/benchmark_pdf_performance*:JS101,T201
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.1.1
2.2
1 change: 1 addition & 0 deletions dedoc/api/schema/table_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ class TableMetadata(BaseModel):
page_id: Optional[int] = Field(description="Number of the page where the table starts", example=0)
uid: str = Field(description="Unique identifier of the table", example="e8ba5523-8546-4804-898c-2f4835a1804f")
rotated_angle: float = Field(description="Value of the rotation angle (in degrees) by which the table was rotated during recognition", example=1.0)
title: str = Field(description="Table's title")
1 change: 1 addition & 0 deletions dedoc/api/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ <h4>Type of document structure parsing</h4>
<option value="law">law</option>
<option value="tz">tz</option>
<option value="diploma">diploma</option>
<option value="article">article</option>
</select> document_type
</label>
</p>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .docx extension)
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.docx_like_format or mime in recognized_mimes.docx_like_format

def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .xlsx extension)
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.excel_like_format or mime in recognized_mimes.excel_like_format

def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .json extension)
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower().endswith(".json")

def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.extensions import recognized_extensions, recognized_mimes
from dedoc.extensions import recognized_mimes
from dedoc.utils.utils import convert_datetime, get_mime_extension, get_unique_name


Expand All @@ -28,8 +28,8 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .pdf extension)
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.docx_like_format or mime in recognized_mimes.docx_like_format
mime, _ = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return mime in recognized_mimes.pdf_like_format

def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def can_extract(self,
"""
Checks if this extractor can get attachments from the document (it should have .pptx extension)
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.pptx_like_format or mime in recognized_mimes.pptx_like_format

def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
Expand Down
12 changes: 6 additions & 6 deletions dedoc/attachments_handler/attachments_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
import time
from typing import List, Optional

from dedoc.attachments_extractors import AbstractAttachmentsExtractor
from dedoc.common.exceptions.dedoc_error import DedocError
from dedoc.data_structures import AttachedFile, DocumentMetadata, ParsedDocument
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.utils.parameter_utils import get_param_with_attachments
from dedoc.utils.utils import get_empty_content


Expand Down Expand Up @@ -39,11 +39,11 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct
are important, look to the API parameters documentation for more details).
:return: list of parsed document attachments
"""
parsed_attachment_files = []
attachments = []
recursion_deep_attachments = int(parameters.get("recursion_deep_attachments", 10)) - 1

if not AbstractAttachmentsExtractor.with_attachments(parameters) or recursion_deep_attachments < 0:
return parsed_attachment_files
if not get_param_with_attachments(parameters) or recursion_deep_attachments < 0:
return attachments

previous_log_time = time.time()

Expand Down Expand Up @@ -73,8 +73,8 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct
parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy)

parsed_file.metadata.set_uid(attachment.uid)
parsed_attachment_files.append(parsed_file)
return parsed_attachment_files
attachments.append(parsed_file)
return attachments

def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument: # noqa
metadata = document_parser.document_metadata_extractor.extract(
Expand Down
5 changes: 4 additions & 1 deletion dedoc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@
# TESSERACT OCR confidence threshold (values: -1 - undefined; [0.0 : 100.0] % - confidence value)
ocr_conf_threshold=40.0,
# max depth of document structure tree
recursion_deep_subparagraphs=30
recursion_deep_subparagraphs=30,

# -------------------------------------------EXTERNAL SERVICES SETTINGS---------------------------------------------
grobid_max_connection_attempts=3
)


Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/binary_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def can_convert(self,
"""
Checks if the document is image-like (e.g. it has .bmp, .jpg, .tiff, etc. extension) and has `mime=application/octet-stream`.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return mime == "application/octet-stream" and extension in supported_image_types

def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/docx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is docx-like, e.g. it has .doc, .rtf or .odt extension.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.docx_like_format or mime in converted_mimes.docx_like_format

def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/excel_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is xlsx-like, e.g. it has .xls or .ods extension.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.excel_like_format or mime in converted_mimes.excel_like_format

def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is pdf-like, e.g. it has .djvu extension.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.pdf_like_format or mime in converted_mimes.pdf_like_format

def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/png_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def can_convert(self,
"""
Checks if the document is image-like, e.g. it has .bmp, .jpg, .tiff, etc. extension.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.image_like_format or mime in converted_mimes.image_like_format

def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/pptx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def can_convert(self,
"""
Checks if the document is pptx-like, e.g. it has .ppt or .odp extension.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.pptx_like_format or mime in converted_mimes.pptx_like_format

def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/txt_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def can_convert(self,
"""
Checks if the document is txt-like, e.g. it has .xml extension.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in converted_extensions.txt_like_format or mime in converted_mimes.txt_like_format

def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/converter_composition.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def convert(self, file_path: str, parameters: Optional[dict] = None) -> str:
:param parameters: parameters of converting, see :ref:`parameters_description` for more details
:return: path of converted file if conversion was executed else path of the original file
"""
extension, mime = get_mime_extension(file_path=file_path)
mime, extension = get_mime_extension(file_path=file_path)
converted_file_path = file_path

for converter in self.converters:
Expand Down
3 changes: 2 additions & 1 deletion dedoc/data_structures/concrete_annotations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from .superscript_annotation import SuperscriptAnnotation
from .table_annotation import TableAnnotation
from .underlined_annotation import UnderlinedAnnotation
from .reference_annotation import ReferenceAnnotation

__all__ = ['AlignmentAnnotation', 'AttachAnnotation', 'BBoxAnnotation', 'BoldAnnotation', 'ColorAnnotation', 'ConfidenceAnnotation',
'IndentationAnnotation', 'ItalicAnnotation', 'LinkedTextAnnotation', 'SizeAnnotation', 'SpacingAnnotation', 'StrikeAnnotation',
'StyleAnnotation', 'SubscriptAnnotation', 'SuperscriptAnnotation', 'TableAnnotation', 'UnderlinedAnnotation']
'StyleAnnotation', 'SubscriptAnnotation', 'SuperscriptAnnotation', 'TableAnnotation', 'UnderlinedAnnotation', 'ReferenceAnnotation']
43 changes: 43 additions & 0 deletions dedoc/data_structures/concrete_annotations/reference_annotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from dedoc.data_structures.annotation import Annotation


class ReferenceAnnotation(Annotation):
    """
    This annotation points to a place in the document text that is a link to another line in the document (for example, another textual line).

    Example of usage for document_type="article": a link to a bibliography_item :class:`~dedoc.data_structures.LineWithMeta`.

    The line that contains the reference annotation:

    .. code-block:: python

        LineWithMeta(  # the line with the reference annotation
            line="As for the PRF, we use the tree-based construction from Goldreich, Goldwasser and Micali [18]",
            metadata=LineMetadata(page_id=0, line_id=32),
            annotations=[ReferenceAnnotation(start=90, end=92, value="97cfac39-f0e3-11ee-b81c-b88584b4e4a1"), ...]
        )

    The other line (the one referenced by the previous line); its ``uid`` in ``other_fields`` equals the annotation's value:

    .. code-block:: python

        LineWithMeta(  # the line referenced by the previous one
            line="some your text (can be empty)",
            metadata=LineMetadata(
                page_id=10,
                line_id=189,
                tag_hierarchy_level=HierarchyLevel(level_1=2, level_2=0, line_type="bibliography_item"),
                other_fields={"uid": "97cfac39-f0e3-11ee-b81c-b88584b4e4a1"}
            ),
            annotations=[]
        )
    """
    # annotation name passed to the base Annotation constructor
    name = "reference"

    def __init__(self, value: str, start: int, end: int) -> None:
        """
        :param value: unique identifier of the line to which this annotation refers
        :param start: start of the annotated text with a link
        :param end: end of the annotated text with a link
        """
        # NOTE(review): is_mergeable=False presumably keeps neighbouring reference annotations from being merged - confirm against Annotation
        super().__init__(start=start, end=end, name=ReferenceAnnotation.name, value=value, is_mergeable=False)
2 changes: 1 addition & 1 deletion dedoc/data_structures/line_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ def __init__(self,
self.hierarchy_level = hierarchy_level
self.page_id = page_id
self.line_id = line_id
self.__other_fields = {}
if other_fields is not None and len(other_fields) > 0:
self.extend_other_fields(other_fields)
self.__other_fields = {}

def extend_other_fields(self, new_fields: dict) -> None:
"""
Expand Down
3 changes: 2 additions & 1 deletion dedoc/data_structures/line_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ def set_line(self, line: str) -> None:
self._line = line

def __repr__(self) -> str:
return f"LineWithMeta({self.line[:65]})"
return (f"LineWithMeta({self.line[:65]}, "
f"tagHL={self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type})")

def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta":
assert isinstance(other, (LineWithMeta, str))
Expand Down
6 changes: 4 additions & 2 deletions dedoc/data_structures/table_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,17 @@ class TableMetadata(Serializable):
"""
This class holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on.
"""
def __init__(self, page_id: Optional[int], uid: Optional[str] = None, rotated_angle: float = 0.0) -> None:
def __init__(self, page_id: Optional[int], uid: Optional[str] = None, rotated_angle: float = 0.0, title: str = "") -> None:
"""
:param page_id: number of the page where table starts
:param uid: unique identifier of the table
:param rotated_angle: value of the rotation angle by which the table was rotated during recognition
:param title: table's title
"""
self.page_id = page_id
self.uid = str(uuid.uuid4()) if not uid else uid
self.rotated_angle = rotated_angle
self.title = title

def to_api_schema(self) -> ApiTableMetadata:
return ApiTableMetadata(uid=self.uid, page_id=self.page_id, rotated_angle=self.rotated_angle)
return ApiTableMetadata(uid=self.uid, page_id=self.page_id, rotated_angle=self.rotated_angle, title=self.title)
3 changes: 3 additions & 0 deletions dedoc/manager_config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from typing import Optional

from dedoc.readers.article_reader.article_reader import ArticleReader


def _get_manager_config(config: dict) -> dict:
"""
Expand Down Expand Up @@ -57,6 +59,7 @@ def _get_manager_config(config: dict) -> dict:
BinaryConverter(config=config)
]
readers = [
ArticleReader(config=config),
DocxReader(config=config),
ExcelReader(config=config),
PptxReader(config=config),
Expand Down
3 changes: 2 additions & 1 deletion dedoc/readers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .archive_reader.archive_reader import ArchiveReader
from .article_reader.article_reader import ArticleReader
from .base_reader import BaseReader
from .csv_reader.csv_reader import CSVReader
from .docx_reader.docx_reader import DocxReader
Expand All @@ -17,6 +18,6 @@
from .reader_composition import ReaderComposition
from .txt_reader.raw_text_reader import RawTextReader

__all__ = ['ArchiveReader', 'BaseReader', 'CSVReader', 'DocxReader', 'EmailReader', 'ExcelReader', 'HtmlReader', 'JsonReader', 'MhtmlReader',
__all__ = ['ArchiveReader', 'ArticleReader', 'BaseReader', 'CSVReader', 'DocxReader', 'EmailReader', 'ExcelReader', 'HtmlReader', 'JsonReader', 'MhtmlReader',
'NoteReader', 'PptxReader', 'ReaderComposition', 'RawTextReader',
'PdfBaseReader', 'PdfImageReader', 'PdfTabbyReader', 'PdfTxtlayerReader', 'PdfAutoReader']
2 changes: 1 addition & 1 deletion dedoc/readers/archive_reader/archive_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
Check if the document extension is suitable for this reader.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
"""
extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in recognized_extensions.archive_like_format or mime in recognized_mimes.archive_like_format

def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
Expand Down
Empty file.
Loading

0 comments on commit 16747b0

Please sign in to comment.