From 73eb6ccd33b222049b4870a9531ecf59b7998485 Mon Sep 17 00:00:00 2001 From: Nikita Shevtsov Date: Tue, 19 Sep 2023 13:52:00 +0300 Subject: [PATCH] some fixes --- .../dedoc_add_new_doc_type_tutorial.py | 10 ++---- docs/source/tutorials/add_new_doc_type.rst | 34 ++++--------------- 2 files changed, 8 insertions(+), 36 deletions(-) diff --git a/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py b/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py index d3086613..4530e9a6 100644 --- a/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py +++ b/docs/source/_static/code_examples/dedoc_add_new_doc_type_tutorial.py @@ -28,11 +28,5 @@ document = pdf_reader.read(file_path, parameters={"with_attachments": "true"}) print(list(vars(document))) # ['tables', 'lines', 'attachments', 'warnings', 'metadata'] - - -manager = DedocManager() -result = manager.parse(file_path=file_path, parameters={}) - -print(result) # -print(result.to_dict()) # OrderedDict([('version', ''), ('warnings', []), ('content', OrderedDict([('structure', OrderedDict([('node_id', '0'), ('text', ''), ('annotations', []), ('metadata', OrderedDict([('page_id', 0), ('line_id', 0), ('paragraph_type', 'root'), ('other_fields', {})])), ... - +print(len(document.attachments)) +print(len(document.lines)) diff --git a/docs/source/tutorials/add_new_doc_type.rst b/docs/source/tutorials/add_new_doc_type.rst index a59446e1..4d85b131 100644 --- a/docs/source/tutorials/add_new_doc_type.rst +++ b/docs/source/tutorials/add_new_doc_type.rst @@ -36,15 +36,9 @@ You should call the constructor of the base class in the constructor of the curr def __init__(self, config): super().__init__(config=config) - def can_convert(self, - extension: str, - mime: str, - parameters: Optional[dict] = None) -> bool: + def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: pass # some code here - def do_convert(self, - tmp_dir: str, - filename: str, - extension: str) -> str: + def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str: pass # some code here * :meth:`can_convert()` method checks if the new converter can process the file, for example, you can return True for the list of some specific file extensions. @@ -74,18 +68,10 @@ General scheme of adding Reader class NewtypeReader(BaseReader): - def can_read(self, - path: str, - mime: str, - extension: str, - document_type: Optional[str], - parameters: Optional[dict] = None) -> bool: + def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str], parameters: Optional[dict] = None) -> bool: pass # some code here - def read(self, - path: str, - document_type: Optional[str] = None, - parameters: Optional[dict] = None) -> UnstructuredDocument: + def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: pass # some code here @@ -118,18 +104,10 @@ General scheme of adding AttachmentExtractor from dedoc.data_structures.attached_file import AttachedFile class NewtypeAttachmentsExtractor(AbstractAttachmentsExtractor): - def can_extract(self, - extension: str, - mime: str, - parameters: Optional[dict] = None) -> bool: + def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool: pass # some code here - - - def get_attachments(self, - tmpdir: str, - filename: str, - parameters: dict) -> List[AttachedFile]: + def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]: pass # some code here * :meth:`can_extract()` method checks if the new extractor can process the file, for example, you can return True for the list of some specific file extensions.