Skip to content

Commit

Permalink
some fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Nikita Shevtsov committed Sep 19, 2023
1 parent 5200dc1 commit 73eb6cc
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,5 @@

document = pdf_reader.read(file_path, parameters={"with_attachments": "true"})
print(list(vars(document))) # ['tables', 'lines', 'attachments', 'warnings', 'metadata']


manager = DedocManager()
result = manager.parse(file_path=file_path, parameters={})

print(result) # <dedoc.data_structures.ParsedDocument>
print(result.to_dict()) # OrderedDict([('version', ''), ('warnings', []), ('content', OrderedDict([('structure', OrderedDict([('node_id', '0'), ('text', ''), ('annotations', []), ('metadata', OrderedDict([('page_id', 0), ('line_id', 0), ('paragraph_type', 'root'), ('other_fields', {})])), ...

print(len(document.attachments))
print(len(document.lines))
34 changes: 6 additions & 28 deletions docs/source/tutorials/add_new_doc_type.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,9 @@ You should call the constructor of the base class in the constructor of the curr
def __init__(self, config):
super().__init__(config=config)
def can_convert(self,
extension: str,
mime: str,
parameters: Optional[dict] = None) -> bool:
def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
pass # some code here
def do_convert(self,
tmp_dir: str,
filename: str,
extension: str) -> str:
def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str:
pass # some code here
* :meth:`can_convert()` method checks whether the new converter can process the file; for example, you can return True for a list of specific file extensions.
Expand Down Expand Up @@ -74,18 +68,10 @@ General scheme of adding Reader
class NewtypeReader(BaseReader):
def can_read(self,
path: str,
mime: str,
extension: str,
document_type: Optional[str],
parameters: Optional[dict] = None) -> bool:
def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str], parameters: Optional[dict] = None) -> bool:
pass # some code here
def read(self,
path: str,
document_type: Optional[str] = None,
parameters: Optional[dict] = None) -> UnstructuredDocument:
def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument:
pass # some code here
Expand Down Expand Up @@ -118,18 +104,10 @@ General scheme of adding AttachmentExtractor
from dedoc.data_structures.attached_file import AttachedFile
class NewtypeAttachmentsExtractor(AbstractAttachmentsExtractor):
def can_extract(self,
extension: str,
mime: str,
parameters: Optional[dict] = None) -> bool:
def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
pass # some code here
def get_attachments(self,
tmpdir: str,
filename: str,
parameters: dict) -> List[AttachedFile]:
def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]:
pass # some code here
* :meth:`can_extract()` method checks whether the new extractor can process the file; for example, you can return True for a list of specific file extensions.
Expand Down

0 comments on commit 73eb6cc

Please sign in to comment.