Skip to content

Commit

Permalink
TLDR-474 remove insert_table parameter (#339)
Browse files Browse the repository at this point in the history
* TLDR-474 remove insert_table parameter

* TLDR-474 remove is_inserted attribute
  • Loading branch information
NastyBoget authored Sep 28, 2023
1 parent d752ad2 commit 4a7d3e2
Show file tree
Hide file tree
Showing 23 changed files with 54 additions and 941 deletions.
3 changes: 0 additions & 3 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ class QueryParameters(BaseModel):
return_base64: Optional[str]
attachments_dir: Optional[str]

insert_table: Optional[str]
need_pdf_table_analysis: Optional[str]
table_type: Optional[str]
orient_analysis_cells: Optional[str]
Expand Down Expand Up @@ -48,7 +47,6 @@ def __init__(self,
attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None), # noqa

# tables handling
insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None), # noqa
need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None), # noqa
table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None), # noqa
orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None), # noqa
Expand Down Expand Up @@ -83,7 +81,6 @@ def __init__(self,
self.return_base64: str = return_base64 or "false"
self.attachments_dir: str = attachments_dir

self.insert_table: str = insert_table or "false"
self.need_pdf_table_analysis: str = need_pdf_table_analysis or "true"
self.table_type: str = table_type or ""
self.orient_analysis_cells: str = orient_analysis_cells or "false"
Expand Down
1 change: 0 additions & 1 deletion dedoc/api/static/html_eng/form_input.html
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ <h2>Structure Document Recognition</h2>
</select>
</label> return_format
</p>
<p><label><input type="checkbox" name="insert_table" value="true"> insert_table</label></p>
<p><label><input type="checkbox" name="handle_invisible_table" value="true"> handle_invisible_table</label></p>
<p>pages <input name="pages" type="text" size="8" value=":"> </p>

Expand Down
1 change: 0 additions & 1 deletion dedoc/api/static/html_eng/format_description.html
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ <h4 id="TableMetadata"> TableMetadata</h4>
<ol>
<li><strong>uid</strong>: <a> str (required field) </a> - unique identifier. </li>
<li><strong>page_id</strong>: <a> integer </a> (optional field) - page number on which the table begins. Can be null.</li>
<li><strong>is_inserted</strong>: <a> bool </a> (optional field) - was table inserted into document.</li>
</ol>

<h3 id="TreeNode"> TreeNode</h3>
Expand Down
2 changes: 0 additions & 2 deletions dedoc/api/static/html_eng/info.html
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ <h2>How to use</h2>
<ol>
<li><strong>language</strong>: string - document recognition language. The default value is "rus+eng". Available values: "rus+eng", "rus", "eng".</li>
<li><strong>with_attachments</strong>: boolean - option including analysis of attached files. The option is False by default. Available values: True, False.</li>
<li><strong>insert_table</strong>: boolean - this option enables embedding the table in the document tree.
The option is False by default. Available values: True, False.</li>
<li><strong>return_format</strong>: str - an option to return the response in pretty_json, html, json or tree form.
The default value is json. Use the pretty_json, tree and html format for debug only.<br>
Warning: html-format is used only for viewing the recognition result (in a readable form).
Expand Down
3 changes: 0 additions & 3 deletions dedoc/api/static/html_rus/form_input.html
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,6 @@ <h2>Распознавание структуры документа</h2>
</label> with_attachments</p>
<p>
<p>
<label>
<input type="checkbox" name="insert_table" value=true>
</label> insert_table </p>
<p>
<label>
<select name="document_type">
Expand Down
1 change: 0 additions & 1 deletion dedoc/api/static/html_rus/format_description.html
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ <h4 id="TableMetadata"> TableMetadata. Метаинформация таблиц
<ol>
<li><strong>uid</strong>: <a> str (обязательное поле) </a> - уникальный идентификатор таблицы.</li>
<li><strong>page_id</strong>: <a> int </a> (необязательное поле) - номер страницы на которой начинается таблица.</li>
<li><strong>is_inserted</strong>: <a> bool </a> (необязательное поле) - была ли таблица встроена в тело документа.</li>
</ol>

<h3 id="TreeNode"> TreeNode. Древовидная структура документа.</h3>
Expand Down
2 changes: 0 additions & 2 deletions dedoc/api/static/html_rus/info.html
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ <h2>Как использовать</h2>
<ol>
<li><strong>language</strong>: str - язык распознавания документа. По-умолчанию установлено значение "rus+eng".
Доступные значения: "rus+eng", "rus", "eng".</li>
<li><strong>insert_table</strong>: boolean - опция включает встраивание таблицы в документное дерево.
По-умолчанию установлено значение False. Доступные значения True, False.</li>
<li><strong>with_attachments</strong>: boolean - опция включающая анализ вложенных файлов. По-умолчанию установлено
значение False. Доступные значения True, False.</li>
<li><strong>return_format</strong>: str - опция для возврата ответа в html-виде, в виде дерева или в виде json.
Expand Down
6 changes: 1 addition & 5 deletions dedoc/data_structures/table_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,20 @@ class TableMetadata(Serializable):
"""
This class holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on.
"""
def __init__(self, page_id: Optional[int], uid: Optional[str] = None, is_inserted: bool = False, rotated_angle: float = 0.0) -> None:
def __init__(self, page_id: Optional[int], uid: Optional[str] = None, rotated_angle: float = 0.0) -> None:
"""
:param page_id: number of the page where table starts
:param uid: unique identifier of the table
:param is_inserted: indicator if table was already inserted into paragraphs list
:param rotated_angle: value of the rotation angle by which the table was rotated during recognition
"""
self.page_id = page_id
self.uid = str(uuid.uuid1()) if not uid else uid
self.is_inserted = is_inserted
self.rotated_angle = rotated_angle

def to_dict(self) -> dict:
res = OrderedDict()
res["uid"] = self.uid
res["page_id"] = self.page_id
res["is_inserted"] = self.is_inserted
res["rotated_angle"] = self.rotated_angle
return res

Expand All @@ -36,6 +33,5 @@ def get_api_dict(api: Api) -> Model:
return api.model("TableMetadata", {
"page_id": fields.Integer(readonly=False, description="table start page number"),
"uid": fields.String(description="table unique id"),
"is_inserted": fields.Boolean(description="was the table inserted into document body"),
"rotated_angle": fields.Float(readonly=False, description="At what angle should the table be rotated to use boxes")
})
2 changes: 1 addition & 1 deletion dedoc/readers/docx_reader/data_structures/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def to_table(self) -> Table:
result_row.append(cell)
result_cells_with_meta.append(result_row)

return Table(cells=result_cells_with_meta, metadata=TableMetadata(page_id=None, uid=self.uid, is_inserted=False))
return Table(cells=result_cells_with_meta, metadata=TableMetadata(page_id=None, uid=self.uid))

def __get_cell_text(self, cell: Tag) -> str:
cell_text = ""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def __get_tables(self, page: dict, file_hash: str) -> Tuple[List[Table], List[Sc

result_cells.append(result_row)
table_bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right)) # noqa TODO add table location into TableMetadata
tables.append(Table(cells=result_cells, metadata=TableMetadata(page_id=page_number, is_inserted=False)))
tables.append(Table(cells=result_cells, metadata=TableMetadata(page_id=page_number)))
table_name = file_hash + str(page_number) + str(table_num)
tables_on_image.append(ScanTable(page_number=page_number, matrix_cells=None, bbox=table_bbox, name=table_name, order=order))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from dedoc.data_structures.parsed_document import ParsedDocument
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.structure_constructors.abstract_structure_constructor import AbstractStructureConstructor
from dedoc.structure_constructors.table_patcher import TablePatcher


class StructureConstructorComposition(AbstractStructureConstructor):
Expand All @@ -20,19 +19,13 @@ def __init__(self, constructors: Dict[str, AbstractStructureConstructor], defaul
"""
self.constructors = constructors
self.default_constructor = default_constructor
self.table_patcher = TablePatcher()

def structure_document(self, document: UnstructuredDocument, structure_type: Optional[str] = None, parameters: Optional[dict] = None) -> ParsedDocument:
"""
Construct the result document structure according to the `structure_type` parameter.
If `structure_type` is empty string or None the default constructor will be used.
To get the information about the parameters look at the documentation of :class:`~dedoc.structure_constructors.AbstractStructureConstructor`.
"""
parameters = {} if parameters is None else parameters

if parameters.get("insert_table", "False").lower() == "true":
document = self.table_patcher.insert_table(document=document)

if structure_type in self.constructors:
return self.constructors[structure_type].structure_document(document)

Expand Down
80 changes: 0 additions & 80 deletions dedoc/structure_constructors/table_patcher.py

This file was deleted.

13 changes: 0 additions & 13 deletions docs/source/_static/code_examples/dedoc_return_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,6 @@ def with_parsed_attachments_example() -> dict:
return json.loads(result)


def with_inserted_table_example() -> dict:
with open(filename, "rb") as file:
files = {"file": (filename, file)}
r = requests.post("http://localhost:1231/upload", files=files, data=dict(insert_table="true"))
result = r.content.decode("utf-8")

assert r.status_code == 200
return json.loads(result)


if __name__ == "__main__":
with open("../json_format_examples/basic_example.json", "w") as f:
json.dump(basic_example(), f, indent=2, ensure_ascii=False)
Expand All @@ -80,6 +70,3 @@ def with_inserted_table_example() -> dict:

with open("../json_format_examples/with_parsed_attachments.json", "w") as f:
json.dump(with_parsed_attachments_example(), f, indent=2, ensure_ascii=False)

with open("../json_format_examples/with_inserted_table.json", "w") as f:
json.dump(with_inserted_table_example(), f, indent=2, ensure_ascii=False)
13 changes: 6 additions & 7 deletions docs/source/_static/json_format_examples/basic_example.json
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@
"start": 0,
"end": 14,
"name": "attachment",
"value": "attach_fa1143ae-5d3c-11ee-b518-0242ac120002"
"value": "attach_75af2486-5df1-11ee-bfc1-0242ac120002"
}
],
"metadata": {
Expand Down Expand Up @@ -420,20 +420,19 @@
"metadata": {
"uid": "3a327789721e09b3fa6fd9560f3ee263",
"page_id": null,
"is_inserted": false,
"rotated_angle": 0.0
}
}
]
},
"metadata": {
"uid": "doc_uid_auto_fa1f6786-5d3c-11ee-b518-0242ac120002",
"uid": "doc_uid_auto_75c93394-5df1-11ee-bfc1-0242ac120002",
"file_name": "example_return_format.docx",
"temporary_file_name": "1695822696_268.docx",
"temporary_file_name": "1695900213_314.docx",
"size": 21270,
"modified_time": 1695822696,
"created_time": 1695822696,
"access_time": 1695822696,
"modified_time": 1695900213,
"created_time": 1695900213,
"access_time": 1695900213,
"file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"document_subject": "",
"keywords": "",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,7 @@
"start": 0,
"end": 14,
"name": "attachment",
"value": "attach_fa23fd78-5d3c-11ee-b518-0242ac120002"
"value": "attach_75d13b70-5df1-11ee-bfc1-0242ac120002"
}
],
"metadata": {
Expand Down Expand Up @@ -504,20 +504,19 @@
"metadata": {
"uid": "3a327789721e09b3fa6fd9560f3ee263",
"page_id": null,
"is_inserted": false,
"rotated_angle": 0.0
}
}
]
},
"metadata": {
"uid": "doc_uid_auto_fa309d08-5d3c-11ee-b518-0242ac120002",
"uid": "doc_uid_auto_75e45e94-5df1-11ee-bfc1-0242ac120002",
"file_name": "example_return_format.docx",
"temporary_file_name": "1695822697_827.docx",
"temporary_file_name": "1695900214_259.docx",
"size": 21270,
"modified_time": 1695822697,
"created_time": 1695822697,
"access_time": 1695822697,
"modified_time": 1695900213,
"created_time": 1695900213,
"access_time": 1695900214,
"file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"document_subject": "",
"keywords": "",
Expand Down
23 changes: 11 additions & 12 deletions docs/source/_static/json_format_examples/with_attachments.json
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@
"start": 0,
"end": 14,
"name": "attachment",
"value": "attach_fa355abe-5d3c-11ee-b518-0242ac120002"
"value": "attach_75ea598e-5df1-11ee-bfc1-0242ac120002"
}
],
"metadata": {
Expand Down Expand Up @@ -420,20 +420,19 @@
"metadata": {
"uid": "3a327789721e09b3fa6fd9560f3ee263",
"page_id": null,
"is_inserted": false,
"rotated_angle": 0.0
}
}
]
},
"metadata": {
"uid": "doc_uid_auto_fa4285e0-5d3c-11ee-b518-0242ac120002",
"uid": "doc_uid_auto_75fac01c-5df1-11ee-bfc1-0242ac120002",
"file_name": "example_return_format.docx",
"temporary_file_name": "1695822697_953.docx",
"temporary_file_name": "1695900214_51.docx",
"size": 21270,
"modified_time": 1695822697,
"created_time": 1695822697,
"access_time": 1695822697,
"modified_time": 1695900214,
"created_time": 1695900214,
"access_time": 1695900214,
"file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"document_subject": "",
"keywords": "",
Expand Down Expand Up @@ -476,13 +475,13 @@
"tables": []
},
"metadata": {
"uid": "attach_fa355abe-5d3c-11ee-b518-0242ac120002",
"uid": "attach_75ea598e-5df1-11ee-bfc1-0242ac120002",
"file_name": "image1.png",
"temporary_file_name": "1695822697_181.png",
"temporary_file_name": "1695900214_864.png",
"size": 14874,
"modified_time": 1695822697,
"created_time": 1695822697,
"access_time": 1695822697,
"modified_time": 1695900214,
"created_time": 1695900214,
"access_time": 1695900214,
"file_type": "image/png",
"other_fields": {}
},
Expand Down
Loading

0 comments on commit 4a7d3e2

Please sign in to comment.