Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR-475 fix table documentation #338

Merged
merged 2 commits into from
Sep 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions dedoc/data_structures/cell_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,23 @@

class CellWithMeta:
"""
This class holds the information about the cell information: text of the cell, text annotations and cell properties (rowspan, colspan, invisible).
This class holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible).
"""
def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
"""
:param lines: text lines (LineWithMeta) of the cell
:param colspan: The value of the rowspan attribute represents the number of columns to span. Like HTML format.
:param rowspan: The value of the rowspan attribute represents the number of rows to span. Like HTML format.
:param invisible: Display or hide cell values
:param lines: textual lines of the cell
:param colspan: number of columns to span like in HTML format
:param rowspan: number of rows to span like in HTML format
:param invisible: indicator for displaying or hiding cell text
"""
self.lines = lines
self.colspan = colspan
self.rowspan = rowspan
self.invisible = invisible

def __repr__(self) -> str:
return f"CellWithMeta({self.get_text()[:65]})"

def get_text(self) -> str:
return "\n".join([line.line for line in self.lines])

Expand Down
4 changes: 2 additions & 2 deletions dedoc/data_structures/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ class Table(Serializable):
"""
def __init__(self, cells: List[List[CellWithMeta]], metadata: TableMetadata) -> None:
"""
:param cells: a list of lists of cells (cell has text, colspan and rowspan attributes).
:param metadata: some table metadata, as location, size and so on.
:param cells: a list of lists of cells (cell has text, colspan and rowspan attributes)
:param metadata: table metadata such as location, size and so on
"""
self.metadata = metadata
self.cells = cells
Expand Down
5 changes: 2 additions & 3 deletions dedoc/data_structures/table_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,14 @@

class TableMetadata(Serializable):
"""
This class holds the information about the table location in the document and information about cell properties.
This class holds information about the table: its unique identifier, its rotation angle (if the table was rotated, which applies to images) and so on.
"""
def __init__(self, page_id: Optional[int], uid: Optional[str] = None, is_inserted: bool = False, rotated_angle: float = 0.0) -> None:
"""
:param page_id: number of the page where table starts
:param uid: unique identifier of the table
:param is_inserted: indicator of whether the table has already been inserted into the paragraphs list
:param rotated_angle: the value of the rotation angle by which the table was rotated during recognition. Extracted boxes from a table will need to
be rotated by this angle.
:param rotated_angle: value of the rotation angle by which the table was rotated during recognition
"""
self.page_id = page_id
self.uid = str(uuid.uuid1()) if not uid else uid
Expand Down
12 changes: 6 additions & 6 deletions docs/source/_static/code_examples/dedoc_usage_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,13 @@
document.lines[3].annotations[7] # Italic(6:12, True)
document.lines[3].annotations[8] # Size(14:19, 10.0)

document.tables[0].cells[0][0].get_text() # N
document.tables[0].cells[1][3].get_text() # Cell3
document.tables[1].cells[3][0].get_text() # 'Text 3'
cell = document.tables[0].cells[0][0]
cell # CellWithMeta(N)
cell.get_text() # N
cell.rowspan, cell.colspan, cell.invisible # (1, 1, False)
document.tables[0].metadata.uid # f2f08354fc2dbcb5ded8885479f498a6
document.tables[0].cells[0][0].colspan # 1
document.tables[0].cells[0][0].rowspan # 1
document.tables[0].cells[0][0].invisible # False
document.tables[0].metadata.page_id # None
document.tables[0].metadata.rotated_angle # 0.0
document.tables[1].cells[0][0].invisible # False
document.tables[1].cells[0][1].invisible # True
document.tables[1].cells[0][0].colspan # 2
Expand Down
168 changes: 100 additions & 68 deletions docs/source/_static/json_format_examples/basic_example.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "2023.05.26",
"version": "0.11.2",
"warnings": [],
"content": {
"structure": {
Expand Down Expand Up @@ -298,7 +298,7 @@
"start": 0,
"end": 14,
"name": "attachment",
"value": "image1.png"
"value": "attach_fa1143ae-5d3c-11ee-b518-0242ac120002"
}
],
"metadata": {
Expand All @@ -321,96 +321,128 @@
{
"cells": [
[
"Table header",
"Table header"
{
"lines": [
{
"text": "Table header",
"annotations": []
}
],
"colspan": 2,
"rowspan": 1,
"invisible": false
},
{
"lines": [
{
"text": "Table header",
"annotations": []
}
],
"colspan": 1,
"rowspan": 1,
"invisible": true
}
],
[
"Vertically merged cells",
"Text 1"
{
"lines": [
{
"text": "Vertically merged cells",
"annotations": []
}
],
"colspan": 1,
"rowspan": 2,
"invisible": false
},
{
"lines": [
{
"text": "Text 1",
"annotations": []
}
],
"colspan": 1,
"rowspan": 1,
"invisible": false
}
],
[
"Vertically merged cells",
"Text 2"
{
"lines": [
{
"text": "Vertically merged cells",
"annotations": []
}
],
"colspan": 1,
"rowspan": 1,
"invisible": true
},
{
"lines": [
{
"text": "Text 2",
"annotations": []
}
],
"colspan": 1,
"rowspan": 1,
"invisible": false
}
],
[
"Text 3",
"Text 4"
{
"lines": [
{
"text": "Text 3",
"annotations": []
}
],
"colspan": 1,
"rowspan": 1,
"invisible": false
},
{
"lines": [
{
"text": "Text 4",
"annotations": []
}
],
"colspan": 1,
"rowspan": 1,
"invisible": false
}
]
],
"metadata": {
"uid": "3a327789721e09b3fa6fd9560f3ee263",
"page_id": null,
"is_inserted": false,
"cell_properties": [
[
{
"colspan": 2,
"rowspan": 1,
"invisible": false
},
{
"colspan": 1,
"rowspan": 1,
"invisible": true
}
],
[
{
"colspan": 1,
"rowspan": 2,
"invisible": false
},
{
"colspan": 1,
"rowspan": 1,
"invisible": false
}
],
[
{
"colspan": 1,
"rowspan": 1,
"invisible": true
},
{
"colspan": 1,
"rowspan": 1,
"invisible": false
}
],
[
{
"colspan": 1,
"rowspan": 1,
"invisible": false
},
{
"colspan": 1,
"rowspan": 1,
"invisible": false
}
]
]
"rotated_angle": 0.0
}
}
]
},
"metadata": {
"uid": "doc_uid_auto_5cbfdc00-0e90-11ee-8789-4549ad8e7206",
"uid": "doc_uid_auto_fa1f6786-5d3c-11ee-b518-0242ac120002",
"file_name": "example_return_format.docx",
"temporary_file_name": "1695822696_268.docx",
"size": 21270,
"modified_time": 1687172368,
"created_time": 1687172368,
"access_time": 1687172368,
"modified_time": 1695822696,
"created_time": 1695822696,
"access_time": 1695822696,
"file_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"document_subject": "",
"keywords": "",
"category": "",
"comments": "",
"author": "",
"last_modified_by": "",
"created_date": 1568736411,
"modified_date": 1686923436,
"created_date": 1568725611,
"modified_date": 1686912636,
"last_printed_date": null,
"other_fields": {
"document_subject": "",
Expand All @@ -419,8 +451,8 @@
"comments": "",
"author": "",
"last_modified_by": "",
"created_date": 1568736411,
"modified_date": 1686923436,
"created_date": 1568725611,
"modified_date": 1686912636,
"last_printed_date": null
}
},
Expand Down
Loading
Loading