Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR-851 TLDR-861 Refactor table recognition #508

Merged
merged 8 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ class QueryParameters:
# tables handling
need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf")
table_type: str = Form("", description="Pipeline mode for table recognition")
orient_analysis_cells: str = Form("false", enum=["true", "false"], description="Enable analysis of rotated cells in table headers")
NastyBoget marked this conversation as resolved.
Show resolved Hide resolved
orient_cell_angle: str = Form("90", enum=["90", "270"],
description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation')

# pdf handling
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
Expand Down
33 changes: 10 additions & 23 deletions dedoc/api/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -98,31 +98,9 @@ <h4>Attachments handling</h4>
</details>
</div>


<div class="parameters">
<h4>Tables handling </h4>
<details><summary>need_pdf_table_analysis, orient_analysis_cells, orient_cell_angle</summary>
<br>
<p>
<label>
<input type="hidden" name="need_pdf_table_analysis" value="false">
<input type="checkbox" name="need_pdf_table_analysis" value="true" checked> need_pdf_table_analysis</label>
</p>

<p>
<label><input name="orient_analysis_cells" type="checkbox" value="true"> orient_analysis_cells</label>
</p>

<p>
<label>orient_cell_angle <input name="orient_cell_angle" type="number" size="5" value="90"></label>
</p>
</details>
</div>


<div class="parameters">
<h4>PDF handling</h4>
<details><summary>pdf_with_text_layer, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
<details><summary>pdf_with_text_layer, need_pdf_table_analysis, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
<br>
<p>
<label>
Expand Down Expand Up @@ -153,6 +131,15 @@ <h4>PDF handling</h4>
</label>
</p>

<details><summary>need_pdf_table_analysis</summary>
<br>
<p>
<label>
<input type="hidden" name="need_pdf_table_analysis" value="false">
<input type="checkbox" name="need_pdf_table_analysis" value="true" checked> need_pdf_table_analysis</label>
</p>
</details>

<p>
<label>pages <input name="pages" type="text" size="8" value=":"></label>
</p>
Expand Down
5 changes: 2 additions & 3 deletions dedoc/data_structures/cell_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,8 @@ def get_annotations(self) -> List[Annotation]:
"""
return LineWithMeta.join(lines=self.lines, delimiter="\n").annotations

@staticmethod
def create_from_cell(cell: "CellWithMeta") -> "CellWithMeta":
return CellWithMeta(lines=cell.lines, colspan=cell.colspan, rowspan=cell.rowspan, invisible=cell.invisible)
def __str__(self) -> str:
    """
    Build a short human-readable description of the cell: its colspan, rowspan and text.

    :return: string of the form ``CellWithMeta(cs=..., rs=..., <text>)``
    """
    rendered = f"CellWithMeta(cs={self.colspan}, rs={self.rowspan}, {self.get_text()})"
    return rendered

def to_api_schema(self) -> ApiCellWithMeta:
import numpy as np
Expand Down
76 changes: 18 additions & 58 deletions dedoc/readers/pdf_reader/data_classes/tables/cell.py
Original file line number Diff line number Diff line change
@@ -1,78 +1,46 @@
import copy
from typing import List, Optional

from dedocutils.data_structures import BBox

from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.cell_with_meta import CellWithMeta
from dedoc.data_structures.line_with_meta import LineWithMeta


class Cell(CellWithMeta):

@staticmethod
def copy_from(cell: "Cell",
x_top_left: Optional[int] = None,
x_bottom_right: Optional[int] = None,
y_top_left: Optional[int] = None,
y_bottom_right: Optional[int] = None) -> "Cell":
x_top_left = cell.x_top_left if x_top_left is None else x_top_left
x_bottom_right = cell.x_bottom_right if x_bottom_right is None else x_bottom_right
y_top_left = cell.y_top_left if y_top_left is None else y_top_left
y_bottom_right = cell.y_bottom_right if y_bottom_right is None else y_bottom_right
return Cell(x_top_left=x_top_left,
x_bottom_right=x_bottom_right,
y_top_left=y_top_left,
y_bottom_right=y_bottom_right,
id_con=cell.id_con,
lines=cell.lines,
is_attribute=cell.is_attribute,
is_attribute_required=cell.is_attribute_required,
rotated_angle=cell.rotated_angle,
uid=cell.cell_uid,
contour_coord=cell.con_coord)
def copy_from(cell: "Cell", bbox: Optional[BBox] = None) -> "Cell":
    """
    Create an independent deep copy of a cell, optionally giving the copy another bounding box.

    :param cell: the cell to duplicate, it is left unchanged
    :param bbox: bounding box to assign to the copy; when None the copied box is kept
    :return: the deep-copied cell
    """
    copy_cell = copy.deepcopy(cell)
    # compare with None explicitly so the decision never depends on the truthiness of the box object
    if bbox is not None:
        copy_cell.bbox = bbox

    return copy_cell

def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
    """
    Move the cell by the given offsets: its lines, its bounding box and its contour box.

    :param shift_x: horizontal offset
    :param shift_y: vertical offset
    :param image_width: passed through to the ``shift`` method of every line
    :param image_height: passed through to the ``shift`` method of every line
    """
    for line in self.lines or []:
        line.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)

    # coordinates live in ``bbox``/``contour_coord`` only; there are no separate per-corner fields to update
    self.bbox.shift(shift_x=shift_x, shift_y=shift_y)
    if self.contour_coord:
        self.contour_coord.shift(shift_x=shift_x, shift_y=shift_y)

import uuid
def __init__(self, bbox: BBox, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
             is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: Optional[str] = None,
             contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
    """
    :param bbox: bounding box of the cell, stored as ``bbox``
    :param id_con: stored as ``id_con``
    :param lines: passed to the parent constructor
    :param is_attribute: stored as ``is_attribute``
    :param is_attribute_required: stored as ``is_attribute_required``
    :param rotated_angle: stored as ``rotated_angle``
    :param uid: identifier of the cell, stored as ``uuid``; a random one is generated when None
    :param contour_coord: contour bounding box, ``BBox(0, 0, 0, 0)`` is used when it is missing
    :param colspan: passed to the parent constructor
    :param rowspan: passed to the parent constructor
    :param invisible: passed to the parent constructor
    """
    import uuid

    super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible)

    self.bbox = bbox
    self.id_con = id_con
    self.is_attribute = is_attribute
    self.is_attribute_required = is_attribute_required
    self.rotated_angle = rotated_angle
    # the check has to be on the ``uid`` argument: the imported ``uuid`` module is never None,
    # so testing it always picked ``uid`` and left the identifier equal to None by default
    # NOTE(review): the generated value is a uuid.UUID while ``uid`` is annotated as str - confirm whether str(...) is wanted
    self.uuid = uuid.uuid4() if uid is None else uid
    self.contour_coord = contour_coord or BBox(0, 0, 0, 0)

def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None:
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
Expand All @@ -96,11 +64,3 @@ def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_hei

def __repr__(self) -> str:
return self.__str__()

@property
def width(self) -> int:
return self.x_bottom_right - self.x_top_left

@property
def height(self) -> int:
return self.y_bottom_right - self.y_top_left
89 changes: 14 additions & 75 deletions dedoc/readers/pdf_reader/data_classes/tables/scantable.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, List, Optional
from typing import List

from dedocutils.data_structures import BBox

Expand All @@ -9,106 +9,45 @@
from dedoc.readers.pdf_reader.data_classes.tables.location import Location


class ScanTable:
def __init__(self, page_number: int, matrix_cells: Optional[List[List[CellWithMeta]]] = None, bbox: Optional[BBox] = None,
name: str = "", order: int = -1) -> None:
self.matrix_cells = matrix_cells
self.page_number = page_number
self.locations = []
self.name = name
class ScanTable(Table):
def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1) -> None:
    """
    :param page_number: page of the table, used as ``page_id`` of the metadata and for the first location
    :param cells: rows of cells, passed to the parent constructor
    :param bbox: bounding box of the table on the page, used for the first location
    :param order: stored as ``order``
    """
    super().__init__(cells, TableMetadata(page_id=page_number))
    self.order = order
    # the list is created here, so there is nothing to append to before this line
    self.locations = [Location(page_number, bbox)]

def extended(self, table: "ScanTable") -> None:
    """
    Merge another table into this one in place.

    The locations and the cell rows of ``table`` are appended to the current ones,
    the order becomes the larger of the two orders.

    :param table: the table whose content is appended
    """
    self.locations.extend(table.locations)
    self.cells.extend(table.cells)
    self.order = max(self.order, table.order)

def check_on_cell_instance(self) -> bool:
    """
    Check that the table is not empty and that its first cell is a ``Cell``.

    :return: False when there are no rows, when the first row is empty or when the first cell is not a ``Cell``
    """
    if not self.cells or not self.cells[0]:
        return False
    return isinstance(self.cells[0][0], Cell)

def to_table(self) -> Table:
metadata = TableMetadata(page_id=self.page_number, uid=self.name, rotated_angle=self.location.rotated_angle)
cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in self.matrix_cells]
return Table(metadata=metadata, cells=cells_with_meta)

@staticmethod
def get_cells_text(attr_cells: List[List[Cell]]) -> List[List[str]]:
attrs = []
for i in range(0, len(attr_cells)):
attrs.append([a.get_text() for a in attr_cells[i]])

return attrs

@staticmethod
def get_key_value_attrs(attrs: List, val: Any) -> dict: # noqa
res_attrs = []
for i in range(0, len(attrs)):
res_attrs.append({"attr": attrs[i]})
res = {
"attrs": res_attrs,
"val": val
}
return res

@staticmethod
def get_index_of_end_string_attr(matrix_cells: List[List[Cell]]) -> int:
end_attr_string = 0
for i in range(0, len(matrix_cells)):
if matrix_cells[i][0].is_attribute:
end_attr_string = i

return end_attr_string

@staticmethod
def get_attributes_cell(matrix_cells: List[List[Cell]]) -> (List[int], List[List[Cell]], int):
import copy
import numpy as np

required_columns = []
for j in range(0, len(matrix_cells[0])):
if matrix_cells[0][j].is_attribute_required:
required_columns.append(j)

end_attr_string = ScanTable.get_index_of_end_string_attr(matrix_cells)

attrs = copy.deepcopy(np.array(matrix_cells[0:end_attr_string + 1]))
attrs = attrs.transpose().tolist()

return [required_columns, attrs, end_attr_string]

@staticmethod
def get_matrix_attrs_and_data(matrix_cells: List[List[Cell]]) -> (List[List[Cell]], List[List[str]], List[List[str]]):
required_columns, attrs, end_attr_string = ScanTable.get_attributes_cell(matrix_cells)
attrs_text = ScanTable.get_cells_text(attrs)

data = matrix_cells[(end_attr_string + 1):]
data_text = ScanTable.get_cells_text(data)

return [attrs, attrs_text, data_text]
def __get_cells_text(self, cells: List[List[CellWithMeta]]) -> List[List[str]]:
    """
    Collect the text of every cell, keeping the row layout of the input.

    :param cells: rows of cells
    :return: rows of strings obtained with ``get_text`` of each cell
    """
    text_rows = []
    for row in cells:
        text_rows.append([cell.get_text() for cell in row])
    return text_rows

@property
def location(self) -> Location:
    """
    :return: the minimal element of ``locations`` (as ordered by ``min``)
    """
    first_location = min(self.locations)
    return first_location

@property
def uid(self) -> str:
    """
    :return: unique identifier of the table, read from its metadata
    """
    return self.metadata.uid

def to_dict(self) -> dict:
from collections import OrderedDict

data_text = ScanTable.get_cells_text(self.matrix_cells)
data_text = self.__get_cells_text(self.cells)

res = OrderedDict()
res["locations"] = [location.to_dict() for location in self.locations]
Expand Down
22 changes: 2 additions & 20 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@


ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
"orient_analysis_cells",
"orient_cell_angle",
"is_one_column_document",
"document_orientation",
"language",
Expand Down Expand Up @@ -73,8 +71,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure

params_for_parse = ParametersForParseDoc(
language=param_utils.get_param_language(parameters),
orient_analysis_cells=param_utils.get_param_orient_analysis_cells(parameters),
orient_cell_angle=param_utils.get_param_orient_cell_angle(parameters),
is_one_column_document=param_utils.get_param_is_one_column_document(parameters),
document_orientation=param_utils.get_param_document_orientation(parameters),
need_header_footers_analysis=param_utils.get_param_need_header_footers_analysis(parameters),
Expand All @@ -91,12 +87,11 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
)

lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse)
tables = [scan_table.to_table() for scan_table in scan_tables]

if params_for_parse.with_attachments and self.attachment_extractor.can_extract(file_path):
attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters)

result = UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=warnings, metadata=metadata)
result = UnstructuredDocument(lines=lines, tables=scan_tables, attachments=attachments, warnings=warnings, metadata=metadata)
return self._postprocess(result)

def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
Expand Down Expand Up @@ -177,7 +172,7 @@ def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[Scan
table_page_number = location.page_number
location.shift(shift_x=gost_analyzed_images[table_page_number][1].x_top_left, shift_y=gost_analyzed_images[table_page_number][1].y_top_left)
page_number = scan_table.locations[0].page_number
for row in scan_table.matrix_cells:
for row in scan_table.cells:
for cell in row:
image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
shift_x, shift_y = (gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left)
Expand Down Expand Up @@ -275,16 +270,3 @@ def _binarization(self, gray_image: ndarray) -> ndarray:
binary_mask = gray_image >= np.quantile(gray_image, 0.05)
gray_image[binary_mask] = 255
return gray_image

def eval_tables_by_batch(self,
batch: Iterator[ndarray],
page_number_begin: int,
language: str,
orient_analysis_cells: bool = False,
orient_cell_angle: int = 270,
table_type: str = "") -> Tuple[List[ndarray], List[ScanTable]]:
from joblib import Parallel, delayed

result_batch = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.table_recognizer.recognize_tables_from_image)(
image, page_number_begin + i, language, orient_analysis_cells, orient_cell_angle, table_type) for i, image in enumerate(batch))
return result_batch
2 changes: 0 additions & 2 deletions dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,6 @@ def _process_one_page(self,
image=rotated_image,
page_number=page_number,
language=parameters.language,
orient_analysis_cells=parameters.orient_analysis_cells,
orient_cell_angle=parameters.orient_cell_angle,
table_type=parameters.table_type
)
else:
Expand Down
Loading
Loading