TLDR-851 TLDR-861 Refactor table recognition (#508)

Co-authored-by: Nasty <[email protected]> Co-authored-by: Belyaeva Oksana <[email protected]>
ispras · Dec 20, 2024 · 76a7f4a · 76a7f4a
1 parent e4ec06b
commit 76a7f4a
Show file tree

Hide file tree

Showing 28 changed files with 367 additions and 840 deletions.
diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
@@ -22,9 +22,6 @@ class QueryParameters:
     # tables handling
     need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf")
     table_type: str = Form("", description="Pipeline mode for table recognition")
-    orient_analysis_cells: str = Form("false", enum=["true", "false"], description="Enable analysis of rotated cells in table headers")
-    orient_cell_angle: str = Form("90", enum=["90", "270"],
-                                  description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation')
 
     # pdf handling
     pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],

diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html
@@ -98,31 +98,9 @@ <h4>Attachments handling</h4>
             </details>
         </div>
 
-
-        <div class="parameters">
-            <h4>Tables handling </h4>
-            <details><summary>need_pdf_table_analysis, orient_analysis_cells, orient_cell_angle</summary>
-                <br>
-                <p>
-                    <label>
-                        <input type="hidden" name="need_pdf_table_analysis" value="false">
-                        <input type="checkbox" name="need_pdf_table_analysis" value="true" checked> need_pdf_table_analysis</label>
-                </p>
-
-                <p>
-                    <label><input name="orient_analysis_cells" type="checkbox" value="true"> orient_analysis_cells</label>
-                </p>
-
-                <p>
-                    <label>orient_cell_angle <input name="orient_cell_angle" type="number" size="5" value="90"></label>
-                </p>
-            </details>
-        </div>
-
-
         <div class="parameters">
             <h4>PDF handling</h4>
-            <details><summary>pdf_with_text_layer, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
+            <details><summary>pdf_with_text_layer, need_pdf_table_analysis, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
                 <br>
                 <p>
                     <label>
@@ -153,6 +131,15 @@ <h4>PDF handling</h4>
                     </label>
                 </p>
 
+                <details><summary>need_pdf_table_analysis</summary>
+                <br>
+                    <p>
+                    <label>
+                        <input type="hidden" name="need_pdf_table_analysis" value="false">
+                        <input type="checkbox" name="need_pdf_table_analysis" value="true" checked> need_pdf_table_analysis</label>
+                    </p>
+                </details>
+
                 <p>
                     <label>pages <input name="pages" type="text" size="8" value=":"></label>
                 </p>

diff --git a/dedoc/data_structures/cell_with_meta.py b/dedoc/data_structures/cell_with_meta.py
@@ -47,9 +47,8 @@ def get_annotations(self) -> List[Annotation]:
         """
         return LineWithMeta.join(lines=self.lines, delimiter="\n").annotations
 
-    @staticmethod
-    def create_from_cell(cell: "CellWithMeta") -> "CellWithMeta":
-        return CellWithMeta(lines=cell.lines, colspan=cell.colspan, rowspan=cell.rowspan, invisible=cell.invisible)
+    def __str__(self) -> str:
+        return f"CellWithMeta(cs={self.colspan}, rs={self.rowspan}, {self.get_text()})"
 
     def to_api_schema(self) -> ApiCellWithMeta:
         import numpy as np

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py
@@ -1,78 +1,46 @@
+import copy
 from typing import List, Optional
 
 from dedocutils.data_structures import BBox
 
-from dedoc.data_structures.annotation import Annotation
 from dedoc.data_structures.cell_with_meta import CellWithMeta
 from dedoc.data_structures.line_with_meta import LineWithMeta
 
 
 class Cell(CellWithMeta):
 
     @staticmethod
-    def copy_from(cell: "Cell",
-                  x_top_left: Optional[int] = None,
-                  x_bottom_right: Optional[int] = None,
-                  y_top_left: Optional[int] = None,
-                  y_bottom_right: Optional[int] = None) -> "Cell":
-        x_top_left = cell.x_top_left if x_top_left is None else x_top_left
-        x_bottom_right = cell.x_bottom_right if x_bottom_right is None else x_bottom_right
-        y_top_left = cell.y_top_left if y_top_left is None else y_top_left
-        y_bottom_right = cell.y_bottom_right if y_bottom_right is None else y_bottom_right
-        return Cell(x_top_left=x_top_left,
-                    x_bottom_right=x_bottom_right,
-                    y_top_left=y_top_left,
-                    y_bottom_right=y_bottom_right,
-                    id_con=cell.id_con,
-                    lines=cell.lines,
-                    is_attribute=cell.is_attribute,
-                    is_attribute_required=cell.is_attribute_required,
-                    rotated_angle=cell.rotated_angle,
-                    uid=cell.cell_uid,
-                    contour_coord=cell.con_coord)
+    def copy_from(cell: "Cell", bbox: Optional[BBox] = None) -> "Cell":
+        copy_cell = copy.deepcopy(cell)
+        if bbox:
+            copy_cell.bbox = bbox
+
+        return copy_cell
 
     def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
         if self.lines:
             for line in self.lines:
                 line.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
-        self.x_top_left += shift_x
-        self.x_bottom_right += shift_x
-        self.y_top_left += shift_y
-        self.y_bottom_right += shift_y
-        if self.con_coord:
-            self.con_coord.shift(shift_x=shift_x, shift_y=shift_y)
 
-    def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
-                 is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = None,
-                 contour_coord: Optional[BBox] = None) -> None:
+        self.bbox.shift(shift_x=shift_x, shift_y=shift_y)
+        if self.contour_coord:
+            self.contour_coord.shift(shift_x=shift_x, shift_y=shift_y)
 
-        import uuid
+    def __init__(self, bbox: BBox, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
+                 is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: Optional[str] = None,
+                 contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
 
-        assert x_top_left <= x_bottom_right
-        assert y_top_left <= y_bottom_right
+        import uuid
 
-        self.lines = [] if lines is None else lines
-        super().__init__(lines)
+        super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible)
 
-        self.x_top_left = x_top_left
-        self.x_bottom_right = x_bottom_right
-        self.y_top_left = y_top_left
-        self.y_bottom_right = y_bottom_right
+        self.bbox = bbox
         self.id_con = id_con
         self.is_attribute = is_attribute
         self.is_attribute_required = is_attribute_required
         self.rotated_angle = rotated_angle
-        self.cell_uid = f"cell_{uuid.uuid1()}" if uid is None else uid
-        self.con_coord = contour_coord or BBox(0, 0, 0, 0)
-
-    def __str__(self) -> str:
-        return f"Cell((cs={self.colspan}, rs={self.rowspan}, {self.get_text()})"
-
-    def get_text(self) -> str:
-        return "\n".join([line.line for line in self.lines])
-
-    def get_annotations(self) -> List[Annotation]:
-        return LineWithMeta.join(self.lines, delimiter="\n").annotations
+        self.uuid = uuid.uuid4() if uid is None else uid  # test the `uid` argument: the `uuid` module itself is never None
+        self.contour_coord = contour_coord or BBox(0, 0, 0, 0)
 
     def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None:
         from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
@@ -96,11 +64,3 @@ def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_hei
 
     def __repr__(self) -> str:
         return self.__str__()
-
-    @property
-    def width(self) -> int:
-        return self.x_bottom_right - self.x_top_left
-
-    @property
-    def height(self) -> int:
-        return self.y_bottom_right - self.y_top_left
diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py
@@ -1,4 +1,4 @@
-from typing import Any, List, Optional
+from typing import List
 
 from dedocutils.data_structures import BBox
 
@@ -9,106 +9,45 @@
 from dedoc.readers.pdf_reader.data_classes.tables.location import Location
 
 
-class ScanTable:
-    def __init__(self, page_number: int, matrix_cells: Optional[List[List[CellWithMeta]]] = None, bbox: Optional[BBox] = None,
-                 name: str = "", order: int = -1) -> None:
-        self.matrix_cells = matrix_cells
-        self.page_number = page_number
-        self.locations = []
-        self.name = name
+class ScanTable(Table):
+    def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1) -> None:
+
+        super().__init__(cells, TableMetadata(page_id=page_number))
         self.order = order
-        if bbox is not None:
-            self.locations.append(Location(page_number, bbox))
+        self.locations = [Location(page_number, bbox)]
 
     def extended(self, table: "ScanTable") -> None:
         # extend locations
         self.locations.extend(table.locations)
         # extend values
-        self.matrix_cells.extend(table.matrix_cells)
+        self.cells.extend(table.cells)
         # extend order
         self.order = max(self.order, table.order)
 
     def check_on_cell_instance(self) -> bool:
-        if len(self.matrix_cells) == 0:
+        if len(self.cells) == 0:
             return False
-        if len(self.matrix_cells[0]) == 0:
+        if len(self.cells[0]) == 0:
             return False
-        if not isinstance(self.matrix_cells[0][0], Cell):
+        if not isinstance(self.cells[0][0], Cell):
             return False
         return True
 
-    def to_table(self) -> Table:
-        metadata = TableMetadata(page_id=self.page_number, uid=self.name, rotated_angle=self.location.rotated_angle)
-        cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in self.matrix_cells]
-        return Table(metadata=metadata, cells=cells_with_meta)
-
-    @staticmethod
-    def get_cells_text(attr_cells: List[List[Cell]]) -> List[List[str]]:
-        attrs = []
-        for i in range(0, len(attr_cells)):
-            attrs.append([a.get_text() for a in attr_cells[i]])
-
-        return attrs
-
-    @staticmethod
-    def get_key_value_attrs(attrs: List, val: Any) -> dict:  # noqa
-        res_attrs = []
-        for i in range(0, len(attrs)):
-            res_attrs.append({"attr": attrs[i]})
-        res = {
-            "attrs": res_attrs,
-            "val": val
-        }
-        return res
-
-    @staticmethod
-    def get_index_of_end_string_attr(matrix_cells: List[List[Cell]]) -> int:
-        end_attr_string = 0
-        for i in range(0, len(matrix_cells)):
-            if matrix_cells[i][0].is_attribute:
-                end_attr_string = i
-
-        return end_attr_string
-
-    @staticmethod
-    def get_attributes_cell(matrix_cells: List[List[Cell]]) -> (List[int], List[List[Cell]], int):
-        import copy
-        import numpy as np
-
-        required_columns = []
-        for j in range(0, len(matrix_cells[0])):
-            if matrix_cells[0][j].is_attribute_required:
-                required_columns.append(j)
-
-        end_attr_string = ScanTable.get_index_of_end_string_attr(matrix_cells)
-
-        attrs = copy.deepcopy(np.array(matrix_cells[0:end_attr_string + 1]))
-        attrs = attrs.transpose().tolist()
-
-        return [required_columns, attrs, end_attr_string]
-
-    @staticmethod
-    def get_matrix_attrs_and_data(matrix_cells: List[List[Cell]]) -> (List[List[Cell]], List[List[str]], List[List[str]]):
-        required_columns, attrs, end_attr_string = ScanTable.get_attributes_cell(matrix_cells)
-        attrs_text = ScanTable.get_cells_text(attrs)
-
-        data = matrix_cells[(end_attr_string + 1):]
-        data_text = ScanTable.get_cells_text(data)
-
-        return [attrs, attrs_text, data_text]
+    def __get_cells_text(self, cells: List[List[CellWithMeta]]) -> List[List[str]]:
+        return [[cell.get_text() for cell in row] for row in cells]
 
     @property
     def location(self) -> Location:
         return min(self.locations)
 
     @property
     def uid(self) -> str:
-        return self.name
+        return self.metadata.uid
 
     def to_dict(self) -> dict:
         from collections import OrderedDict
 
-        data_text = ScanTable.get_cells_text(self.matrix_cells)
+        data_text = self.__get_cells_text(self.cells)
 
         res = OrderedDict()
         res["locations"] = [location.to_dict() for location in self.locations]

diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -15,8 +15,6 @@
 
 
 ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
-    "orient_analysis_cells",
-    "orient_cell_angle",
     "is_one_column_document",
     "document_orientation",
     "language",
@@ -73,8 +71,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
 
         params_for_parse = ParametersForParseDoc(
             language=param_utils.get_param_language(parameters),
-            orient_analysis_cells=param_utils.get_param_orient_analysis_cells(parameters),
-            orient_cell_angle=param_utils.get_param_orient_cell_angle(parameters),
             is_one_column_document=param_utils.get_param_is_one_column_document(parameters),
             document_orientation=param_utils.get_param_document_orientation(parameters),
             need_header_footers_analysis=param_utils.get_param_need_header_footers_analysis(parameters),
@@ -91,12 +87,11 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         )
 
         lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse)
-        tables = [scan_table.to_table() for scan_table in scan_tables]
 
         if params_for_parse.with_attachments and self.attachment_extractor.can_extract(file_path):
             attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters)
 
-        result = UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=warnings, metadata=metadata)
+        result = UnstructuredDocument(lines=lines, tables=scan_tables, attachments=attachments, warnings=warnings, metadata=metadata)
         return self._postprocess(result)
 
     def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
@@ -177,7 +172,7 @@ def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[Scan
                 table_page_number = location.page_number
                 location.shift(shift_x=gost_analyzed_images[table_page_number][1].x_top_left, shift_y=gost_analyzed_images[table_page_number][1].y_top_left)
             page_number = scan_table.locations[0].page_number
-            for row in scan_table.matrix_cells:
+            for row in scan_table.cells:
                 for cell in row:
                     image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
                     shift_x, shift_y = (gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left)
@@ -275,16 +270,3 @@ def _binarization(self, gray_image: ndarray) -> ndarray:
             binary_mask = gray_image >= np.quantile(gray_image, 0.05)
             gray_image[binary_mask] = 255
         return gray_image
-
-    def eval_tables_by_batch(self,
-                             batch: Iterator[ndarray],
-                             page_number_begin: int,
-                             language: str,
-                             orient_analysis_cells: bool = False,
-                             orient_cell_angle: int = 270,
-                             table_type: str = "") -> Tuple[List[ndarray], List[ScanTable]]:
-        from joblib import Parallel, delayed
-
-        result_batch = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.table_recognizer.recognize_tables_from_image)(
-            image, page_number_begin + i, language, orient_analysis_cells, orient_cell_angle, table_type) for i, image in enumerate(batch))
-        return result_batch
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
@@ -85,8 +85,6 @@ def _process_one_page(self,
                 image=rotated_image,
                 page_number=page_number,
                 language=parameters.language,
-                orient_analysis_cells=parameters.orient_analysis_cells,
-                orient_cell_angle=parameters.orient_cell_angle,
                 table_type=parameters.table_type
             )
         else: