TLDR-861 fixes after review

ispras · Dec 17, 2024 · 1a02c48
1 parent 75db1a7
commit 1a02c48
Show file tree

Hide file tree

Showing 14 changed files with 54 additions and 263 deletions.
diff --git a/dedoc/data_structures/cell_with_meta.py b/dedoc/data_structures/cell_with_meta.py
@@ -48,7 +48,7 @@ def get_annotations(self) -> List[Annotation]:
         return LineWithMeta.join(lines=self.lines, delimiter="\n").annotations
 
     def __str__(self) -> str:
-        return f"CellWithMeta((cs={self.colspan}, rs={self.rowspan}, {self.get_text()})"
+        return f"CellWithMeta(cs={self.colspan}, rs={self.rowspan}, {self.get_text()})"
 
     def to_api_schema(self) -> ApiCellWithMeta:
         import numpy as np

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py
@@ -23,26 +23,24 @@ def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int)
                 line.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
 
         self.bbox.shift(shift_x=shift_x, shift_y=shift_y)
-        if self.con_coord:
-            self.con_coord.shift(shift_x=shift_x, shift_y=shift_y)
+        if self.contour_coord:
+            self.contour_coord.shift(shift_x=shift_x, shift_y=shift_y)
 
     def __init__(self, bbox: BBox, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
-                 is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = Optional[None],
+                 is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: Optional[str] = None,
                  contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
 
         import uuid
 
-        self.lines = [] if lines is None else lines
         super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible)
 
         self.bbox = bbox
         self.id_con = id_con
         self.is_attribute = is_attribute
         self.is_attribute_required = is_attribute_required
         self.rotated_angle = rotated_angle
-
         self.uuid = uuid.uuid4() if uuid is None else uid
-        self.con_coord = contour_coord or BBox(0, 0, 0, 0)
+        self.contour_coord = contour_coord or BBox(0, 0, 0, 0)
 
     def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None:
         from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py
@@ -33,8 +33,7 @@ def check_on_cell_instance(self) -> bool:
             return False
         return True
 
-    @staticmethod
-    def get_cells_text(cells: List[List[CellWithMeta]]) -> List[List[str]]:
+    def __get_cells_text(self, cells: List[List[CellWithMeta]]) -> List[List[str]]:
         return [[cell.get_text() for cell in row] for row in cells]
 
     @property
@@ -48,7 +47,7 @@ def uid(self) -> str:
     def to_dict(self) -> dict:
         from collections import OrderedDict
 
-        data_text = ScanTable.get_cells_text(self.cells)
+        data_text = self.__get_cells_text(self.cells)
 
         res = OrderedDict()
         res["locations"] = [location.to_dict() for location in self.locations]

diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -87,7 +87,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         )
 
         lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse)
-        # tables = [scan_table.to_table() for scan_table in scan_tables]
 
         if params_for_parse.with_attachments and self.attachment_extractor.can_extract(file_path):
             attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters)

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py
@@ -130,8 +130,8 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image
     x_left = union_cell[0].bbox.x_top_left + eps
     x_right = union_cell[-1].bbox.x_bottom_right
     # get y coordinate from cell before union cell
-    y_top_split = cell_splitter.con_coord.y_top_left
-    y_bottom_split = cell_splitter.con_coord.y_top_left + cell_splitter.con_coord.height
+    y_top_split = cell_splitter.contour_coord.y_top_left
+    y_bottom_split = cell_splitter.contour_coord.y_top_left + cell_splitter.contour_coord.height
     if abs(y_bottom_split - y_top_split) < 10:
         for cell in union_cell:
             cell.lines = []
@@ -162,9 +162,8 @@ def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarra
     for line in list(ocr_result.lines):
         text_line = OCRCellExtractor.get_line_with_meta("")
         for word in line.words:
-            # do absolute coordinate on src_image (inside src_image)
-            word.bbox.shift(shift_x=-padding_cell_value, shift_y=-padding_cell_value)
-            word.bbox.shift(shift_x=cell_bbox.x_top_left, shift_y=cell_bbox.y_top_left)
+            # do absolute coordinates on src_image (inside src_image)
+            word.bbox.shift(shift_x=cell_bbox.x_top_left - padding_cell_value, shift_y=cell_bbox.y_top_left - padding_cell_value)
 
             # add space between words
             if len(text_line) != 0:

diff --git a/...e_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/...e_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
@@ -23,7 +23,7 @@ def __init__(self, *, config: dict, logger: logging.Logger) -> None:
 
         self.image = None
         self.page_number = 0
-        self.table_header_selector = TableHeaderExtractor(logger=self.logger)
+        self.table_header_extractor = TableHeaderExtractor(logger=self.logger)
         self.count_vertical_extended = 0
         self.splitter = CellSplitter()
         self.table_options = TableTypeAdditionalOptions()
@@ -108,7 +108,7 @@ def handle_cells(self, cells: List[List[Cell]], table_type: str = "") -> List[Li
         if self.table_options.split_last_column in table_type:
             cells = split_last_column(cells, language=self.language, image=self.image)
 
-        self.table_header_selector.set_header_cells(cells)
+        self.table_header_extractor.set_header_cells(cells)
 
         if self.config.get("debug_mode", False):
             self._print_table_attr(cells)

diff --git a/...reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py b/...reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py
@@ -68,25 +68,23 @@ def clear_attributes(matrix_table: List[List[Cell]]) -> None:
 
     def __is_indexable_column(self, matrix_table: List[List[Cell]], column_id: int, max_raw_of_search: int) -> bool:
         # № п/п
-        for i in range(0, max_raw_of_search + 1):
+        for i in range(max_raw_of_search + 1):
             if column_id < len(matrix_table[i]) and "№" in matrix_table[i][column_id].get_text() and len(
                     matrix_table[i][column_id].get_text()) < len("№ п/п\n"):
                 return True
         return False
 
     def __set_attributes_for_type_top(self, cells: List[List[Cell]]) -> List[List[Cell]]:
-        vertical_union_columns = self.__analyze_attr_for_vertical_union_columns(cells)
         horizontal_union_rows = self.__analyze_attr_for_horizontal_union_raws(cells)
 
-        # simple table
-        if (0 not in horizontal_union_rows) and len(vertical_union_columns) == 0:
+        if 0 not in horizontal_union_rows:
             self.__analyze_attr_for_simple_table(cells)
 
         return cells
 
     def __is_empty_column(self, matrix_table: List[List[Cell]], column_id: int) -> bool:
         all_empty = True
-        for i in range(0, len(matrix_table)):
+        for i in range(len(matrix_table)):
             if len(matrix_table[i]) <= column_id:
                 break
             if matrix_table[i][column_id].get_text() != "":
@@ -96,46 +94,26 @@ def __is_empty_column(self, matrix_table: List[List[Cell]], column_id: int) -> b
 
     def __is_empty_row(self, matrix_table: List[List[Cell]], row_index: int) -> bool:
         all_empty = True
-        for j in range(0, len(matrix_table[row_index])):
+        for j in range(len(matrix_table[row_index])):
             if matrix_table[row_index][j].get_text() != "":
                 all_empty = False
                 break
         return all_empty
 
-    def __analyze_attr_for_vertical_union_columns(self, cells: List[List[Cell]]) -> List[int]:
-        vertical_union_columns = []
-        if len(vertical_union_columns) != 0 and len(cells) > 1:
-            self.logger.debug("ATTR_TYPE: vertical union table")
-            row_max_attr = 1
-
-            # Установка атрибутов таблицы
-            for i in range(0, row_max_attr):
-                for j in range(0, len(cells[i])):
-                    cells[i][j].is_attribute = True
-
-            # Установка обязательных атрибутов
-            cells[0][0].is_attribute_required = True
-            for j in range(1, len(cells[0])):
-                is_attribute_required = True
-                if is_attribute_required:
-                    cells[0][j].is_attribute_required = True
-
-        return vertical_union_columns
-
     def __analyze_attr_for_horizontal_union_raws(self, cells: List[List[Cell]]) -> List[int]:
         horizontal_union_rows = []
         union_first = False
 
-        for i in range(0, len(cells)):
+        for i in range(len(cells)):
             if len(horizontal_union_rows) > 0 and i not in horizontal_union_rows:
                 horizontal_union_rows.append(i)
                 if not self.__is_empty_row(cells, i):
                     break
 
         if union_first and len(horizontal_union_rows) != 0:
             self.logger.debug("ATTR_TYPE: horizontal_union_rows")
-            for i in range(0, len(horizontal_union_rows)):
-                for j in range(0, len(cells[i])):
+            for i in range(len(horizontal_union_rows)):
+                for j in range(len(cells[i])):
                     cells[i][j].is_attribute = True
             cells[0][0].is_attribute_required = True
             first_required_column = 0
@@ -160,20 +138,19 @@ def __analyze_attr_for_horizontal_union_raws(self, cells: List[List[Cell]]) -> L
 
     def __analyze_attr_for_simple_table(self, cells: List[List[Cell]]) -> None:
         self.logger.debug("ATTR_TYPE: simple table")
-        for j in range(0, len(cells[0])):
-            cells[0][j].is_attribute = True
+        for cell in cells[0]:
+            cell.is_attribute = True
+
         # set first required column
-        j = 0
-        first_required_column = j
-        while j < len(cells[0]):
+        first_required_column = 0
+        for j in range(len(cells[0])):
             if not self.__is_empty_column(cells, j):
                 cells[0][j].is_attribute_required = True
                 first_required_column = j
                 break
-            j += 1
         # search indexable_column
-        # один один столбец должен быть (0) - нумерованным,
-        # один (1) - с обязательными поляями, один (2) - с необязательными
+        # один столбец должен быть (0) - нумерованным,
+        # один (1) - с обязательными полями, один (2) - с необязательными
         # поэтому len(matrix_table) > first_required_column + 2
         if self.__is_indexable_column(cells, first_required_column, 0) and len(cells) > first_required_column + 2:
             cells[0][first_required_column + 1].is_attribute_required = True
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py
@@ -21,9 +21,7 @@
 class TableRecognizer(object):
 
     def __init__(self, *, config: dict = None) -> None:
-
         self.logger = config.get("logger", logging.getLogger())
-
         self.onepage_tables_extractor = OnePageTableExtractor(config=config, logger=self.logger)
         self.multipage_tables_extractor = MultiPageTableExtractor(config=config, logger=self.logger)
         self.config = config
@@ -109,11 +107,8 @@ def __if_not_table(self, table: ScanTable, image: np.ndarray) -> bool:
         std = table_image.std()
         white_mean = (table_image > 225).mean()
         black_mean = (table_image < 225).mean()
-        table_area = bbox.width * bbox.height
-        cells_area = 0
-        for row in table.cells:
-            for cell in row:
-                cells_area += cell.bbox.width * cell.bbox.height
+        table_area = bbox.square
+        cells_area = sum([cell.bbox.square for row in table.cells for cell in row])
 
         ratio = cells_area / table_area
         res = (white_mean < 0.5) or (black_mean > 0.3) or (std < 30) or (mean < 150) or (mean < 200 and std < 80) or ratio < 0.65

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py