From 1a02c48a4c71b52c7963f118ac64441fdce9daf3 Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Tue, 17 Dec 2024 19:05:09 +0300 Subject: [PATCH] TLDR-861 fixes after review --- dedoc/data_structures/cell_with_meta.py | 2 +- .../pdf_reader/data_classes/tables/cell.py | 10 +- .../data_classes/tables/scantable.py | 5 +- dedoc/readers/pdf_reader/pdf_base_reader.py | 1 - .../split_last_hor_union_cells.py | 9 +- .../onepage_table_extractor.py | 4 +- .../table_attribute_extractor.py | 51 ++---- .../table_recognizer/table_recognizer.py | 9 +- .../table_utils/accuracy_table_rec.py | 148 ------------------ .../table_recognizer/table_utils/utils.py | 14 ++ .../pdf_txtlayer_reader/pdf_tabby_reader.py | 6 +- docs/source/dedoc_api_usage/api.rst | 19 --- docs/source/parameters/pdf_handling.rst | 24 --- .../unit_tests/test_module_table_detection.py | 15 +- 14 files changed, 54 insertions(+), 263 deletions(-) delete mode 100644 dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py diff --git a/dedoc/data_structures/cell_with_meta.py b/dedoc/data_structures/cell_with_meta.py index 1ef652b0..03ee0c67 100644 --- a/dedoc/data_structures/cell_with_meta.py +++ b/dedoc/data_structures/cell_with_meta.py @@ -48,7 +48,7 @@ def get_annotations(self) -> List[Annotation]: return LineWithMeta.join(lines=self.lines, delimiter="\n").annotations def __str__(self) -> str: - return f"CellWithMeta((cs={self.colspan}, rs={self.rowspan}, {self.get_text()})" + return f"CellWithMeta(cs={self.colspan}, rs={self.rowspan}, {self.get_text()})" def to_api_schema(self) -> ApiCellWithMeta: import numpy as np diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py index b2b28bf2..d83e2b6c 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/cell.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/cell.py @@ -23,16 +23,15 @@ def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) line.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height) self.bbox.shift(shift_x=shift_x, shift_y=shift_y) - if self.con_coord: - self.con_coord.shift(shift_x=shift_x, shift_y=shift_y) + if self.contour_coord: + self.contour_coord.shift(shift_x=shift_x, shift_y=shift_y) def __init__(self, bbox: BBox, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None, - is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = Optional[None], + is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: Optional[str] = None, contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None: import uuid - self.lines = [] if lines is None else lines super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible) self.bbox = bbox @@ -40,9 +39,8 @@ def __init__(self, bbox: BBox, id_con: int = -1, lines: Optional[List[LineWithMe self.is_attribute = is_attribute self.is_attribute_required = is_attribute_required self.rotated_angle = rotated_angle - self.uuid = uuid.uuid4() if uuid is None else uid - self.con_coord = contour_coord or BBox(0, 0, 0, 0) + self.contour_coord = contour_coord or BBox(0, 0, 0, 0) def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None: from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py index fa60aaeb..9ae91c18 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py @@ -33,8 +33,7 @@ def check_on_cell_instance(self) -> bool: return False return True - @staticmethod - def get_cells_text(cells: List[List[CellWithMeta]]) -> List[List[str]]: + def __get_cells_text(self, cells: List[List[CellWithMeta]]) -> List[List[str]]: return [[cell.get_text() for cell in row] for row in cells] @property @@ -48,7 +47,7 @@ def uid(self) -> str: def to_dict(self) -> dict: from collections import OrderedDict - data_text = ScanTable.get_cells_text(self.cells) + data_text = self.__get_cells_text(self.cells) res = OrderedDict() res["locations"] = [location.to_dict() for location in self.locations] diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 41e2990f..3a6e29ef 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -87,7 +87,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure ) lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse) - # tables = [scan_table.to_table() for scan_table in scan_tables] if params_for_parse.with_attachments and self.attachment_extractor.can_extract(file_path): attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py index e80769e0..8dd0bbac 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py @@ -130,8 +130,8 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image x_left = union_cell[0].bbox.x_top_left + eps x_right = union_cell[-1].bbox.x_bottom_right # get y coordinate from cell before union cell - y_top_split = cell_splitter.con_coord.y_top_left - y_bottom_split = cell_splitter.con_coord.y_top_left + cell_splitter.con_coord.height + y_top_split = cell_splitter.contour_coord.y_top_left + y_bottom_split = cell_splitter.contour_coord.y_top_left + cell_splitter.contour_coord.height if abs(y_bottom_split - y_top_split) < 10: for cell in union_cell: cell.lines = [] @@ -162,9 +162,8 @@ def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarra for line in list(ocr_result.lines): text_line = OCRCellExtractor.get_line_with_meta("") for word in line.words: - # do absolute coordinate on src_image (inside src_image) - word.bbox.shift(shift_x=-padding_cell_value, shift_y=-padding_cell_value) - word.bbox.shift(shift_x=cell_bbox.x_top_left, shift_y=cell_bbox.y_top_left) + # do absolute coordinates on src_image (inside src_image) + word.bbox.shift(shift_x=cell_bbox.x_top_left - padding_cell_value, shift_y=cell_bbox.y_top_left - padding_cell_value) # add space between words if len(text_line) != 0: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py index c7c59414..6271b2ac 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py @@ -23,7 +23,7 @@ def __init__(self, *, config: dict, logger: logging.Logger) -> None: self.image = None self.page_number = 0 - self.table_header_selector = TableHeaderExtractor(logger=self.logger) + self.table_header_extractor = TableHeaderExtractor(logger=self.logger) self.count_vertical_extended = 0 self.splitter = CellSplitter() self.table_options = TableTypeAdditionalOptions() @@ -108,7 +108,7 @@ def handle_cells(self, cells: List[List[Cell]], table_type: str = "") -> List[Li if self.table_options.split_last_column in table_type: cells = split_last_column(cells, language=self.language, image=self.image) - self.table_header_selector.set_header_cells(cells) + self.table_header_extractor.set_header_cells(cells) if self.config.get("debug_mode", False): self._print_table_attr(cells) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py index e25dbd2d..99420036 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py @@ -68,25 +68,23 @@ def clear_attributes(matrix_table: List[List[Cell]]) -> None: def __is_indexable_column(self, matrix_table: List[List[Cell]], column_id: int, max_raw_of_search: int) -> bool: # № п/п - for i in range(0, max_raw_of_search + 1): + for i in range(max_raw_of_search + 1): if column_id < len(matrix_table[i]) and "№" in matrix_table[i][column_id].get_text() and len( matrix_table[i][column_id].get_text()) < len("№ п/п\n"): return True return False def __set_attributes_for_type_top(self, cells: List[List[Cell]]) -> List[List[Cell]]: - vertical_union_columns = self.__analyze_attr_for_vertical_union_columns(cells) horizontal_union_rows = self.__analyze_attr_for_horizontal_union_raws(cells) - # simple table - if (0 not in horizontal_union_rows) and len(vertical_union_columns) == 0: + if 0 not in horizontal_union_rows: self.__analyze_attr_for_simple_table(cells) return cells def __is_empty_column(self, matrix_table: List[List[Cell]], column_id: int) -> bool: all_empty = True - for i in range(0, len(matrix_table)): + for i in range(len(matrix_table)): if len(matrix_table[i]) <= column_id: break if matrix_table[i][column_id].get_text() != "": @@ -96,37 +94,17 @@ def __is_empty_column(self, matrix_table: List[List[Cell]], column_id: int) -> b def __is_empty_row(self, matrix_table: List[List[Cell]], row_index: int) -> bool: all_empty = True - for j in range(0, len(matrix_table[row_index])): + for j in range(len(matrix_table[row_index])): if matrix_table[row_index][j].get_text() != "": all_empty = False break return all_empty - def __analyze_attr_for_vertical_union_columns(self, cells: List[List[Cell]]) -> List[int]: - vertical_union_columns = [] - if len(vertical_union_columns) != 0 and len(cells) > 1: - self.logger.debug("ATTR_TYPE: vertical union table") - row_max_attr = 1 - - # Установка атрибутов таблицы - for i in range(0, row_max_attr): - for j in range(0, len(cells[i])): - cells[i][j].is_attribute = True - - # Установка обязательных атрибутов - cells[0][0].is_attribute_required = True - for j in range(1, len(cells[0])): - is_attribute_required = True - if is_attribute_required: - cells[0][j].is_attribute_required = True - - return vertical_union_columns - def __analyze_attr_for_horizontal_union_raws(self, cells: List[List[Cell]]) -> List[int]: horizontal_union_rows = [] union_first = False - for i in range(0, len(cells)): + for i in range(len(cells)): if len(horizontal_union_rows) > 0 and i not in horizontal_union_rows: horizontal_union_rows.append(i) if not self.__is_empty_row(cells, i): @@ -134,8 +112,8 @@ def __analyze_attr_for_horizontal_union_raws(self, cells: List[List[Cell]]) -> L if union_first and len(horizontal_union_rows) != 0: self.logger.debug("ATTR_TYPE: horizontal_union_rows") - for i in range(0, len(horizontal_union_rows)): - for j in range(0, len(cells[i])): + for i in range(len(horizontal_union_rows)): + for j in range(len(cells[i])): cells[i][j].is_attribute = True cells[0][0].is_attribute_required = True first_required_column = 0 @@ -160,20 +138,19 @@ def __analyze_attr_for_horizontal_union_raws(self, cells: List[List[Cell]]) -> L def __analyze_attr_for_simple_table(self, cells: List[List[Cell]]) -> None: self.logger.debug("ATTR_TYPE: simple table") - for j in range(0, len(cells[0])): - cells[0][j].is_attribute = True + for cell in cells[0]: + cell.is_attribute = True + # set first required column - j = 0 - first_required_column = j - while j < len(cells[0]): + first_required_column = 0 + for j in range(len(cells[0])): if not self.__is_empty_column(cells, j): cells[0][j].is_attribute_required = True first_required_column = j break - j += 1 # search indexable_column - # один один столбец должен быть (0) - нумерованным, - # один (1) - с обязательными поляями, один (2) - с необязательными + # один столбец должен быть (0) - нумерованным, + # один (1) - с обязательными полями, один (2) - с необязательными # поэтому len(matrix_table) > first_required_column + 2 if self.__is_indexable_column(cells, first_required_column, 0) and len(cells) > first_required_column + 2: cells[0][first_required_column + 1].is_attribute_required = True diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py index eb07732d..11c30cab 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py @@ -21,9 +21,7 @@ class TableRecognizer(object): def __init__(self, *, config: dict = None) -> None: - self.logger = config.get("logger", logging.getLogger()) - self.onepage_tables_extractor = OnePageTableExtractor(config=config, logger=self.logger) self.multipage_tables_extractor = MultiPageTableExtractor(config=config, logger=self.logger) self.config = config @@ -109,11 +107,8 @@ def __if_not_table(self, table: ScanTable, image: np.ndarray) -> bool: std = table_image.std() white_mean = (table_image > 225).mean() black_mean = (table_image < 225).mean() - table_area = bbox.width * bbox.height - cells_area = 0 - for row in table.cells: - for cell in row: - cells_area += cell.bbox.width * cell.bbox.height + table_area = bbox.square + cells_area = sum([cell.bbox.square for row in table.cells for cell in row]) ratio = cells_area / table_area res = (white_mean < 0.5) or (black_mean > 0.3) or (std < 30) or (mean < 150) or (mean < 200 and std < 80) or ratio < 0.65 diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py deleted file mode 100644 index c98d71a5..00000000 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py +++ /dev/null @@ -1,148 +0,0 @@ -import csv -import json -import os -from typing import List, Tuple - -import cv2 -from dedocutils.data_structures import BBox - -from dedoc.config import get_config -from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell -from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable -from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader - - -def _create_cell(c: str, text_cells: list) -> Cell: - cell = Cell(BBox(x_top_left=-1, y_top_left=-1, width=0, height=0)) - if "a" in c: - cell.is_attribute = True - # loading cell text - if len(text_cells) != 0: - cell_text = [r for r in text_cells if r[0] == c] - if len(cell_text) != 0: - cell.text = cell_text[0][-1] - return cell - - -def load_from_csv(path_csv: str, path_class_2_csv: str = "") -> List[List[Cell]]: - text_cells = [] - if path_class_2_csv != "": - csv_file_class_2 = open(path_class_2_csv, "r", newline="") - reader_class_2 = csv.reader(csv_file_class_2) - text_cells = [r for r in reader_class_2] - - matrix = [] - with open(path_csv, "r", newline="") as csv_file: - reader = csv.reader(csv_file) - - for raw in reader: - if len(raw) >= 5 and raw[0] == "bbox": - pass - else: - line = [_create_cell(c, text_cells) for c in raw if c != ""] - if len(line) != 0: - matrix.append(line) - return matrix - - -def get_quantitative_parameters(matrix: List[List[Cell]]) -> Tuple[int, int, int, int]: - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = 0, 0, 0, 0 - - # calculating data - if len(matrix) > 0: - cnt_columns = len(matrix[0]) - cnt_rows = len(matrix) - - for i in range(0, len(matrix)): - for j in range(0, len(matrix[i])): - if matrix[i][j].is_attribute: - cnt_a_cell += 1 - - cnt_cell += 1 - - return cnt_a_cell, cnt_cell, cnt_columns, cnt_rows - - -def calc_agreement(matrix_gt: List[List[Cell]], matrix: List[List[Cell]]) -> float: - q_params = get_quantitative_parameters(matrix) - q_params_gt = get_quantitative_parameters(matrix_gt) - - equal_indexes = [i for i in range(0, len(q_params)) if q_params[i] == q_params_gt[i]] - - agreement = 1.0 * len(equal_indexes) / len(q_params_gt) - return agreement - - -def draw_recognized_cell(tables: List[ScanTable], path_image: str, path_save: str) -> None: - img = cv2.imread(path_image) - for t_index in range(0, len(tables)): - table = tables[t_index].cells - bbox = tables[t_index].locations.location - blue_color, green_color, red_color = (255, 0, 0), (0, 255, 0), (0, 0, 255) - cv2.rectangle(img, (bbox.x_top_left, bbox.y_top_left), (bbox.width, bbox.height), blue_color, 6) - for i in range(0, len(table)): - for j in range(0, len(table[i])): - cv2.rectangle(img, - (table[i][j].bbox.x_top_left, table[i][j].bbox.y_top_left), - (table[i][j].bbox.x_bottom_right, table[i][j].bbox.y_bottom_right), - red_color, 4 - ) - cv2.putText(img, str(table[i][j].id_con), - (table[i][j].bbox.x_top_left, table[i][j].bbox.y_bottom_right), - cv2.FONT_HERSHEY_PLAIN, 4, green_color - ) - cv2.imwrite(path_save, img) - - -def save_json(tables: List[ScanTable], number_test_string: str, path_output: str) -> None: - for i in range(0, len(tables)): - with open(f"{path_output}{number_test_string}_table_{i}.json", "w") as out: - json.dump(tables[i].to_dict(), out, ensure_ascii=False, indent=2) - - -def calc_accuracy(path_image: str, path_gt_struct: str, path_gt_text: str, path_save_image: str, path_save_json: str) -> None: - from os import listdir - from os.path import isfile, join - - os.makedirs(path_save_image, exist_ok=True) - os.makedirs(path_save_json, exist_ok=True) - - image_files = [f for f in listdir(path_image) if isfile(join(path_image, f))] - agreements = [] - - for image_file in image_files: - name_example = image_file.split(".")[0].split("_")[0] - # predict tables - image = cv2.imread(path_image + image_file, 0) - # TODO fix this - clean_images, tables = PdfImageReader(config=get_config()).get_tables([image]) - draw_recognized_cell(tables, path_image + image_file, path_save_image + image_file) - save_json(tables, name_example, path_save_json) - - gt_files = [f for f in listdir(path_gt_struct) if isfile(join(path_gt_struct, f)) and name_example + "_" in f] - for index_table in range(0, len(gt_files)): - - csv_filename = path_gt_struct + name_example + "_" + str(index_table + 1) + ".csv" - csv_text_filename = path_gt_text + name_example + "_" + str(index_table + 1) + "_text.csv" - if os.path.exists(csv_filename): - if not os.path.exists(csv_text_filename): - csv_text_filename = "" - # load_GT - matrix_cell_gt = load_from_csv(csv_filename, csv_text_filename) - # calc agreement - if len(tables) == 0 and matrix_cell_gt == []: - agreements.append(1.0) - elif len(tables) <= index_table: - agreements.append(0) - else: - agreement = calc_agreement(matrix_cell_gt, tables[index_table].cells) - agreements.append(agreement) - - -if __name__ == "__main__": - current_path = os.path.dirname(__file__) + "/" - calc_accuracy(current_path + "../../backend/test_dataset_table/images/", - current_path + "../../backend/test_dataset_table/GT_struct/", - current_path + "../../backend/test_dataset_table/GT_text/", - "/tmp/backend_claw/out_tables/acc/draw_tables/", - "/tmp/backend_claw/out_tables/acc/json_tables/") diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py index 80ac01e7..693b8417 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py @@ -1,5 +1,9 @@ +from typing import List, Tuple + import numpy as np +from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell + def equal_with_eps(x: int, y: int, eps: int = 10) -> bool: return y + eps >= x >= y - eps @@ -24,3 +28,13 @@ def similarity(s1: str, s2: str) -> float: normalized2 = s2.lower() matcher = difflib.SequenceMatcher(None, normalized1, normalized2) return matcher.ratio() + + +def get_statistic_values(cells: List[List[Cell]]) -> Tuple[int, int, int, int]: + + cnt_rows = len(cells) + cnt_columns = len(cells[0]) if cnt_rows else 0 + cnt_cell = cnt_columns * cnt_rows + cnt_attr_cell = len([cell for row in cells for cell in row if cell.is_attribute]) + + return cnt_attr_cell, cnt_cell, cnt_columns, cnt_rows diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index cce14d01..b60cbed7 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -14,8 +14,6 @@ from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import OnePageTableExtractor -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableHeaderExtractor class PdfTabbyReader(PdfBaseReader): @@ -31,6 +29,10 @@ class PdfTabbyReader(PdfBaseReader): def __init__(self, *, config: Optional[dict] = None) -> None: import os from dedoc.extensions import recognized_extensions, recognized_mimes + from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import \ + OnePageTableExtractor + from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import \ + TableHeaderExtractor super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format) self.tabby_java_version = "2.0.0" diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst index c357ac78..59310477 100644 --- a/docs/source/dedoc_api_usage/api.rst +++ b/docs/source/dedoc_api_usage/api.rst @@ -162,25 +162,6 @@ Api parameters description If the document has a textual layer, it is recommended to use ``pdf_with_text_layer=tabby``, in this case tables will be parsed much easier and faster. - * - orient_analysis_cells - - true, false - - false - - This option is used for a table recognition in case of PDF documents without a textual layer - (images, scanned documents or when ``pdf_with_text_layer`` is ``true``, ``false`` or ``auto``). - When set to ``true``, it enables analysis of rotated cells in table headers. - Use this option if you are sure that the cells of the table header are rotated. - - * - orient_cell_angle - - 90, 270 - - 90 - - This option is used for a table recognition in case of PDF documents without a textual layer - (images, scanned documents or when ``pdf_with_text_layer`` is ``true``, ``false`` or ``auto``). - It is ignored when ``orient_analysis_cells=false``. - The option is used to set orientation of cells in table headers: - - * **270** -- cells are rotated 90 degrees clockwise; - * **90** -- cells are rotated 90 degrees counterclockwise (or 270 clockwise). - * - :cspan:`3` **PDF handling** * - pdf_with_text_layer diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst index 20fabec9..46c03416 100644 --- a/docs/source/parameters/pdf_handling.rst +++ b/docs/source/parameters/pdf_handling.rst @@ -161,30 +161,6 @@ PDF and images handling It allows :class:`dedoc.readers.PdfImageReader`, :class:`dedoc.readers.PdfTxtlayerReader` and :class:`dedoc.readers.PdfTabbyReader` to properly process the content of the document containing GOST frame, see :ref:`gost_frame_handling` for more details. - * - orient_analysis_cells - - True, False - - False - - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` - * :meth:`dedoc.readers.ReaderComposition.read` - - This option is used for a table recognition for PDF documents or images. - It is ignored when ``need_pdf_table_analysis=False``. - When set to ``True``, it enables analysis of rotated cells in table headers. - Use this option if you are sure that the cells of the table header are rotated. - - * - orient_cell_angle - - 90, 270 - - 90 - - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` - * :meth:`dedoc.readers.ReaderComposition.read` - - This option is used for a table recognition for PDF documents or images. - It is ignored when ``need_pdf_table_analysis=False`` or ``orient_analysis_cells=False``. - The option is used to set orientation of cells in table headers: - - * **270** -- cells are rotated 90 degrees clockwise; - * **90** -- cells are rotated 90 degrees counterclockwise (or 270 clockwise). - .. toctree:: :maxdepth: 1 diff --git a/tests/unit_tests/test_module_table_detection.py b/tests/unit_tests/test_module_table_detection.py index 39b1b4dc..29d2e8da 100644 --- a/tests/unit_tests/test_module_table_detection.py +++ b/tests/unit_tests/test_module_table_detection.py @@ -7,13 +7,12 @@ from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.accuracy_table_rec import get_quantitative_parameters -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.utils import equal_with_eps, similarity as utils_similarity +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.utils import equal_with_eps, get_statistic_values, similarity as sim from tests.test_utils import get_full_path, get_test_config def similarity(s1: str, s2: str, threshold: float = 0.8) -> bool: - return True if utils_similarity(s1, s2) > threshold else False + return True if sim(s1, s2) > threshold else False class TestRecognizedTable(unittest.TestCase): @@ -110,7 +109,7 @@ def test_table_recognition_1(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table3.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_statistic_values(tables[0].cells) self.assertEqual(cnt_rows, 8) self.assertEqual(cnt_columns, 3) @@ -125,7 +124,7 @@ def test_table_recognition_2(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table4.jpg"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_statistic_values(tables[0].cells) self.assertEqual(cnt_rows, 5) self.assertEqual(cnt_columns, 3) @@ -140,7 +139,7 @@ def test_table_recognition_3(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table5.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_statistic_values(tables[0].cells) self.assertEqual(cnt_rows, 13) self.assertEqual(cnt_columns, 3) @@ -155,7 +154,7 @@ def test_table_recognition_4(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table5.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_statistic_values(tables[0].cells) self.assertEqual(cnt_rows, 13) self.assertEqual(cnt_columns, 3) @@ -170,7 +169,7 @@ def test_table_recognition_with_rotate_5(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table6.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_statistic_values(tables[0].cells) self.assertEqual(cnt_rows, 3) self.assertEqual(cnt_columns, 7)