Skip to content

Commit

Permalink
TLDR-861 fixes after review
Browse files Browse the repository at this point in the history
  • Loading branch information
oksidgy committed Dec 17, 2024
1 parent 75db1a7 commit 1a02c48
Show file tree
Hide file tree
Showing 14 changed files with 54 additions and 263 deletions.
2 changes: 1 addition & 1 deletion dedoc/data_structures/cell_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def get_annotations(self) -> List[Annotation]:
return LineWithMeta.join(lines=self.lines, delimiter="\n").annotations

def __str__(self) -> str:
    """Return a short debug representation with the colspan, the rowspan and the cell text."""
    # a single, balanced pair of parentheses wraps the fields
    return f"CellWithMeta(cs={self.colspan}, rs={self.rowspan}, {self.get_text()})"

def to_api_schema(self) -> ApiCellWithMeta:
import numpy as np
Expand Down
10 changes: 4 additions & 6 deletions dedoc/readers/pdf_reader/data_classes/tables/cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,26 +23,24 @@ def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int)
line.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)

self.bbox.shift(shift_x=shift_x, shift_y=shift_y)
if self.con_coord:
self.con_coord.shift(shift_x=shift_x, shift_y=shift_y)
if self.contour_coord:
self.contour_coord.shift(shift_x=shift_x, shift_y=shift_y)

def __init__(self, bbox: BBox, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
             is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: Optional[str] = None,
             contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
    """
    Store the geometry and the header flags of a table cell.

    :param bbox: bounding box of the cell
    :param id_con: stored as ``self.id_con`` (-1 by default)
    :param lines: text lines of the cell, an empty list is used when None is given
    :param is_attribute: stored as ``self.is_attribute``
    :param is_attribute_required: stored as ``self.is_attribute_required``
    :param rotated_angle: stored as ``self.rotated_angle``
    :param uid: identifier of the cell, a fresh ``uuid.uuid4()`` is generated when None is given
    :param contour_coord: contour box of the cell, ``BBox(0, 0, 0, 0)`` is used when it is missing
    :param colspan: forwarded to the parent initializer
    :param rowspan: forwarded to the parent initializer
    :param invisible: forwarded to the parent initializer
    """
    import uuid

    # NOTE(review): the parent initializer also receives `lines`, so this assignment may be redundant - confirm
    self.lines = [] if lines is None else lines
    super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible)

    self.bbox = bbox
    self.id_con = id_con
    self.is_attribute = is_attribute
    self.is_attribute_required = is_attribute_required
    self.rotated_angle = rotated_angle

    # the check must look at the `uid` argument: the imported `uuid` module is never None,
    # so testing the module would always keep the (possibly missing) argument
    self.uuid = uuid.uuid4() if uid is None else uid
    self.contour_coord = contour_coord or BBox(0, 0, 0, 0)

def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None:
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
Expand Down
5 changes: 2 additions & 3 deletions dedoc/readers/pdf_reader/data_classes/tables/scantable.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ def check_on_cell_instance(self) -> bool:
return False
return True

def __get_cells_text(self, cells: List[List[CellWithMeta]]) -> List[List[str]]:
    """
    Collect the text of every cell, keeping the row layout of ``cells``.

    :param cells: rows of cells
    :return: rows of strings, one string per cell (the result of ``get_text()``)
    """
    return [[cell.get_text() for cell in row] for row in cells]

@property
Expand All @@ -48,7 +47,7 @@ def uid(self) -> str:
def to_dict(self) -> dict:
from collections import OrderedDict

data_text = ScanTable.get_cells_text(self.cells)
data_text = self.__get_cells_text(self.cells)

res = OrderedDict()
res["locations"] = [location.to_dict() for location in self.locations]
Expand Down
1 change: 0 additions & 1 deletion dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
)

lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse)
# tables = [scan_table.to_table() for scan_table in scan_tables]

if params_for_parse.with_attachments and self.attachment_extractor.can_extract(file_path):
attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,8 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image
x_left = union_cell[0].bbox.x_top_left + eps
x_right = union_cell[-1].bbox.x_bottom_right
# get y coordinate from cell before union cell
y_top_split = cell_splitter.con_coord.y_top_left
y_bottom_split = cell_splitter.con_coord.y_top_left + cell_splitter.con_coord.height
y_top_split = cell_splitter.contour_coord.y_top_left
y_bottom_split = cell_splitter.contour_coord.y_top_left + cell_splitter.contour_coord.height
if abs(y_bottom_split - y_top_split) < 10:
for cell in union_cell:
cell.lines = []
Expand Down Expand Up @@ -162,9 +162,8 @@ def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarra
for line in list(ocr_result.lines):
text_line = OCRCellExtractor.get_line_with_meta("")
for word in line.words:
# do absolute coordinate on src_image (inside src_image)
word.bbox.shift(shift_x=-padding_cell_value, shift_y=-padding_cell_value)
word.bbox.shift(shift_x=cell_bbox.x_top_left, shift_y=cell_bbox.y_top_left)
# do absolute coordinates on src_image (inside src_image)
word.bbox.shift(shift_x=cell_bbox.x_top_left - padding_cell_value, shift_y=cell_bbox.y_top_left - padding_cell_value)

# add space between words
if len(text_line) != 0:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(self, *, config: dict, logger: logging.Logger) -> None:

self.image = None
self.page_number = 0
self.table_header_selector = TableHeaderExtractor(logger=self.logger)
self.table_header_extractor = TableHeaderExtractor(logger=self.logger)
self.count_vertical_extended = 0
self.splitter = CellSplitter()
self.table_options = TableTypeAdditionalOptions()
Expand Down Expand Up @@ -108,7 +108,7 @@ def handle_cells(self, cells: List[List[Cell]], table_type: str = "") -> List[Li
if self.table_options.split_last_column in table_type:
cells = split_last_column(cells, language=self.language, image=self.image)

self.table_header_selector.set_header_cells(cells)
self.table_header_extractor.set_header_cells(cells)

if self.config.get("debug_mode", False):
self._print_table_attr(cells)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,25 +68,23 @@ def clear_attributes(matrix_table: List[List[Cell]]) -> None:

def __is_indexable_column(self, matrix_table: List[List[Cell]], column_id: int, max_raw_of_search: int) -> bool:
    """
    Check whether one of the first rows holds a short "№" header in the given column.

    :param matrix_table: rows of cells
    :param column_id: index of the column to inspect
    :param max_raw_of_search: index of the last row to inspect (inclusive)
    :return: True if a cell of the column contains "№" and its text is shorter than "№ п/п\n"
    """
    # "№ п/п" presumably is the Russian header of a running-number column - confirm with the authors
    max_header_length = len("№ п/п\n")

    # slicing instead of indexing tolerates tables that are shorter than the search window
    for row in matrix_table[:max(max_raw_of_search + 1, 0)]:
        if column_id >= len(row):
            continue
        text = row[column_id].get_text()
        if "№" in text and len(text) < max_header_length:
            return True
    return False

def __set_attributes_for_type_top(self, cells: List[List[Cell]]) -> List[List[Cell]]:
    """
    Run the header analysis on ``cells`` and return the same matrix.

    The horizontal-union analysis is executed first; when row 0 is not among the rows it reports,
    the table is handled as a simple table.

    :param cells: rows of cells, passed to the analysis helpers
    :return: the ``cells`` argument itself
    """
    horizontal_union_rows = self.__analyze_attr_for_horizontal_union_raws(cells)

    # simple table: the first row does not belong to a horizontal union
    if 0 not in horizontal_union_rows:
        self.__analyze_attr_for_simple_table(cells)

    return cells

def __is_empty_column(self, matrix_table: List[List[Cell]], column_id: int) -> bool:
all_empty = True
for i in range(0, len(matrix_table)):
for i in range(len(matrix_table)):
if len(matrix_table[i]) <= column_id:
break
if matrix_table[i][column_id].get_text() != "":
Expand All @@ -96,46 +94,26 @@ def __is_empty_column(self, matrix_table: List[List[Cell]], column_id: int) -> b

def __is_empty_row(self, matrix_table: List[List[Cell]], row_index: int) -> bool:
    """
    Check that every cell of the row has empty text.

    :param matrix_table: rows of cells
    :param row_index: index of the row to inspect
    :return: True if ``get_text()`` of each cell in the row is an empty string (also for a row without cells)
    """
    return all(cell.get_text() == "" for cell in matrix_table[row_index])

def __analyze_attr_for_vertical_union_columns(self, cells: List[List[Cell]]) -> List[int]:
    """
    Return the indices of vertically united columns.

    No detection is performed: the list of columns was always created empty, so the branch that
    marked header attributes could never run and ``cells`` was never modified. That unreachable
    branch is removed; the result is always an empty list.

    :param cells: rows of cells (not inspected)
    :return: an empty list
    """
    return []

def __analyze_attr_for_horizontal_union_raws(self, cells: List[List[Cell]]) -> List[int]:
horizontal_union_rows = []
union_first = False

for i in range(0, len(cells)):
for i in range(len(cells)):
if len(horizontal_union_rows) > 0 and i not in horizontal_union_rows:
horizontal_union_rows.append(i)
if not self.__is_empty_row(cells, i):
break

if union_first and len(horizontal_union_rows) != 0:
self.logger.debug("ATTR_TYPE: horizontal_union_rows")
for i in range(0, len(horizontal_union_rows)):
for j in range(0, len(cells[i])):
for i in range(len(horizontal_union_rows)):
for j in range(len(cells[i])):
cells[i][j].is_attribute = True
cells[0][0].is_attribute_required = True
first_required_column = 0
Expand All @@ -160,20 +138,19 @@ def __analyze_attr_for_horizontal_union_raws(self, cells: List[List[Cell]]) -> L

def __analyze_attr_for_simple_table(self, cells: List[List[Cell]]) -> None:
self.logger.debug("ATTR_TYPE: simple table")
for j in range(0, len(cells[0])):
cells[0][j].is_attribute = True
for cell in cells[0]:
cell.is_attribute = True

# set first required column
j = 0
first_required_column = j
while j < len(cells[0]):
first_required_column = 0
for j in range(len(cells[0])):
if not self.__is_empty_column(cells, j):
cells[0][j].is_attribute_required = True
first_required_column = j
break
j += 1
# search indexable_column
# один один столбец должен быть (0) - нумерованным,
# один (1) - с обязательными поляями, один (2) - с необязательными
# один столбец должен быть (0) - нумерованным,
# один (1) - с обязательными полями, один (2) - с необязательными
# поэтому len(matrix_table) > first_required_column + 2
if self.__is_indexable_column(cells, first_required_column, 0) and len(cells) > first_required_column + 2:
cells[0][first_required_column + 1].is_attribute_required = True
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,7 @@
class TableRecognizer(object):

def __init__(self, *, config: dict = None) -> None:

self.logger = config.get("logger", logging.getLogger())

self.onepage_tables_extractor = OnePageTableExtractor(config=config, logger=self.logger)
self.multipage_tables_extractor = MultiPageTableExtractor(config=config, logger=self.logger)
self.config = config
Expand Down Expand Up @@ -109,11 +107,8 @@ def __if_not_table(self, table: ScanTable, image: np.ndarray) -> bool:
std = table_image.std()
white_mean = (table_image > 225).mean()
black_mean = (table_image < 225).mean()
table_area = bbox.width * bbox.height
cells_area = 0
for row in table.cells:
for cell in row:
cells_area += cell.bbox.width * cell.bbox.height
table_area = bbox.square
cells_area = sum([cell.bbox.square for row in table.cells for cell in row])

ratio = cells_area / table_area
res = (white_mean < 0.5) or (black_mean > 0.3) or (std < 30) or (mean < 150) or (mean < 200 and std < 80) or ratio < 0.65
Expand Down

This file was deleted.

Loading

0 comments on commit 1a02c48

Please sign in to comment.