From 839557b162e32d9824fd9a0854fa428bc234b580 Mon Sep 17 00:00:00 2001 From: Krishnasis Mandal Date: Thu, 18 Jan 2024 00:19:43 +0530 Subject: [PATCH 1/6] chore: Small optimisations --- hotpdf/memory_map.py | 33 +++++++++++++-------------------- hotpdf/utils.py | 6 +++--- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/hotpdf/memory_map.py b/hotpdf/memory_map.py index ff6993d..dfa574b 100644 --- a/hotpdf/memory_map.py +++ b/hotpdf/memory_map.py @@ -133,34 +133,27 @@ def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) - self.height = self.memory_map.rows @lru_cache - def extract_text_from_bbox(self, x0: float, x1: float, y0: float, y1: float) -> str: + def extract_text_from_bbox(self, x0: int, x1: int, y0: int, y1: int) -> str: """ Extract text within a specified bounding box. Args: - x0 (float): Left x-coordinate of the bounding box. - x1 (float): Right x-coordinate of the bounding box. - y0 (float): Bottom y-coordinate of the bounding box. - y1 (float): Top y-coordinate of the bounding box. + x0 (int): Left x-coordinate of the bounding box. + x1 (int): Right x-coordinate of the bounding box. + y0 (int): Bottom y-coordinate of the bounding box. + y1 (int): Top y-coordinate of the bounding box. Returns: str: Extracted text within the bounding box. """ - cell_x0 = math.floor(x0) - cell_x1 = math.ceil(x1) - cell_y0 = math.floor(y0) - cell_y1 = math.ceil(y1) - - extracted_text = "" - for row in range(cell_y0, cell_y1 + 1): - if 0 <= row < self.memory_map.rows: - row_text = "" - for col in range(cell_x0, cell_x1 + 1): - if 0 <= col < self.memory_map.columns: - row_text += self.memory_map.get(row_idx=row, column_idx=col) - if row_text: - extracted_text += row_text - extracted_text += "\n" + extracted_text: str = "" + for row in range(max(y0, 0), min(y1, self.memory_map.rows - 1) + 1): + row_text: str = "" + row_text = "".join([ + self.memory_map.get(row_idx=row, column_idx=col) for col in range(max(x0, 0), min(x1, self.memory_map.columns - 1) + 1) + ]) + if row_text: + extracted_text += row_text + "\n" return extracted_text diff --git a/hotpdf/utils.py b/hotpdf/utils.py index ea00aa2..069caf5 100644 --- a/hotpdf/utils.py +++ b/hotpdf/utils.py @@ -1,5 +1,4 @@ import math -from copy import deepcopy from typing import Union from .data.classes import ElementDimension, HotCharacter, PageResult @@ -60,7 +59,7 @@ def filter_adjacent_coords(text: list[str], page_hot_character_occurences: PageR for anchor_hot_character in anchor_hot_character_instances: neighbours = [anchor_hot_character] reference_hot_character = anchor_hot_character - for _, coords_j in enumerate(page_hot_character_occurences[1:]): + for coords_j in page_hot_character_occurences[1:]: neighbour_hot_character = find_neighbour_coord( reference_character=reference_hot_character, hot_characters=coords_j, @@ -69,7 +68,8 @@ def filter_adjacent_coords(text: list[str], page_hot_character_occurences: PageR neighbours.append(neighbour_hot_character) reference_hot_character = neighbour_hot_character if len(neighbours) == max_len: - adjacent_groups.append(deepcopy(neighbours)) + adjacent_groups.append(neighbours[:]) + neighbours.clear() neighbours = [] return adjacent_groups From 7434c721d342ace479793e95dc98c97569502131 Mon Sep 17 00:00:00 2001 From: Krishnasis Mandal Date: Thu, 18 Jan 2024 00:23:34 +0530 Subject: [PATCH 2/6] chore: continue in previous step --- hotpdf/memory_map.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/hotpdf/memory_map.py b/hotpdf/memory_map.py index dfa574b..cc0a77a 100644 --- a/hotpdf/memory_map.py +++ b/hotpdf/memory_map.py @@ -102,7 +102,9 @@ def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) - chars = self.__get_page_chars(page) for char in chars: char_bbox = char.attrib["bbox"] - char_x0, char_y0, char_x1, _ = map(float, char_bbox.split()) + char_x0, char_y0, char_x1, char_y1 = map(float, char_bbox.split()) + if any([char_x0 < 0, char_y0 < 0, char_x1 < 0, char_y1 < 0]): + continue char_c = char.attrib["c"] char_span_id = char.attrib.get("span_id") cell_x = math.floor(char_x0) @@ -115,9 +117,6 @@ def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) - x_end=cell_x_end, span_id=char_span_id, ) - if not 0 < cell_x or not 0 < cell_y: - continue - self.memory_map.insert(value=char_c, row_idx=cell_y, column_idx=cell_x) char_hot_characters.append(( char_c, From d74a43e08615c337903828f4a672f9acff0226e5 Mon Sep 17 00:00:00 2001 From: Krishnasis Mandal Date: Thu, 18 Jan 2024 00:25:45 +0530 Subject: [PATCH 3/6] chore: use or instead of any --- hotpdf/memory_map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hotpdf/memory_map.py b/hotpdf/memory_map.py index cc0a77a..030be38 100644 --- a/hotpdf/memory_map.py +++ b/hotpdf/memory_map.py @@ -103,7 +103,7 @@ def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) - for char in chars: char_bbox = char.attrib["bbox"] char_x0, char_y0, char_x1, char_y1 = map(float, char_bbox.split()) - if any([char_x0 < 0, char_y0 < 0, char_x1 < 0, char_y1 < 0]): + if char_x0 < 0 or char_y0 < 0 or char_x1 < 0 or char_y1 < 0: continue char_c = char.attrib["c"] char_span_id = char.attrib.get("span_id") From 3c6122ab5dbed6fd7b0ad0100b76db1c21779c54 Mon Sep 17 00:00:00 2001 From: Krishnasis Mandal Date: Thu, 18 Jan 2024 00:38:40 +0530 Subject: [PATCH 4/6] chore: remove extra line for del --- hotpdf/memory_map.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hotpdf/memory_map.py b/hotpdf/memory_map.py index 030be38..076c6dd 100644 --- a/hotpdf/memory_map.py +++ b/hotpdf/memory_map.py @@ -78,7 +78,6 @@ def __get_span_chars(self, spans: Generator[ET.Element, None, None], drop_duplic for char in span.iterfind(".//"): char.set("span_id", span_id) yield char - del seen_span_hashes def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) -> None: """ From b47be056b619c70b5b1129863e110c075ffb82a2 Mon Sep 17 00:00:00 2001 From: Krishnasis Mandal Date: Thu, 18 Jan 2024 00:38:57 +0530 Subject: [PATCH 5/6] reduce coverage % for coverall until we fix the bug --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 685ca35..77282b5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -63,6 +63,6 @@ jobs: run: pip install -e '.[dev]' - name: Run tests with coverage run: - python -m pytest --cov=hotpdf -n=auto tests/ --cov-fail-under=98 --cov-report= + python -m pytest --cov=hotpdf -n=auto tests/ --cov-fail-under=95 --cov-report= - name: Upload coverage to coveralls uses: coverallsapp/github-action@v2.2.3 From e8f7d0ce054e1b758369696b94d8723402f01a3e Mon Sep 17 00:00:00 2001 From: Krishnasis Mandal Date: Thu, 18 Jan 2024 00:42:58 +0530 Subject: [PATCH 6/6] revert: reduction of coverage % --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 77282b5..685ca35 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -63,6 +63,6 @@ jobs: run: pip install -e '.[dev]' - name: Run tests with coverage run: - python -m pytest --cov=hotpdf -n=auto tests/ --cov-fail-under=95 --cov-report= + python -m pytest --cov=hotpdf -n=auto tests/ --cov-fail-under=98 --cov-report= - name: Upload coverage to coveralls uses: coverallsapp/github-action@v2.2.3