Skip to content

Commit

Permalink
Merge pull request #54 from krishnasism/optimisations/small-optimisat…
Browse files Browse the repository at this point in the history
…ions-refactor-krish-18012024

Small optimisations - rewrote some lines
  • Loading branch information
krishnasism authored Jan 17, 2024
2 parents 8ca4d48 + e8f7d0c commit fe3ba9c
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 28 deletions.
41 changes: 16 additions & 25 deletions hotpdf/memory_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ def __get_span_chars(self, spans: Generator[ET.Element, None, None], drop_duplic
for char in span.iterfind(".//"):
char.set("span_id", span_id)
yield char
del seen_span_hashes

def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) -> None:
"""
Expand All @@ -102,7 +101,9 @@ def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) -
chars = self.__get_page_chars(page)
for char in chars:
char_bbox = char.attrib["bbox"]
char_x0, char_y0, char_x1, _ = map(float, char_bbox.split())
char_x0, char_y0, char_x1, char_y1 = map(float, char_bbox.split())
if char_x0 < 0 or char_y0 < 0 or char_x1 < 0 or char_y1 < 0:
continue
char_c = char.attrib["c"]
char_span_id = char.attrib.get("span_id")
cell_x = math.floor(char_x0)
Expand All @@ -115,9 +116,6 @@ def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) -
x_end=cell_x_end,
span_id=char_span_id,
)
if not 0 < cell_x or not 0 < cell_y:
continue

self.memory_map.insert(value=char_c, row_idx=cell_y, column_idx=cell_x)
char_hot_characters.append((
char_c,
Expand All @@ -133,34 +131,27 @@ def load_memory_map(self, page: ET.Element, drop_duplicate_spans: bool = True) -
self.height = self.memory_map.rows

@lru_cache
def extract_text_from_bbox(self, x0: float, x1: float, y0: float, y1: float) -> str:
def extract_text_from_bbox(self, x0: int, x1: int, y0: int, y1: int) -> str:
"""
Extract text within a specified bounding box.
Args:
x0 (float): Left x-coordinate of the bounding box.
x1 (float): Right x-coordinate of the bounding box.
y0 (float): Bottom y-coordinate of the bounding box.
y1 (float): Top y-coordinate of the bounding box.
x0 (int): Left x-coordinate of the bounding box.
x1 (int): Right x-coordinate of the bounding box.
y0 (int): Bottom y-coordinate of the bounding box.
y1 (int): Top y-coordinate of the bounding box.
Returns:
str: Extracted text within the bounding box.
"""
cell_x0 = math.floor(x0)
cell_x1 = math.ceil(x1)
cell_y0 = math.floor(y0)
cell_y1 = math.ceil(y1)

extracted_text = ""
for row in range(cell_y0, cell_y1 + 1):
if 0 <= row < self.memory_map.rows:
row_text = ""
for col in range(cell_x0, cell_x1 + 1):
if 0 <= col < self.memory_map.columns:
row_text += self.memory_map.get(row_idx=row, column_idx=col)
if row_text:
extracted_text += row_text
extracted_text += "\n"
extracted_text: str = ""
for row in range(max(y0, 0), min(y1, self.memory_map.rows - 1) + 1):
row_text: str = ""
row_text = "".join([
self.memory_map.get(row_idx=row, column_idx=col) for col in range(max(x0, 0), min(x1, self.memory_map.columns - 1) + 1)
])
if row_text:
extracted_text += row_text + "\n"

return extracted_text

Expand Down
6 changes: 3 additions & 3 deletions hotpdf/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import math
from copy import deepcopy
from typing import Union

from .data.classes import ElementDimension, HotCharacter, PageResult
Expand Down Expand Up @@ -60,7 +59,7 @@ def filter_adjacent_coords(text: list[str], page_hot_character_occurences: PageR
for anchor_hot_character in anchor_hot_character_instances:
neighbours = [anchor_hot_character]
reference_hot_character = anchor_hot_character
for _, coords_j in enumerate(page_hot_character_occurences[1:]):
for coords_j in page_hot_character_occurences[1:]:
neighbour_hot_character = find_neighbour_coord(
reference_character=reference_hot_character,
hot_characters=coords_j,
Expand All @@ -69,7 +68,8 @@ def filter_adjacent_coords(text: list[str], page_hot_character_occurences: PageR
neighbours.append(neighbour_hot_character)
reference_hot_character = neighbour_hot_character
if len(neighbours) == max_len:
adjacent_groups.append(deepcopy(neighbours))
adjacent_groups.append(neighbours[:])
neighbours.clear()
neighbours = []
return adjacent_groups

Expand Down

0 comments on commit fe3ba9c

Please sign in to comment.