Skip to content

Commit

Permalink
Merge pull request #59 from krishnasism/chore/remove-duplicate-functi…
Browse files Browse the repository at this point in the history
…ons-and-refactor-krish19022024
  • Loading branch information
krishnasism authored Jan 19, 2024
2 parents 54ea148 + 931a2c0 commit 75a3b61
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 53 deletions.
6 changes: 3 additions & 3 deletions hotpdf/hotpdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,15 +252,15 @@ def extract_text(
)
return extracted_text

def extract_page(
def extract_page_text(
self,
page: int = 0,
page: int,
) -> str:
"""
Extract text from a specified page.
Args:
page (int): The page number. Defaults to 0.
page (int): The page number.
Raises:
ValueError: If the page number is invalid.
Returns:
Expand Down
28 changes: 0 additions & 28 deletions hotpdf/memory_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,34 +32,6 @@ def build_memory_map(self) -> None:
"""
self.memory_map = SparseMatrix()

def text(self) -> str:
    """
    Get text of the memory map.

    Walks the memory map row by row, concatenating the value stored in
    every cell of a row and terminating each row with a newline.

    Returns:
        str: Text in the page of the pdf preserving the order of occurrence.
    """
    memory_map = self.memory_map
    lines = []
    for row in range(memory_map.rows):
        # Join each row in one pass instead of growing a string with ``+=``.
        cells = (memory_map.get(row_idx=row, column_idx=col) for col in range(memory_map.columns))
        lines.append("".join(cells) + "\n")
    return "".join(lines)

def display_memory_map(self, save: bool = False, filename: str = "memory_map.txt") -> None:
    """
    Print the memory map, or write it to a file.

    Args:
        save (bool, optional): Write to ``filename`` instead of printing. Defaults to False.
        filename (str, optional): Destination file used when ``save`` is true. Defaults to "memory_map.txt".
    """
    rendered = self.text()
    if not save:
        print(rendered)
        return
    with open(filename, "w", encoding="utf-8") as out:
        out.write(rendered)

# Return the iterable produced by page.iterfind for the path ".//span".
# NOTE(review): ET is presumably xml.etree.ElementTree (the import is not visible here), in
# which case the path matches <span> descendants at any depth below `page` — confirm.
def __get_page_spans(self, page: ET.Element) -> Generator[ET.Element, None, None]:
return page.iterfind(".//span")

Expand Down
18 changes: 7 additions & 11 deletions tests/test_functions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import os
import shutil
from unittest.mock import patch

Expand Down Expand Up @@ -52,16 +51,6 @@ def test_span_map_behaviours(valid_file_name):
assert hot_pdf_object.pages[0].span_map.get_span(None) is None


def test_memory_map_behaviour(valid_file_name):
    """Page text access fails before loading; saving the memory map creates the file."""
    hot_pdf_object = HotPdf()
    # Nothing has been loaded yet, so accessing the first page is expected to raise.
    with pytest.raises(Exception, match="list index out of range"):
        hot_pdf_object.pages[0].text()
    hot_pdf_object.load(valid_file_name, drop_duplicate_spans=False)
    try:
        hot_pdf_object.pages[0].display_memory_map(save=True, filename="test.txt")
        assert os.path.exists("test.txt")
    finally:
        # Always clean up the artifact, even when the call or the assertion fails;
        # the existence guard keeps a missing file from masking the real failure.
        if os.path.exists("test.txt"):
            os.remove("test.txt")


def test_sparse_matrix_insert_and_get():
matrix = SparseMatrix(3, 3)
matrix.insert("A", 0, 0)
Expand Down Expand Up @@ -182,3 +171,10 @@ def test_intersect(bbox1, bbox2, expected):
)
def test_to_text(hot_characters, expected):
assert to_text(hot_characters) == expected


def test_invalid_page_number(valid_file_name):
    """Extracting page index 2 from the fixture document must raise ``ValueError``."""
    pdf = HotPdf()
    pdf.load(valid_file_name)
    with pytest.raises(ValueError, match="Invalid page number"):
        pdf.extract_page_text(page=2)
15 changes: 4 additions & 11 deletions tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ def test_load(valid_file_name):
def test_full_text(valid_file_name):
hot_pdf_object = HotPdf()
hot_pdf_object.load(valid_file_name)
pages = hot_pdf_object.pages
text_first_page = hot_pdf_object.extract_page_text(page=0)
# Not blank extraction
assert len(pages[0].text()) > 1000
assert len(text_first_page) > 500


def test_pages_length(valid_file_name):
Expand Down Expand Up @@ -116,8 +116,8 @@ def test_non_existent_file_path(non_existent_file_name):
def test_blank_pdf(blank_file_name):
hot_pdf_object = HotPdf()
hot_pdf_object.load(blank_file_name)
pages = hot_pdf_object.pages
assert all([len(page.text().strip("\n").strip()) == 0 for page in pages])
len_pages = len(hot_pdf_object.pages)
assert all([len(hot_pdf_object.extract_page_text(page=i).strip("\n").strip()) == 0 for i in range(len_pages)])


def test_row_index_greater_than_rows_of_memory_map(valid_file_name):
Expand Down Expand Up @@ -165,13 +165,6 @@ def test_extract_invalid_coordinates(valid_file_name, coordinates):
hot_pdf_object.extract_text(x0=coordinates[0], y0=coordinates[1], x1=coordinates[2], y1=coordinates[3])


def test_display_memory_map(valid_file_name):
    """Smoke test: displaying the first page's memory map without saving completes."""
    pdf = HotPdf()
    pdf.load(valid_file_name)
    first_page = pdf.pages[0]
    first_page.display_memory_map(save=False, filename="test.txt")


def test_get_spans(valid_file_name):
INCOMPLETE_WORD = "EXPERIEN"
NON_EXISTENT_WORD = "BLAH"
Expand Down

0 comments on commit 75a3b61

Please sign in to comment.