From c75a027ad0818eb87235c10e19cd87dc9812624c Mon Sep 17 00:00:00 2001 From: Krishnasis Mandal Date: Fri, 19 Jan 2024 10:09:01 +0530 Subject: [PATCH 1/4] chore: Remove duplicate functions --- hotpdf/memory_map.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/hotpdf/memory_map.py b/hotpdf/memory_map.py index 076c6dd..7bd0b82 100644 --- a/hotpdf/memory_map.py +++ b/hotpdf/memory_map.py @@ -32,34 +32,6 @@ def build_memory_map(self) -> None: """ self.memory_map = SparseMatrix() - def text(self) -> str: - """ - Get text of the memory map - Returns: - str: Text in the page of the pdf preserving the order of occurence. - """ - memory_map_str = "" - for row in range(self.memory_map.rows): - for col in range(self.memory_map.columns): - memory_map_str += self.memory_map.get(row_idx=row, column_idx=col) - memory_map_str += "\n" - return memory_map_str - - def display_memory_map(self, save: bool = False, filename: str = "memory_map.txt") -> None: - """ - Display or save the memory map. - - Args: - save (bool, optional): Whether to save to a file. Defaults to False. - filename (str, optional): The filename to save the map. Defaults to "memory_map.txt". - """ - memory_map_str = self.text() - if save: - with open(filename, "w", encoding="utf-8") as file: - file.write(memory_map_str) - else: - print(memory_map_str) - def __get_page_spans(self, page: ET.Element) -> Generator[ET.Element, None, None]: return page.iterfind(".//span") From 5941a21e1ec9abb5f8d8a0ceea16b2db1e6eec35 Mon Sep 17 00:00:00 2001 From: Krishnasis Mandal Date: Fri, 19 Jan 2024 10:09:16 +0530 Subject: [PATCH 2/4] refactor: update function signature --- hotpdf/hotpdf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hotpdf/hotpdf.py b/hotpdf/hotpdf.py index 2e7d3cc..2d1f800 100644 --- a/hotpdf/hotpdf.py +++ b/hotpdf/hotpdf.py @@ -258,15 +258,15 @@ def extract_text( ) return extracted_text - def extract_page( + def extract_page_text( self, - page: int = 0, + page: int, ) -> str: """ Extract text from a specified page. Args: - page (int): The page number. Defaults to 0. + page (int): The page number. Raises: ValueError: If the page number is invalid. Returns: From 79443ec76a0ff70ed74c2b1224dfa027e29dace1 Mon Sep 17 00:00:00 2001 From: Krishnasis Mandal Date: Fri, 19 Jan 2024 10:13:59 +0530 Subject: [PATCH 3/4] test: Update tests --- tests/test_functions.py | 11 ----------- tests/test_load.py | 15 ++++----------- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/tests/test_functions.py b/tests/test_functions.py index 361de8f..50c39e5 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -1,4 +1,3 @@ -import os import shutil from unittest.mock import patch @@ -52,16 +51,6 @@ def test_span_map_behaviours(valid_file_name): assert hot_pdf_object.pages[0].span_map.get_span(None) is None -def test_memory_map_behaviour(valid_file_name): - hot_pdf_object = HotPdf() - with pytest.raises(Exception, match="list index out of range"): - hot_pdf_object.pages[0].text() - hot_pdf_object.load(valid_file_name, drop_duplicate_spans=False) - hot_pdf_object.pages[0].display_memory_map(save=True, filename="test.txt") - assert os.path.exists("test.txt") - os.remove("test.txt") - - def test_sparse_matrix_insert_and_get(): matrix = SparseMatrix(3, 3) matrix.insert("A", 0, 0) diff --git a/tests/test_load.py b/tests/test_load.py index f393c47..1bfd90e 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -36,9 +36,9 @@ def test_load(valid_file_name): def test_full_text(valid_file_name): hot_pdf_object = HotPdf() hot_pdf_object.load(valid_file_name) - pages = hot_pdf_object.pages + text_first_page = hot_pdf_object.extract_page_text(page=0) # Not blank extraction - assert len(pages[0].text()) > 1000 + assert len(text_first_page) > 500 def test_pages_length(valid_file_name): @@ -123,8 +123,8 @@ def test_double_loading(valid_file_name): def test_blank_pdf(blank_file_name): hot_pdf_object = HotPdf() hot_pdf_object.load(blank_file_name) - pages = hot_pdf_object.pages - assert all([len(page.text().strip("\n").strip()) == 0 for page in pages]) + len_pages = len(hot_pdf_object.pages) + assert all([len(hot_pdf_object.extract_page_text(page=i).strip("\n").strip()) == 0 for i in range(len_pages)]) def test_row_index_greater_than_rows_of_memory_map(valid_file_name): @@ -172,13 +172,6 @@ def test_extract_invalid_coordinates(valid_file_name, coordinates): hot_pdf_object.extract_text(x0=coordinates[0], y0=coordinates[1], x1=coordinates[2], y1=coordinates[3]) -def test_display_memory_map(valid_file_name): - hot_pdf_object = HotPdf() - hot_pdf_object.load(valid_file_name) - pages = hot_pdf_object.pages - pages[0].display_memory_map(save=False, filename="test.txt") - - def test_get_spans(valid_file_name): INCOMPLETE_WORD = "EXPERIEN" NON_EXISTENT_WORD = "BLAH" From 931a2c023280432b63024a01012bdeed8512eafd Mon Sep 17 00:00:00 2001 From: Krishnasis Mandal Date: Fri, 19 Jan 2024 10:22:33 +0530 Subject: [PATCH 4/4] test: Add test for invalid page number --- tests/test_functions.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_functions.py b/tests/test_functions.py index 50c39e5..376baa0 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -171,3 +171,10 @@ def test_intersect(bbox1, bbox2, expected): ) def test_to_text(hot_characters, expected): assert to_text(hot_characters) == expected + + +def test_invalid_page_number(valid_file_name): + hotpdf_object = HotPdf() + hotpdf_object.load(valid_file_name) + with pytest.raises(ValueError, match="Invalid page number"): + _ = hotpdf_object.extract_page_text(page=2)