Skip to content

Commit

Permalink
Remove reference to tests file in prod code (#55)
Browse files Browse the repository at this point in the history
  • Loading branch information
callegarimattia authored Jan 18, 2024
1 parent fe3ba9c commit 5f5c9dd
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 29 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ jobs:
run: pip install -e '.[dev]'
- name: Run tests with coverage
run:
python -m pytest --cov=hotpdf -n=auto tests/ --cov-fail-under=98 --cov-report=
python -m pytest --cov -n=auto tests/ --cov-fail-under=98 --cov-report=
- name: Upload coverage to coveralls
if: github.event_name == 'push'
uses: coverallsapp/[email protected]
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,6 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.vscode/

# MACOS
.DS_Store
15 changes: 5 additions & 10 deletions hotpdf/hotpdf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import logging
import math
import os
import warnings
Expand Down Expand Up @@ -27,16 +26,11 @@ def __init__(
"""
self.pages: list[MemoryMap] = []
self.extraction_tolerance: int = extraction_tolerance
self.xml_file_path: str
self.xml_file_path: Optional[str] = None

# Finalizer: tries to remove the file at self.xml_file_path, except when that
# path contains "tests/resources/xml".
# NOTE(review): indentation is not visible in this view, so whether the info
# log below sits inside the `if` or after it cannot be confirmed here.
def __del__(self) -> None:
try:
# Paths containing "tests/resources/xml" are not removed.
if "tests/resources/xml" not in self.xml_file_path:
os.remove(self.xml_file_path)
logging.info("[hotpdf] Deleted")
# Exceptions raised in the try body are logged at error level and not re-raised.
except Exception as e:
logging.error("[hotpdf] Unable to delete xml_file")
logging.error(str(e))
def __delete_xml_file(self) -> None:
    """Remove the file recorded in ``self.xml_file_path``, if any.

    Does nothing when ``self.xml_file_path`` is ``None`` or empty, or when
    the file no longer exists.

    Raises:
        OSError: If the path exists but cannot be removed (for example
            because of permissions, or because it is a directory).
    """
    if not self.xml_file_path:
        return
    try:
        # Attempt the removal directly instead of checking existence first:
        # a separate exists() check can go stale before remove() runs.
        os.remove(self.xml_file_path)
    except FileNotFoundError:
        # Already gone, so there is nothing left to clean up.
        pass

def __check_file_exists(self, pdf_file: str) -> None:
if not os.path.exists(pdf_file):
Expand Down Expand Up @@ -99,6 +93,7 @@ def load(
self.pages.append(parsed_page)
element.clear()
root.clear()
self.__delete_xml_file()

def __extract_full_text_span(
self,
Expand Down
19 changes: 13 additions & 6 deletions tests/test_functions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import shutil
from unittest.mock import patch

import pytest
Expand All @@ -19,6 +20,11 @@ def mock_hotpdf_bank_file_name():
return "tests/resources/hotpdf_bank.pdf"


def xml_copy_file_name(xml_file_name: str) -> str:
    """Copy an XML file and return the path of the copy.

    The copy is written to ``f"{xml_file_name}_copy.xml"``, i.e. the suffix
    is appended to the full original path, extension included.

    Args:
        xml_file_name: Path of the existing XML file to duplicate.

    Returns:
        The path of the newly written copy.
    """
    # Build the destination once so the copied path and the returned path
    # can never drift apart.
    copy_path = f"{xml_file_name}_copy.xml"
    shutil.copy(xml_file_name, copy_path)
    return copy_path


def test_load_file(valid_file_name):
hot_pdf_object = HotPdf()
hot_pdf_object.load(valid_file_name)
Expand Down Expand Up @@ -98,24 +104,25 @@ def test_sparse_matrix_iterator():
assert non_empty_values == expected_result


@patch("hotpdf.processor.generate_xml_file", return_value="tests/resources/xml/hotpdf_bank_dup_span.xml", autospec=True)
def test_duplicate_spans_not_removed(_, mock_hotpdf_bank_file_name):
def test_duplicate_spans_not_removed(mock_hotpdf_bank_file_name):
hot_pdf_object = HotPdf()
hot_pdf_object_with_dup_span = HotPdf()
hot_pdf_object_with_dup_span.load(mock_hotpdf_bank_file_name, drop_duplicate_spans=False)
hot_pdf_object.load(mock_hotpdf_bank_file_name)
with patch("hotpdf.processor.generate_xml_file", return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_dup_span.xml")):
hot_pdf_object_with_dup_span.load(mock_hotpdf_bank_file_name, drop_duplicate_spans=False)
with patch("hotpdf.processor.generate_xml_file", return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_dup_span.xml")):
hot_pdf_object.load(mock_hotpdf_bank_file_name)

assert len(hot_pdf_object.pages[0].span_map) < len(hot_pdf_object_with_dup_span.pages[0].span_map)


def test_load_negative_coordinates(mock_hotpdf_bank_file_name):
QUERY = "HOTPDF BANK"
with patch("hotpdf.processor.generate_xml_file", return_value="tests/resources/xml/hotpdf_bank_negative_coords.xml"):
with patch("hotpdf.processor.generate_xml_file", return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_negative_coords.xml")):
hot_pdf_object = HotPdf()
hot_pdf_object.load(mock_hotpdf_bank_file_name)
assert not hot_pdf_object.find_text(QUERY)[0], "Expected string to be empty"
# For sanity: the following file is the same as above, except the coords are normal
with patch("hotpdf.processor.generate_xml_file", return_value="tests/resources/xml/hotpdf_bank_normal_coords.xml"):
with patch("hotpdf.processor.generate_xml_file", return_value=xml_copy_file_name("tests/resources/xml/hotpdf_bank_normal_coords.xml")):
hot_pdf_object_normal = HotPdf()
hot_pdf_object_normal.load(mock_hotpdf_bank_file_name)
assert hot_pdf_object_normal.find_text(QUERY)[0], "Expected string to be not empty"
Expand Down
13 changes: 1 addition & 12 deletions tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,18 +224,7 @@ def test_get_spans(valid_file_name):

# Test Non Existent Word
occurences = hot_pdf_object.find_text(NON_EXISTENT_WORD)
for _, page_num in enumerate(occurences):
occurences_by_page = occurences[page_num]
for occurence_by_page in occurences_by_page:
element_dimension = get_element_dimension(occurence_by_page)
full_spans_in_bbox = hot_pdf_object.extract_spans(
x0=element_dimension.x0,
y0=element_dimension.y0,
x1=element_dimension.x1,
y1=element_dimension.y1,
)
# Assert empty list returned
assert full_spans_in_bbox == []
assert occurences == {0: []}


@pytest.mark.parametrize("first_page, last_page", [(1, 1), (1, 2)])
Expand Down

0 comments on commit 5f5c9dd

Please sign in to comment.