Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

URL in post support and test case #13

Merged
merged 4 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions src/unstract/llmwhisperer/client_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,13 +151,13 @@ def whisper(
file_path: str = "",
stream: IO[bytes] = None,
url: str = "",
mode: str = "high_quality",
mode: str = "form",
output_mode: str = "layout_preserving",
page_seperator: str = "<<<",
pages_to_extract: str = "",
median_filter_size: int = 0,
gaussian_blur_radius: int = 0,
line_splitter_tolerance: float = 0.75,
line_splitter_tolerance: float = 0.4,
horizontal_stretch_factor: float = 1.0,
mark_vertical_lines: bool = False,
mark_horizontal_lines: bool = False,
Expand All @@ -178,7 +178,7 @@ def whisper(
file_path (str, optional): The path to the file to be processed. Defaults to "".
stream (IO[bytes], optional): A stream of bytes to be processed. Defaults to None.
url (str, optional): The URL of the file to be processed. Defaults to "".
mode (str, optional): The processing mode. Can be "high_quality", "form", "low_cost" or "native_text". Defaults to "high_quality".
mode (str, optional): The processing mode. Can be "high_quality", "form", "low_cost" or "native_text". Defaults to "form".
output_mode (str, optional): The output mode. Can be "layout_preserving" or "text". Defaults to "layout_preserving".
page_seperator (str, optional): The page separator. Defaults to "<<<".
pages_to_extract (str, optional): The pages to extract. Defaults to "".
Expand Down Expand Up @@ -207,7 +207,6 @@ def whisper(
self.logger.debug("whisper called")
api_url = f"{self.base_url}/whisper"
params = {
"url": url,
"mode": mode,
"output_mode": output_mode,
"page_seperator": page_seperator,
Expand Down Expand Up @@ -272,7 +271,8 @@ def generate():
data=data,
)
else:
req = requests.Request("POST", api_url, params=params, headers=self.headers)
params["url_in_post"] = True
req = requests.Request("POST", api_url, params=params, headers=self.headers, data=url)
prepared = req.prepare()
s = requests.Session()
response = s.send(prepared, timeout=120, stream=should_stream)
Expand Down Expand Up @@ -340,7 +340,7 @@ def generate():
return message

# Will not reach here if status code is 202
message = response.text
message = json.loads(response.text)
nagesh-zip marked this conversation as resolved.
Show resolved Hide resolved
message["status_code"] = response.status_code
return message

Expand Down
3 changes: 3 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import os

import pytest
from dotenv import load_dotenv

from unstract.llmwhisperer.client import LLMWhispererClient
from unstract.llmwhisperer.client_v2 import LLMWhispererClientV2

load_dotenv()
chandrasekharan-zipstack marked this conversation as resolved.
Show resolved Hide resolved


@pytest.fixture(name="client")
def llm_whisperer_client():
Expand Down
63 changes: 59 additions & 4 deletions tests/integration/client_v2_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,52 @@ def test_whisper_v2(client_v2, data_dir, output_mode, mode, input_file):

exp_basename = f"{Path(input_file).stem}.{mode}.{output_mode}.txt"
exp_file = os.path.join(data_dir, "expected", exp_basename)
with open(exp_file, encoding="utf-8") as f:
# verify extracted text
do_fuzzy_assertion_with_extracted_text(exp_file, whisper_result, mode, output_mode)


@pytest.mark.parametrize(
    "output_mode, mode, url, input_file, page_count",
    [
        (
            "layout_preserving",
            "native_text",
            "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf",
            "credit_card.pdf",
            7,
        ),
        (
            "layout_preserving",
            "low_cost",
            "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf",
            "credit_card.pdf",
            7,
        ),
        (
            "layout_preserving",
            "high_quality",
            "https://unstractpocstorage.blob.core.windows.net/public/scanned_bill.pdf",
            "restaurant_invoice_photo.pdf",
            1,
        ),
        (
            "layout_preserving",
            "form",
            "https://unstractpocstorage.blob.core.windows.net/public/scanned_form.pdf",
            "handwritten-form.pdf",
            1,
        ),
    ],
)
def test_whisper_v2_url_in_post(client_v2, data_dir, output_mode, mode, url, input_file, page_count):
    """Extract a document referenced by URL and check both text and usage.

    For each parametrized case the test:
      1. snapshots usage via ``client_v2.get_usage_info()``,
      2. calls ``client_v2.whisper`` with ``url`` and waits for completion,
      3. compares the result against the expected-text file derived from
         ``input_file``, ``mode`` and ``output_mode`` under
         ``<data_dir>/expected``,
      4. snapshots usage again and checks the counters via ``verify_usage``.

    ``input_file`` is only used to derive the expected-output file name and
    for logging; the document itself is fetched from ``url``.
    """
    usage_before = client_v2.get_usage_info()
    whisper_result = client_v2.whisper(
        mode=mode,
        output_mode=output_mode,
        url=url,
        wait_for_completion=True,
    )
    # Lazy %-style args: the (potentially large) result is only formatted
    # when debug logging is enabled. Quotes around the file name are balanced.
    logger.debug(
        "Result for '%s', '%s', '%s': %s",
        output_mode,
        mode,
        input_file,
        whisper_result,
    )

    exp_basename = f"{Path(input_file).stem}.{mode}.{output_mode}.txt"
    exp_file = os.path.join(data_dir, "expected", exp_basename)
    # Verify extracted text
    do_fuzzy_assertion_with_extracted_text(exp_file, whisper_result, mode, output_mode)

    usage_after = client_v2.get_usage_info()
    # Verify usage after extraction
    verify_usage(usage_before, usage_after, page_count, mode)


def do_fuzzy_assertion_with_extracted_text(file_path, whisper_result, mode=None, output_mode=None):
nagesh-zip marked this conversation as resolved.
Show resolved Hide resolved
with open(file_path, encoding="utf-8") as f:
exp = f.read()

assert isinstance(whisper_result, dict)
assert whisper_result["status_code"] == 200

# For text based processing, perform a strict match
if mode == "native_text" and output_mode == "text":
assert whisper_result["extraction"]["result_text"] == exp
if mode and output_mode:
nagesh-zip marked this conversation as resolved.
Show resolved Hide resolved
# For text based processing, perform a strict match
if mode == "native_text" and output_mode == "text":
assert whisper_result["extraction"]["result_text"] == exp
# For OCR based processing, perform a fuzzy match
else:
extracted_text = whisper_result["extraction"]["result_text"]
Expand All @@ -69,3 +106,21 @@ def test_whisper_v2(client_v2, data_dir, output_mode, mode, input_file):
unified_diff(exp.splitlines(), extracted_text.splitlines(), fromfile="Expected", tofile="Extracted")
)
pytest.fail(f"Texts are not similar enough: {similarity * 100:.2f}% similarity. Diff:\n{diff}")


def verify_usage(before_extract, after_extract, page_count, mode='form'):
    """Assert that usage counters grew by exactly ``page_count`` for ``mode``.

    Both snapshots are mappings keyed by counter name (the calling test
    obtains them from ``client_v2.get_usage_info()``). The keys read here are
    ``today_page_count``, ``current_page_count``, ``overage_page_count`` and
    ``current_page_count_<mode>`` for each known mode.

    Args:
        before_extract: Usage snapshot taken before the extraction.
        after_extract: Usage snapshot taken after the extraction.
        page_count: Number of pages the extraction is expected to have used.
        mode: Processing mode used for the extraction; one of ``form``,
            ``high_quality``, ``low_cost`` or ``native_text``.

    Raises:
        ValueError: If ``mode`` is not one of the known modes.
        AssertionError: If any counter does not match the expected value.
    """
    all_modes = ('form', 'high_quality', 'low_cost', 'native_text')
    if mode not in all_modes:
        raise ValueError(f"Unknown mode {mode!r}; expected one of {all_modes}")

    assert (after_extract['today_page_count'] == before_extract['today_page_count'] + page_count), \
        "today_page_count calculation is wrong"
    assert (after_extract['current_page_count'] == before_extract['current_page_count'] + page_count), \
        "current_page_count calculation is wrong"

    # Overage is only checked when the "after" snapshot reports any.
    # NOTE(review): if the quota is crossed partway through this extraction,
    # overage may grow by less than page_count -- confirm against the
    # service's accounting rules.
    if after_extract['overage_page_count'] > 0:
        assert (after_extract['overage_page_count'] == before_extract['overage_page_count'] + page_count), \
            "overage_page_count calculation is wrong"

    assert (after_extract[f'current_page_count_{mode}'] == before_extract[f'current_page_count_{mode}'] + page_count), \
        f"{mode} mode calculation is wrong"

    # Counters of every other mode must be untouched by this extraction.
    for other_mode in all_modes:
        if other_mode == mode:
            continue
        assert (after_extract[f'current_page_count_{other_mode}'] ==
                before_extract[f'current_page_count_{other_mode}']), \
            f"{other_mode} mode calculation is wrong"
Loading