Skip to content

Commit

Permalink
replace token api str with Secret (#449)
Browse files Browse the repository at this point in the history
* replace token api str with Secret

* PR fixes and adding one more test

* cleaning leftovers
  • Loading branch information
davidsbatista authored Feb 20, 2024
1 parent 3a6cf81 commit 102be5e
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 24 deletions.
8 changes: 3 additions & 5 deletions integrations/unstructured/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -156,15 +156,13 @@ ban-relative-imports = "parents"
"tests/**/*" = ["PLR2004", "S101", "TID252"]

[tool.coverage.run]
source_pkgs = ["src", "tests"]
source = ["haystack_integrations"]
branch = true
parallel = true

[tool.coverage.paths]
unstructured_fileconverter_haystack = ["src/haystack_integrations", "*/unstructured-fileconverter-haystack/src"]
tests = ["tests", "*/unstructured-fileconverter-haystack/tests"]

[tool.coverage.report]
omit = ["*/tests/*", "*/__init__.py"]
show_missing=true
exclude_lines = [
"no cov",
"if __name__ == .__main__.:",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from haystack import Document, component, default_to_dict
from haystack.components.converters.utils import normalize_metadata
from haystack.utils import Secret
from tqdm import tqdm

from unstructured.documents.elements import Element # type: ignore[import]
Expand All @@ -29,7 +30,7 @@ class UnstructuredFileConverter:
def __init__(
self,
api_url: str = UNSTRUCTURED_HOSTED_API_URL,
api_key: Optional[str] = None,
api_key: Optional[Secret] = Secret.from_env_var("UNSTRUCTURED_API_KEY", strict=False), # noqa: B008
document_creation_mode: Literal[
"one-doc-per-file", "one-doc-per-page", "one-doc-per-element"
] = "one-doc-per-file",
Expand Down Expand Up @@ -57,24 +58,23 @@ def __init__(
"""

self.api_url = api_url
self.api_key = api_key
self.document_creation_mode = document_creation_mode
self.unstructured_kwargs = unstructured_kwargs or {}
self.separator = separator
self.progress_bar = progress_bar

is_hosted_api = api_url == UNSTRUCTURED_HOSTED_API_URL

api_key = api_key or os.environ.get("UNSTRUCTURED_API_KEY")
# we check whether api_key is None or an empty string
if is_hosted_api and not api_key:
api_key_value = api_key.resolve_value() if api_key else None
if is_hosted_api and not api_key_value:
msg = (
"To use the hosted version of Unstructured, you need to set the environment variable "
"UNSTRUCTURED_API_KEY (recommended) or explictly pass the parameter api_key."
"UNSTRUCTURED_API_KEY (recommended) or explicitly pass the parameter api_key."
)
raise ValueError(msg)

self.api_key = api_key

def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
Expand All @@ -84,6 +84,7 @@ def to_dict(self) -> Dict[str, Any]:
return default_to_dict(
self,
api_url=self.api_url,
api_key=self.api_key.to_dict() if self.api_key else None,
document_creation_mode=self.document_creation_mode,
separator=self.separator,
unstructured_kwargs=self.unstructured_kwargs,
Expand Down Expand Up @@ -140,8 +141,8 @@ def run(
documents.extend(docs_for_file)
return {"documents": documents}

@staticmethod
def _create_documents(
self,
filepath: Path,
elements: List[Element],
document_creation_mode: Literal["one-doc-per-file", "one-doc-per-page", "one-doc-per-element"],
Expand Down Expand Up @@ -194,7 +195,10 @@ def _partition_file_into_elements(self, filepath: Path) -> List[Element]:
elements = []
try:
elements = partition_via_api(
filename=str(filepath), api_url=self.api_url, api_key=self.api_key, **self.unstructured_kwargs
filename=str(filepath),
api_url=self.api_url,
api_key=self.api_key.resolve_value() if self.api_key else None,
**self.unstructured_kwargs,
)
except Exception as e:
logger.warning(f"Unstructured could not process file {filepath}. Error: {e}")
Expand Down
13 changes: 13 additions & 0 deletions integrations/unstructured/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from pathlib import Path

import pytest


@pytest.fixture
def set_env_variables(monkeypatch):
    """Set the ``UNSTRUCTURED_API_KEY`` environment variable to a dummy value via ``monkeypatch``."""
    monkeypatch.setenv(name="UNSTRUCTURED_API_KEY", value="test-api-key")


@pytest.fixture
def samples_path():
    """Return the path of the ``samples`` directory that sits next to this file."""
    tests_dir = Path(__file__).parent
    return tests_dir.joinpath("samples")
22 changes: 11 additions & 11 deletions integrations/unstructured/tests/test_converter.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,16 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path

import pytest
from haystack_integrations.components.converters.unstructured import UnstructuredFileConverter


@pytest.fixture
def samples_path():
return Path(__file__).parent / "samples"


class TestUnstructuredFileConverter:
@pytest.mark.usefixtures("set_env_variables")
def test_init_default(self):
converter = UnstructuredFileConverter(api_key="test-api-key")
converter = UnstructuredFileConverter()
assert converter.api_url == "https://api.unstructured.io/general/v0/general"
assert converter.api_key == "test-api-key"
assert converter.api_key.resolve_value() == "test-api-key"
assert converter.document_creation_mode == "one-doc-per-file"
assert converter.separator == "\n\n"
assert converter.unstructured_kwargs == {}
Expand All @@ -31,20 +25,26 @@ def test_init_with_parameters(self):
progress_bar=False,
)
assert converter.api_url == "http://custom-url:8000/general"
assert converter.api_key is None
assert converter.api_key.resolve_value() is None
assert converter.document_creation_mode == "one-doc-per-element"
assert converter.separator == "|"
assert converter.unstructured_kwargs == {"foo": "bar"}
assert not converter.progress_bar

def test_init_hosted_without_api_key_raises_error(self, monkeypatch):
    """Constructing the converter against the hosted API URL without an API key raises ``ValueError``."""
    # The converter's default api_key is read from UNSTRUCTURED_API_KEY, so remove
    # the variable first; otherwise the outcome depends on the developer's or CI's
    # ambient environment and the expected error may never be raised.
    monkeypatch.delenv("UNSTRUCTURED_API_KEY", raising=False)
    with pytest.raises(ValueError):
        UnstructuredFileConverter(api_url="https://api.unstructured.io/general/v0/general")

@pytest.mark.usefixtures("set_env_variables")
def test_to_dict(self):
converter = UnstructuredFileConverter(api_key="test-api-key")
converter = UnstructuredFileConverter()
converter_dict = converter.to_dict()

assert converter_dict == {
"type": "haystack_integrations.components.converters.unstructured.converter.UnstructuredFileConverter",
"init_parameters": {
"api_url": "https://api.unstructured.io/general/v0/general",
"api_key": {"env_vars": ["UNSTRUCTURED_API_KEY"], "strict": False, "type": "env_var"},
"document_creation_mode": "one-doc-per-file",
"separator": "\n\n",
"unstructured_kwargs": {},
Expand Down

0 comments on commit 102be5e

Please sign in to comment.