From e5eed9116ec3dd7f5cfb5ad45d777719efce1174 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Wed, 17 Jan 2024 09:47:32 +0100 Subject: [PATCH] Mount unstructured in haystack_integrations and refactor the structure of the project (#221) * reorganize integrations * some other changes, including the workflow * fix fmt * retry * fix workflow * try to fix coverage error * fix fmt again * standardize README --- ...red_fileconverter.yml => unstructured.yml} | 12 +-- .../unstructured/{fileconverter => }/LICENSE | 0 integrations/unstructured/README.md | 37 ++++++++ .../unstructured/fileconverter/README.md | 86 ------------------ .../{fileconverter => }/pyproject.toml | 28 +++--- .../converters/unstructured}/__init__.py | 2 +- .../converters/unstructured/converter.py} | 1 + integrations/unstructured/tests/__init__.py | 3 + .../tests/samples/sample_pdf.pdf | Bin .../test_converter.py} | 5 +- 10 files changed, 64 insertions(+), 110 deletions(-) rename .github/workflows/{unstructured_fileconverter.yml => unstructured.yml} (80%) rename integrations/unstructured/{fileconverter => }/LICENSE (100%) create mode 100644 integrations/unstructured/README.md delete mode 100644 integrations/unstructured/fileconverter/README.md rename integrations/unstructured/{fileconverter => }/pyproject.toml (80%) rename integrations/unstructured/{fileconverter/src/unstructured_fileconverter_haystack => src/haystack_integrations/components/converters/unstructured}/__init__.py (63%) rename integrations/unstructured/{fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py => src/haystack_integrations/components/converters/unstructured/converter.py} (99%) create mode 100644 integrations/unstructured/tests/__init__.py rename integrations/unstructured/{fileconverter => }/tests/samples/sample_pdf.pdf (100%) rename integrations/unstructured/{fileconverter/tests/test_fileconverter.py => tests/test_converter.py} (94%) diff --git a/.github/workflows/unstructured_fileconverter.yml 
b/.github/workflows/unstructured.yml similarity index 80% rename from .github/workflows/unstructured_fileconverter.yml rename to .github/workflows/unstructured.yml index ee70510e9..6338b06e8 100644 --- a/.github/workflows/unstructured_fileconverter.yml +++ b/.github/workflows/unstructured.yml @@ -1,17 +1,17 @@ # This workflow comes from https://github.com/ofek/hatch-mypyc # https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml -name: Test / unstructured / fileconverter +name: Test / unstructured on: schedule: - cron: "0 0 * * *" pull_request: paths: - - "integrations/unstructured/fileconverter/**" - - ".github/workflows/unstructured_fileconverter.yml" + - "integrations/unstructured/**" + - ".github/workflows/unstructured.yml" concurrency: - group: unstructured_fileconverter-${{ github.head_ref }} + group: unstructured-${{ github.head_ref }} cancel-in-progress: true env: @@ -50,10 +50,10 @@ jobs: run: pip install --upgrade hatch - name: Lint - working-directory: integrations/unstructured/fileconverter + working-directory: integrations/unstructured if: matrix.python-version == '3.9' run: hatch run lint:all - name: Run tests - working-directory: integrations/unstructured/fileconverter + working-directory: integrations/unstructured run: hatch run cov diff --git a/integrations/unstructured/fileconverter/LICENSE b/integrations/unstructured/LICENSE similarity index 100% rename from integrations/unstructured/fileconverter/LICENSE rename to integrations/unstructured/LICENSE diff --git a/integrations/unstructured/README.md b/integrations/unstructured/README.md new file mode 100644 index 000000000..db74c5306 --- /dev/null +++ b/integrations/unstructured/README.md @@ -0,0 +1,37 @@ +# unstructured-fileconverter-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) +[![PyPI - Python 
Version](https://img.shields.io/pypi/pyversions/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) + +----- + +**Table of Contents** + +- [unstructured-fileconverter-haystack](#unstructured-fileconverter-haystack) + - [Installation](#installation) + - [License](#license) + - [Testing](#testing) + +## Installation + +```console +pip install unstructured-fileconverter-haystack +``` + +## License + +`unstructured-fileconverter-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. + +## Testing + +To run tests, first start a Docker container running the Unstructured API: + +```console +docker run -p 8000:8000 -d --rm --name unstructured-api quay.io/unstructured-io/unstructured-api:latest --port 8000 --host 0.0.0.0 +``` + +Then run tests: + +```console +hatch run test +``` \ No newline at end of file diff --git a/integrations/unstructured/fileconverter/README.md b/integrations/unstructured/fileconverter/README.md deleted file mode 100644 index 274c01c0f..000000000 --- a/integrations/unstructured/fileconverter/README.md +++ /dev/null @@ -1,86 +0,0 @@ -# Unstructured FileConverter for Haystack - - - -Component for the Haystack (2.x) LLM framework to easily convert files and directories into Documents using the Unstructured API. - -**[Unstructured](https://unstructured-io.github.io/unstructured/index.html)** provides a series of tools to do **ETL for LLMs**. This component calls the Unstructured API that simply extracts text and other information from a vast range of file formats. -**[Supported file types](https://unstructured-io.github.io/unstructured/api.html#supported-file-types)**. - -**[Haystack](https://github.com/deepset-ai/haystack)** is an **orchestration framework** to build customizable, production-ready **LLM applications**. 
-Once your files are converted into Documents, you can start building RAG, question answering, semantic search applications and more. - -- [Installation](#installation) -- [Usage](#usage) -- [Configuration](#configuration) - -## Installation - -```bash -pip install unstructured-fileconverter-haystack -``` - -### Hosted API -If you plan to use the hosted version of the Unstructured API, you just need the **(free) Unsctructured API key**. You can get it by signing up [here](https://unstructured.io/api-key). - -### Local API (Docker) -If you want to run your own local instance of the Unstructured API, you need Docker and you can find instructions [here](https://unstructured-io.github.io/unstructured/api.html#using-docker-images). - -In short, this should work: -```bash -docker run -p 8000:8000 -d --rm --name unstructured-api quay.io/unstructured-io/unstructured-api:latest --port 8000 --host 0.0.0.0 -``` - -## Usage - -### In isolation -```python -import os -from unstructured_fileconverter_haystack import UnstructuredFileConverter - -os.environ["UNSTRUCTURED_API_KEY"] = "YOUR-API-KEY" - -converter = UnstructuredFileConverter() - -documents = converter.run(paths = ["a/file/path.pdf", "a/directory/path"])["documents"] - -``` - -### In a Haystack Pipeline -```python -import os -from haystack import Pipeline -from haystack.components.writers import DocumentWriter -from haystack.document_stores import MemoryDocumentStore -from unstructured_fileconverter_haystack import UnstructuredFileConverter - -os.environ["UNSTRUCTURED_API_KEY"] = "YOUR-API-KEY" - -document_store = MemoryDocumentStore() - -indexing = Pipeline() -indexing.add_component("converter", UnstructuredFileConverter()) -indexing.add_component("writer", DocumentWriter(document_store)) -indexing.connect("converter", "writer") - -indexing.run({"converter": {"paths": ["a/file/path.pdf", "a/directory/path"]}}) -``` - -## Configuration - -### Initialization parameters -- `api_url`: URL of the Unstructured API. 
Defaults to the hosted version. If you run the API locally, you should specify this parameter. -- `api_key`: API key for the Unstructured API (https://unstructured.io/#get-api-key). - If you run the API locally, it is not needed. - If you use the hosted version, it defaults to the environment variable UNSTRUCTURED_API_KEY. -- `document_creation_mode`: How to create Haystack Documents from the elements returned by Unstructured. - - `"one-doc-per-file"`: One Haystack Document per file. All elements are concatenated into one text field. - - `"one-doc-per-page"`: One Haystack Document per page. All elements on a page are concatenated into one text field. - - `"one-doc-per-element"`: One Haystack Document per element. Each element is converted to a Haystack Document - - `separator`: Separator between elements when concatenating them into one text field. -- `unstructured_kwargs`: Additional keyword arguments that are passed to the Unstructured API. They can be helpful to improve or speed up the conversion. See https://unstructured-io.github.io/unstructured/api.html#parameters. - -### `run` method -The method `run` just expects a list of paths (files or directories) in the `paths` parameter. - -If `paths` contains a directory, all files in the first level of the directory are converted. Subdirectories are ignored. 
diff --git a/integrations/unstructured/fileconverter/pyproject.toml b/integrations/unstructured/pyproject.toml similarity index 80% rename from integrations/unstructured/fileconverter/pyproject.toml rename to integrations/unstructured/pyproject.toml index 97d3e068c..e199b3c3e 100644 --- a/integrations/unstructured/fileconverter/pyproject.toml +++ b/integrations/unstructured/pyproject.toml @@ -24,23 +24,25 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - # we distribute the preview version of Haystack 2.0 under the package "haystack-ai" "haystack-ai", "unstructured<0.11.4", # FIXME: investigate why 0.11.4 broke the tests ] [project.urls] -Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/unstructured/fileconverter#readme" +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/unstructured#readme" Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" -Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/unstructured/fileconverter" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/unstructured" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] [tool.hatch.version] source = "vcs" -tag-pattern = 'integrations\/unstructured-fileconverter-v(?P<version>.*)' +tag-pattern = 'integrations\/unstructured-v(?P<version>.*)' [tool.hatch.version.raw-options] -root = "../../.." -git_describe_command = 'git describe --tags --match="integrations/unstructured-fileconverter-v[0-9]*"' +root = "../.." 
+git_describe_command = 'git describe --tags --match="integrations/unstructured-v[0-9]*"' [tool.hatch.envs.default] dependencies = [ @@ -71,7 +73,7 @@ dependencies = [ "ruff>=0.0.243", ] [tool.hatch.envs.lint.scripts] -typing = "mypy --install-types --non-interactive {args:src/unstructured_fileconverter_haystack tests}" +typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}" style = [ "ruff {args:.}", "black --check --diff {args:.}", @@ -140,25 +142,22 @@ unfixable = [ ] [tool.ruff.isort] -known-first-party = ["unstructured_fileconverter_haystack"] +known-first-party = ["src"] [tool.ruff.flake8-tidy-imports] -ban-relative-imports = "all" +ban-relative-imports = "parents" [tool.ruff.per-file-ignores] # Tests can use magic values, assertions, and relative imports "tests/**/*" = ["PLR2004", "S101", "TID252"] [tool.coverage.run] -source_pkgs = ["unstructured_fileconverter_haystack", "tests"] +source_pkgs = ["src", "tests"] branch = true parallel = true -omit = [ - "src/unstructured_fileconverter/__about__.py", -] [tool.coverage.paths] -unstructured_fileconverter_haystack = ["src/unstructured_fileconverter_haystack", "*/unstructured-fileconverter-haystack/src/unstructured_fileconverter_haystack"] +unstructured_fileconverter_haystack = ["src/haystack_integrations", "*/unstructured-fileconverter-haystack/src"] tests = ["tests", "*/unstructured-fileconverter-haystack/tests"] [tool.coverage.report] @@ -178,6 +177,7 @@ markers = [ [[tool.mypy.overrides]] module = [ "haystack.*", + "haystack_integrations.*", "pytest.*" ] ignore_missing_imports = true diff --git a/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/__init__.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/__init__.py similarity index 63% rename from integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/__init__.py rename to 
integrations/unstructured/src/haystack_integrations/components/converters/unstructured/__init__.py index bcce95bea..26f14134b 100644 --- a/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/__init__.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/__init__.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -from unstructured_fileconverter_haystack.fileconverter import UnstructuredFileConverter +from .converter import UnstructuredFileConverter __all__ = ["UnstructuredFileConverter"] diff --git a/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py similarity index 99% rename from integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py rename to integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index d94cb49c4..92348e6cd 100644 --- a/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -9,6 +9,7 @@ from haystack import Document, component, default_to_dict from tqdm import tqdm + from unstructured.documents.elements import Element # type: ignore[import] from unstructured.partition.api import partition_via_api # type: ignore[import] diff --git a/integrations/unstructured/tests/__init__.py b/integrations/unstructured/tests/__init__.py new file mode 100644 index 000000000..e873bc332 --- /dev/null +++ b/integrations/unstructured/tests/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/unstructured/fileconverter/tests/samples/sample_pdf.pdf 
b/integrations/unstructured/tests/samples/sample_pdf.pdf similarity index 100% rename from integrations/unstructured/fileconverter/tests/samples/sample_pdf.pdf rename to integrations/unstructured/tests/samples/sample_pdf.pdf diff --git a/integrations/unstructured/fileconverter/tests/test_fileconverter.py b/integrations/unstructured/tests/test_converter.py similarity index 94% rename from integrations/unstructured/fileconverter/tests/test_fileconverter.py rename to integrations/unstructured/tests/test_converter.py index a9c724cba..b0473df25 100644 --- a/integrations/unstructured/fileconverter/tests/test_fileconverter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -4,8 +4,7 @@ from pathlib import Path import pytest - -from unstructured_fileconverter_haystack import UnstructuredFileConverter +from haystack_integrations.components.converters.unstructured import UnstructuredFileConverter @pytest.fixture @@ -43,7 +42,7 @@ def test_to_dict(self): converter_dict = converter.to_dict() assert converter_dict == { - "type": "unstructured_fileconverter_haystack.fileconverter.UnstructuredFileConverter", + "type": "haystack_integrations.components.converters.unstructured.converter.UnstructuredFileConverter", "init_parameters": { "api_url": "https://api.unstructured.io/general/v0/general", "document_creation_mode": "one-doc-per-file",