Skip to content

Commit

Permalink
Mount unstructured in haystack_integrations and refactor the structur…
Browse files Browse the repository at this point in the history
…e of the project (#221)

* reorganize integrations

* some other changes, including the workflow

* fix fmt

* retry

* fix workflow

* try to fix coverage error

* fix fmt again

* standardize README
  • Loading branch information
anakin87 authored Jan 17, 2024
1 parent e98a754 commit e5eed91
Show file tree
Hide file tree
Showing 10 changed files with 64 additions and 110 deletions.
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
# This workflow comes from https://github.com/ofek/hatch-mypyc
# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
name: Test / unstructured / fileconverter
name: Test / unstructured

on:
schedule:
- cron: "0 0 * * *"
pull_request:
paths:
- "integrations/unstructured/fileconverter/**"
- ".github/workflows/unstructured_fileconverter.yml"
- "integrations/unstructured/**"
- ".github/workflows/unstructured.yml"

concurrency:
group: unstructured_fileconverter-${{ github.head_ref }}
group: unstructured-${{ github.head_ref }}
cancel-in-progress: true

env:
Expand Down Expand Up @@ -50,10 +50,10 @@ jobs:
run: pip install --upgrade hatch

- name: Lint
working-directory: integrations/unstructured/fileconverter
working-directory: integrations/unstructured
if: matrix.python-version == '3.9'
run: hatch run lint:all

- name: Run tests
working-directory: integrations/unstructured/fileconverter
working-directory: integrations/unstructured
run: hatch run cov
File renamed without changes.
37 changes: 37 additions & 0 deletions integrations/unstructured/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# unstructured-fileconverter-haystack

[![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack)

-----

**Table of Contents**

- [unstructured-fileconverter-haystack](#unstructured-fileconverter-haystack)
- [Installation](#installation)
- [License](#license)
- [Testing](#testing)

## Installation

```console
pip install unstructured-fileconverter-haystack
```

## License

`unstructured-fileconverter-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.

## Testing

To run tests, first start a Docker container running the Unstructured API:

```console
docker run -p 8000:8000 -d --rm --name unstructured-api quay.io/unstructured-io/unstructured-api:latest --port 8000 --host 0.0.0.0
```

Then run tests:

```console
hatch run test
```
86 changes: 0 additions & 86 deletions integrations/unstructured/fileconverter/README.md

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -24,23 +24,25 @@ classifiers = [
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
# we distribute the preview version of Haystack 2.0 under the package "haystack-ai"
"haystack-ai",
"unstructured<0.11.4", # FIXME: investigate why 0.11.4 broke the tests
]

[project.urls]
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/unstructured/fileconverter#readme"
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/unstructured#readme"
Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues"
Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/unstructured/fileconverter"
Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/unstructured"

[tool.hatch.build.targets.wheel]
packages = ["src/haystack_integrations"]

[tool.hatch.version]
source = "vcs"
tag-pattern = 'integrations\/unstructured-fileconverter-v(?P<version>.*)'
tag-pattern = 'integrations\/unstructured-v(?P<version>.*)'

[tool.hatch.version.raw-options]
root = "../../.."
git_describe_command = 'git describe --tags --match="integrations/unstructured-fileconverter-v[0-9]*"'
root = "../.."
git_describe_command = 'git describe --tags --match="integrations/unstructured-v[0-9]*"'

[tool.hatch.envs.default]
dependencies = [
Expand Down Expand Up @@ -71,7 +73,7 @@ dependencies = [
"ruff>=0.0.243",
]
[tool.hatch.envs.lint.scripts]
typing = "mypy --install-types --non-interactive {args:src/unstructured_fileconverter_haystack tests}"
typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
style = [
"ruff {args:.}",
"black --check --diff {args:.}",
Expand Down Expand Up @@ -140,25 +142,22 @@ unfixable = [
]

[tool.ruff.isort]
known-first-party = ["unstructured_fileconverter_haystack"]
known-first-party = ["src"]

[tool.ruff.flake8-tidy-imports]
ban-relative-imports = "all"
ban-relative-imports = "parents"

[tool.ruff.per-file-ignores]
# Tests can use magic values, assertions, and relative imports
"tests/**/*" = ["PLR2004", "S101", "TID252"]

[tool.coverage.run]
source_pkgs = ["unstructured_fileconverter_haystack", "tests"]
source_pkgs = ["src", "tests"]
branch = true
parallel = true
omit = [
"src/unstructured_fileconverter/__about__.py",
]

[tool.coverage.paths]
unstructured_fileconverter_haystack = ["src/unstructured_fileconverter_haystack", "*/unstructured-fileconverter-haystack/src/unstructured_fileconverter_haystack"]
unstructured_fileconverter_haystack = ["src/haystack_integrations", "*/unstructured-fileconverter-haystack/src"]
tests = ["tests", "*/unstructured-fileconverter-haystack/tests"]

[tool.coverage.report]
Expand All @@ -178,6 +177,7 @@ markers = [
[[tool.mypy.overrides]]
module = [
"haystack.*",
"haystack_integrations.*",
"pytest.*"
]
ignore_missing_imports = true
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from unstructured_fileconverter_haystack.fileconverter import UnstructuredFileConverter
from .converter import UnstructuredFileConverter

__all__ = ["UnstructuredFileConverter"]
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from haystack import Document, component, default_to_dict
from tqdm import tqdm

from unstructured.documents.elements import Element # type: ignore[import]
from unstructured.partition.api import partition_via_api # type: ignore[import]

Expand Down
3 changes: 3 additions & 0 deletions integrations/unstructured/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
from pathlib import Path

import pytest

from unstructured_fileconverter_haystack import UnstructuredFileConverter
from haystack_integrations.components.converters.unstructured import UnstructuredFileConverter


@pytest.fixture
Expand Down Expand Up @@ -43,7 +42,7 @@ def test_to_dict(self):
converter_dict = converter.to_dict()

assert converter_dict == {
"type": "unstructured_fileconverter_haystack.fileconverter.UnstructuredFileConverter",
"type": "haystack_integrations.components.converters.unstructured.converter.UnstructuredFileConverter",
"init_parameters": {
"api_url": "https://api.unstructured.io/general/v0/general",
"document_creation_mode": "one-doc-per-file",
Expand Down

0 comments on commit e5eed91

Please sign in to comment.