diff --git a/.gitignore b/.gitignore index 68bc17f9..d7a0b168 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ +# Local run files +qa.db +**/qa.db +**/*qa*.db +**/test-reports + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -20,6 +26,7 @@ parts/ sdist/ var/ wheels/ +pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg @@ -49,7 +56,6 @@ coverage.xml *.py,cover .hypothesis/ .pytest_cache/ -cover/ # Translations *.mo @@ -68,11 +74,10 @@ instance/ # Scrapy stuff: .scrapy -# Sphinx documentation -docs/_build/ +# documentation +docs/pydoc/temp/ # PyBuilder -.pybuilder/ target/ # Jupyter Notebook @@ -83,9 +88,7 @@ profile_default/ ipython_config.py # pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version +.python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. @@ -94,22 +97,7 @@ ipython_config.py # install all needed dependencies. #Pipfile.lock -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +# pyflow __pypackages__/ # Celery stuff @@ -146,15 +134,17 @@ dmypy.json # Pyre type checker .pyre/ -# pytype static type analyzer -.pytype/ +# PyCharm +.idea -# Cython debug symbols -cython_debug/ +# VSCode +.vscode -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ +# macOS +.DS_Store + +# http cache (requests-cache) +**/http_cache.sqlite + +# ruff +.ruff_cache diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..1d7370e7 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,26 @@ +# Security Policy + +## Report a Vulnerability + +If you find a security vulnerability in Haystack, send a message to +[security@deepset.ai](mailto:security@deepset.ai). + +In your message, please include: + +1. Reproducible steps to trigger the vulnerability. +2. An explanation of what makes you think there is a vulnerability. +3. Any information you may have on active exploitations of the vulnerability (zero-day). + +## Vulnerability Response + +We'll review your report within 5 business days and we will do a preliminary analysis +to confirm that the vulnerability is plausible. Otherwise, we'll decline the report. + +We won't disclose any information you share with us but we'll use it to get the issue +fixed or to coordinate a vendor response, as needed. + +We'll keep you updated on the status of the issue. + +Our goal is to disclose bugs as soon as possible once a user mitigation is available. +Once we get a good understanding of the vulnerability, we'll set a disclosure date after +consulting the author of the report and Haystack maintainers. 
diff --git a/VERSION.txt b/VERSION.txt new file mode 100644 index 00000000..77d6f4ca --- /dev/null +++ b/VERSION.txt @@ -0,0 +1 @@ +0.0.0 diff --git a/haystack-experimental/__init__.py b/haystack-experimental/__init__.py new file mode 100644 index 00000000..c1764a6e --- /dev/null +++ b/haystack-experimental/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/haystack-experimental/components/__init__.py b/haystack-experimental/components/__init__.py new file mode 100644 index 00000000..c1764a6e --- /dev/null +++ b/haystack-experimental/components/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/haystack-experimental/evaluation/__init__.py b/haystack-experimental/evaluation/__init__.py new file mode 100644 index 00000000..c1764a6e --- /dev/null +++ b/haystack-experimental/evaluation/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/haystack-experimental/version.py b/haystack-experimental/version.py new file mode 100644 index 00000000..8b18fc69 --- /dev/null +++ b/haystack-experimental/version.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from importlib import metadata + +try: + __version__ = str(metadata.version("haystack-experimental")) +except metadata.PackageNotFoundError: + __version__ = "main" diff --git a/license-header.txt b/license-header.txt new file mode 100644 index 00000000..ec467322 --- /dev/null +++ b/license-header.txt @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: ${inceptionYear}-present ${copyrightOwner} + +SPDX-License-Identifier: Apache-2.0 diff --git a/licenserc.toml b/licenserc.toml new file mode 100644 index 00000000..eb0e8100 --- /dev/null +++ b/licenserc.toml @@ -0,0 +1,23 @@ +headerPath = "license-header.txt" + +excludes = [ 
+ ".github", + "docker", + "docs", + "examples", + "proposals", + "releasenotes", + "test", + "CITATION.cff", + "*.ini", + "*.jinja2", + "*.md", + "*.toml", + "*.txt", + "*.yaml", + "*.wav", +] + +[properties] +inceptionYear = 2022 +copyrightOwner = "deepset GmbH " diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..722e12f4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,272 @@ +[build-system] +requires = ["hatchling>=1.8.0"] +build-backend = "hatchling.build" + +[project] +name = "haystack-experimental" +dynamic = ["version"] +# The version is read from VERSION.txt via [tool.hatch.version]; it must not also be set statically here. +description = "Experimental components and features for the Haystack LLM framework." +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.8" +authors = [{ name = "deepset.ai", email = "malte.pietsch@deepset.ai" }] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "License :: Freely Distributable", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] +dependencies = ["haystack-ai"] + +[tool.hatch.envs.default] +dependencies = [ + "pre-commit", + # Type check + "mypy", + # Test + "pytest", + "pytest-cov", + "pytest-custom_exit_code", # used in the CI + "pytest-asyncio", + "pytest-rerunfailures", + "responses", + "tox", + "coverage", + "python-multipart", + "psutil", + # Linting + "pylint", + "ruff", + # Documentation + "toml", + "reno", + # dulwich is a reno dependency, they pin it at >=0.15.0 so pip takes ton of time to resolve the dependency tree. + # We pin it here to avoid taking too much time. 
+ # https://opendev.org/openstack/reno/src/branch/master/requirements.txt#L7 + "dulwich>=0.21.0,<1.0.0", + # Version specified following Black stability policy: + # https://black.readthedocs.io/en/stable/the_black_code_style/index.html#stability-policy + "black[jupyter]~=23.0", +] + +[tool.hatch.envs.default.scripts] +format = "black ." +format-check = "black --check ." + +[tool.hatch.envs.test] +extra-dependencies = [] + +[tool.hatch.envs.test.scripts] +e2e = "pytest e2e" +unit = 'pytest --cov-report xml:coverage.xml --cov="haystack-experimental" -m "not integration" {args:test}' +integration = 'pytest --maxfail=5 -m "integration" {args:test}' +integration-mac = 'pytest --maxfail=5 -m "integration" -k "not tika" {args:test}' +integration-windows = 'pytest --maxfail=5 -m "integration" -k "not tika" {args:test}' +types = "mypy --install-types --non-interactive --cache-dir=.mypy_cache/ {args:haystack-experimental}" +lint = [ + "ruff check {args:haystack-experimental}", + "pylint -ry -j 0 {args:haystack-experimental}", +] +lint-fix = ["black .", "ruff check {args:haystack-experimental} --fix"] + +[tool.hatch.envs.readme] +detached = true # To avoid installing the dependencies from the default environment +dependencies = ["haystack-pydoc-tools"] + +[tool.hatch.envs.readme.scripts] +sync = "./.github/utils/pydoc-markdown.sh" +delete-outdated = "python ./.github/utils/delete_outdated_docs.py {args}" + +[tool.hatch.envs.snippets] +extra-dependencies = ["torch", "pydantic"] + +[project.urls] +"CI: GitHub" = "https://github.com/deepset-ai/haystack-experimental/actions" +"GitHub: issues" = "https://github.com/deepset-ai/haystack-experimental/issues" +"GitHub: repo" = "https://github.com/deepset-ai/haystack-experimental" +Homepage = "https://github.com/deepset-ai/haystack-experimental" + +[tool.hatch.version] +path = "VERSION.txt" +pattern = "(?P<version>.+)" + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.sdist] +include = 
["/haystack-experimental", "/VERSION.txt"] + +[tool.hatch.build.targets.wheel] +packages = ["haystack-experimental"] + +[tool.black] +line-length = 120 +skip_magic_trailing_comma = true # For compatibility with pydoc>=4.6, check if still needed. + +[tool.codespell] +ignore-words-list = "ans,astroid,nd,ned,nin,ue,rouge,ist" +quiet-level = 3 +skip = "test/nodes/*,test/others/*,test/samples/*,e2e/*" + +[tool.pylint.'MESSAGES CONTROL'] +max-line-length = 120 +disable = [ + + # To keep + "fixme", + "c-extension-no-member", + + # To review: + "missing-docstring", + "unused-argument", + "no-member", + "line-too-long", + "protected-access", + "too-few-public-methods", + "raise-missing-from", + "invalid-name", + "duplicate-code", + "arguments-differ", + "consider-using-f-string", + "no-else-return", + "attribute-defined-outside-init", + "super-with-arguments", + "redefined-builtin", + "abstract-method", + "unspecified-encoding", + "unidiomatic-typecheck", + "no-name-in-module", + "consider-using-with", + "redefined-outer-name", + "arguments-renamed", + "unnecessary-pass", + "broad-except", + "unnecessary-comprehension", + "subprocess-run-check", + "singleton-comparison", + "consider-iterating-dictionary", + "undefined-loop-variable", + "consider-using-in", + "bare-except", + "unexpected-keyword-arg", + "simplifiable-if-expression", + "use-list-literal", + "broad-exception-raised", + + # To review later + "cyclic-import", + "import-outside-toplevel", + "deprecated-method", +] +[tool.pylint.'DESIGN'] +max-args = 38 # Default is 5 +max-attributes = 28 # Default is 7 +max-branches = 34 # Default is 12 +max-locals = 45 # Default is 15 +max-module-lines = 2468 # Default is 1000 +max-nested-blocks = 9 # Default is 5 +max-statements = 206 # Default is 50 +[tool.pylint.'SIMILARITIES'] +min-similarity-lines = 6 + +[tool.pytest.ini_options] +minversion = "6.0" +addopts = "--strict-markers" +markers = [ + "unit: unit tests", + "integration: integration tests", + + "generator: generator 
tests", + "summarizer: summarizer tests", + "embedding_dim: uses a document store with non-default embedding dimension (e.g @pytest.mark.embedding_dim(128))", + + "tika: requires Tika container", + "parsr: requires Parsr container", + "ocr: requires Tesseract", + + "elasticsearch: requires Elasticsearch container", + "weaviate: requires Weaviate container", + "pinecone: requires Pinecone credentials", + "faiss: uses FAISS", + "opensearch", + "document_store", +] +log_cli = true + +[tool.mypy] +warn_return_any = false +warn_unused_configs = true +ignore_missing_imports = true + +[tool.ruff] +line-length = 301 +target-version = "py38" +exclude = ["test"] + + +[tool.ruff.lint] +select = [ + "ASYNC", # flake8-async + "C4", # flake8-comprehensions + "C90", # McCabe cyclomatic complexity + "E501", # Long lines + "EXE", # flake8-executable + "F", # Pyflakes + "INT", # flake8-gettext + "PERF", # Perflint + "PL", # Pylint + "Q", # flake8-quotes + "SIM", # flake8-simplify + "SLOT", # flake8-slots + "T10", # flake8-debugger + "W", # pycodestyle + "YTT", # flake8-2020 + "I", # isort + # built-in shadowing + "A001", # builtin-variable-shadowing + "A002", # builtin-argument-shadowing + "A003", # builtin-attribute-shadowing + # docstring rules + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D209", # Closing triple quotes go to new line + "D205", # 1 blank line required between summary line and description + "D213", # summary lines must be positioned on the second physical line of the docstring + "D417", # undocumented-parameter + "D419", # undocumented-returns +] + +ignore = [ + "F401", # unused-import + "PERF203", # `try`-`except` within a loop incurs performance overhead + "PERF401", # Use a list comprehension to create a transformed list + "PLR1714", # repeated-equality-comparison + "PLR5501", # collapsible-else-if + "PLW0603", # global-statement + "PLW1510", # subprocess-run-without-check + "PLW2901", # redefined-loop-name + 
"SIM108", # if-else-block-instead-of-if-exp + "SIM115", # open-file-with-context-handler + "SIM118", # in-dict-keys +] + +[tool.ruff.lint.mccabe] +max-complexity = 28 + +[tool.ruff.lint.per-file-ignores] + +[tool.ruff.lint.pylint] +allow-magic-value-types = ["float", "int", "str"] +max-args = 14 # Default is 5 +max-branches = 21 # Default is 12 +max-public-methods = 20 # Default is 20 +max-returns = 7 # Default is 6 +max-statements = 60 # Default is 50 diff --git a/releasenotes/config.yaml b/releasenotes/config.yaml new file mode 100644 index 00000000..1becbe28 --- /dev/null +++ b/releasenotes/config.yaml @@ -0,0 +1,47 @@ +default_branch: main +collapse_pre_releases: true +pre_release_tag_re: (?P<pre_release>-(?:[ab]|rc)+\d*)$ +prelude_section_name: highlights +template: | + --- + highlights: > + Replace this text with content to appear at the top of the section for this + release. The highlights might repeat some details that are also present in other notes + from the same release, that's ok. Not every release note requires highlights, + use this section only to describe major features or notable changes. + upgrade: + - | + List upgrade notes here, or remove this section. + Upgrade notes should be rare: only list known/potential breaking changes, + or major changes that require user action before the upgrade. + Notes here must include steps that users can follow to 1. know if they're + affected and 2. handle the change gracefully on their end. + features: + - | + List new features here, or remove this section. + enhancements: + - | + List new behavior that is too small to be + considered a new feature, or remove this section. + issues: + - | + List known issues here, or remove this section. For example, if some change is experimental or known to not work in some cases, it should be mentioned here. + deprecations: + - | + List deprecations notes here, or remove this section. 
Deprecations should not be used for something that is removed in the release, use upgrade section instead. Deprecation should allow time for users to make necessary changes for the removal to happen in a future release. + security: + - | + Add security notes here, or remove this section. + fixes: + - | + Add normal bug fixes here, or remove this section. + +sections: + # The prelude section is implicitly included. + - [upgrade, ⬆️ Upgrade Notes] + - [features, 🚀 New Features] + - [enhancements, ⚡️ Enhancement Notes] + - [issues, Known Issues] + - [deprecations, ⚠️ Deprecation Notes] + - [security, Security Notes] + - [fixes, 🐛 Bug Fixes] diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 00000000..c1764a6e --- /dev/null +++ b/test/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 00000000..a440a7f1 --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from pathlib import Path +from test.tracing.utils import SpyingTracer +from typing import Generator + +import pytest + +from haystack import tracing +from haystack.testing.test_utils import set_all_seeds + +set_all_seeds(0) + + +@pytest.fixture() +def test_files_path(): + return Path(__file__).parent / "test_files" + + +@pytest.fixture(autouse=True) +def request_blocker(request: pytest.FixtureRequest, monkeypatch): + """ + This fixture is applied automatically to all tests. + Those that are not marked as integration will have the requests module + monkeypatched to avoid making HTTP requests by mistake. 
+ """ + marker = request.node.get_closest_marker("integration") + if marker is not None: + return + + def urlopen_mock(self, method, url, *args, **kwargs): + raise RuntimeError(f"The test was about to {method} {self.scheme}://{self.host}{url}") + + monkeypatch.setattr("urllib3.connectionpool.HTTPConnectionPool.urlopen", urlopen_mock) + + +@pytest.fixture() +def spying_tracer() -> Generator[SpyingTracer, None, None]: + tracer = SpyingTracer() + tracing.enable_tracing(tracer) + + yield tracer + + # Make sure to disable tracing after the test to avoid affecting other tests + tracing.disable_tracing() diff --git a/tox.ini b/tox.ini new file mode 100644 index 00000000..824dfdab --- /dev/null +++ b/tox.ini @@ -0,0 +1,18 @@ +[tox] +isolated_build = true +envlist = py38 + + +[testenv] +changedir = test +extras = + test +setenv = + COVERAGE_FILE = test-reports/.coverage + PYTEST_ADDOPTS = --junitxml=test-reports/{envname}/junit.xml -vv +commands = + coverage run --source haystack --parallel-mode -m pytest {posargs} + coverage combine + coverage report -m + coverage html -d test-reports/coverage-html + coverage xml -o test-reports/coverage.xml