Skip to content

!feat: unify NLTKDocumentSplitter and DocumentSplitter #15349

!feat: unify NLTKDocumentSplitter and DocumentSplitter

!feat: unify NLTKDocumentSplitter and DocumentSplitter #15349

Workflow file for this run

# If you change this name also do it in tests_skipper.yml and ci_metrics.yml
name: Tests
on:
workflow_dispatch: # Activate this workflow manually
push:
branches:
- main
# release branches have the form v1.9.x
- "v[0-9].*[0-9].x"
pull_request:
types:
- opened
- reopened
- synchronize
- ready_for_review
paths:
# Keep the list in sync with the paths defined in the `tests_skipper.yml` workflow
- "haystack/**/*.py"
- "haystack/core/pipeline/predefined/*"
- "test/**/*.py"
- "pyproject.toml"
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
CORE_AZURE_CS_ENDPOINT: ${{ secrets.CORE_AZURE_CS_ENDPOINT }}
CORE_AZURE_CS_API_KEY: ${{ secrets.CORE_AZURE_CS_API_KEY }}
AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
HF_API_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
PYTHON_VERSION: "3.9"
HATCH_VERSION: "1.13.0"
jobs:
format:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "${{ env.PYTHON_VERSION }}"
- name: Install Hatch
run: pip install hatch==${{ env.HATCH_VERSION }}
- name: Check file format
run: hatch run format-check
- name: Check linting
run: hatch run check
- name: Check presence of license header
run: docker run --rm -v "$(pwd):/github/workspace" ghcr.io/korandoru/hawkeye check
- name: Calculate alert data
id: calculator
shell: bash
if: (success() || failure()) && github.ref_name == 'main'
run: |
if [ "${{ job.status }}" = "success" ]; then
echo "alert_type=success" >> "$GITHUB_OUTPUT";
else
echo "alert_type=error" >> "$GITHUB_OUTPUT";
fi
- name: Send event to Datadog
if: (success() || failure()) && github.ref_name == 'main'
uses: masci/datadog@v1
with:
api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
api-url: https://api.datadoghq.eu
events: |
- title: "${{ github.workflow }} workflow"
text: "Job ${{ github.job }} in branch ${{ github.ref_name }}"
alert_type: "${{ steps.calculator.outputs.alert_type }}"
source_type_name: "Github"
host: ${{ github.repository_owner }}
tags:
- "project:${{ github.repository }}"
- "job:${{ github.job }}"
- "run_id:${{ github.run_id }}"
- "workflow:${{ github.workflow }}"
- "branch:${{ github.ref_name }}"
- "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
check-imports:
needs: format
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "${{ env.PYTHON_VERSION }}"
- name: Install Hatch
run: pip install hatch==${{ env.HATCH_VERSION }}
- name: Check imports
run: hatch run python .github/utils/check_imports.py
- name: Calculate alert data
id: calculator
shell: bash
if: (success() || failure()) && github.ref_name == 'main'
run: |
if [ "${{ job.status }}" = "success" ]; then
echo "alert_type=success" >> "$GITHUB_OUTPUT";
else
echo "alert_type=error" >> "$GITHUB_OUTPUT";
fi
- name: Send event to Datadog
if: (success() || failure()) && github.ref_name == 'main'
uses: masci/datadog@v1
with:
api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
api-url: https://api.datadoghq.eu
events: |
- title: "${{ github.workflow }} workflow"
text: "Job ${{ github.job }} in branch ${{ github.ref_name }}"
alert_type: "${{ steps.calculator.outputs.alert_type }}"
source_type_name: "Github"
host: ${{ github.repository_owner }}
tags:
- "project:${{ github.repository }}"
- "job:${{ github.job }}"
- "run_id:${{ github.run_id }}"
- "workflow:${{ github.workflow }}"
- "branch:${{ github.ref_name }}"
- "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
unit-tests:
name: Unit / ${{ matrix.os }}
needs: format
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
- windows-latest
- macos-latest
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "${{ env.PYTHON_VERSION }}"
- name: Install Hatch
id: hatch
shell: bash
run: |
pip install hatch==${{ env.HATCH_VERSION }}
echo "env=$(hatch env find test)" >> "$GITHUB_OUTPUT"
- name: Run
run: hatch run test:unit
- uses: actions/cache/save@v4
id: cache
with:
path: ${{ steps.hatch.outputs.env }}
key: ${{ runner.os }}-${{ github.sha }}
- name: Coveralls
# We upload only coverage for ubuntu as handling both os
# complicates the workflow too much for little to no gain
if: matrix.os == 'ubuntu-latest'
uses: coverallsapp/github-action@v2
with:
path-to-lcov: coverage.xml
- name: Calculate alert data
id: calculator
shell: bash
if: (success() || failure()) && github.ref_name == 'main'
run: |
if [ "${{ job.status }}" = "success" ]; then
echo "alert_type=success" >> "$GITHUB_OUTPUT";
else
echo "alert_type=error" >> "$GITHUB_OUTPUT";
fi
- name: Send event to Datadog
if: (success() || failure()) && github.ref_name == 'main'
uses: masci/datadog@v1
with:
api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
api-url: https://api.datadoghq.eu
events: |
- title: "${{ github.workflow }} workflow"
text: "Job ${{ github.job }} in branch ${{ github.ref_name }}"
alert_type: "${{ steps.calculator.outputs.alert_type }}"
source_type_name: "Github"
host: ${{ github.repository_owner }}
tags:
- "project:${{ github.repository }}"
- "job:${{ github.job }}"
- "run_id:${{ github.run_id }}"
- "workflow:${{ github.workflow }}"
- "branch:${{ github.ref_name }}"
- "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
lint:
needs: unit-tests
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
# With the default value of 1, there are corner cases where tj-actions/changed-files
# fails with a `no merge base` error
fetch-depth: 0
- name: Get changed files
id: files
uses: tj-actions/changed-files@v45
with:
files: |
**/*.py
files_ignore: |
test/**
- uses: actions/setup-python@v5
with:
python-version: "${{ env.PYTHON_VERSION }}"
- name: Install Hatch
id: hatch
run: |
pip install hatch==${{ env.HATCH_VERSION }}
echo "env=$(hatch env find test)" >> "$GITHUB_OUTPUT"
- uses: actions/cache/restore@v4
id: cache
with:
path: ${{ steps.hatch.outputs.env }}
key: ${{ runner.os }}-${{ github.sha }}
- name: Mypy
if: steps.files.outputs.any_changed == 'true'
run: |
mkdir .mypy_cache
hatch run test:types ${{ steps.files.outputs.all_changed_files }}
- name: Pylint
if: steps.files.outputs.any_changed == 'true'
run: |
hatch run test:lint ${{ steps.files.outputs.all_changed_files }}
- name: Calculate alert data
id: calculator
shell: bash
if: (success() || failure()) && github.ref_name == 'main'
run: |
if [ "${{ job.status }}" = "success" ]; then
echo "alert_type=success" >> "$GITHUB_OUTPUT";
else
echo "alert_type=error" >> "$GITHUB_OUTPUT";
fi
- name: Send event to Datadog
if: (success() || failure()) && github.ref_name == 'main'
uses: masci/datadog@v1
with:
api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
api-url: https://api.datadoghq.eu
events: |
- title: "${{ github.workflow }} workflow"
text: "Job ${{ github.job }} in branch ${{ github.ref_name }}"
alert_type: "${{ steps.calculator.outputs.alert_type }}"
source_type_name: "Github"
host: ${{ github.repository_owner }}
tags:
- "project:${{ github.repository }}"
- "job:${{ github.job }}"
- "run_id:${{ github.run_id }}"
- "workflow:${{ github.workflow }}"
- "branch:${{ github.ref_name }}"
- "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
integration-tests-linux:
name: Integration / ubuntu-latest
needs: unit-tests
runs-on: ubuntu-latest
services:
tika:
image: apache/tika:2.9.0.0
ports:
- 9998:9998
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "${{ env.PYTHON_VERSION }}"
- name: Install Hatch
id: hatch
shell: bash
run: |
pip install hatch==${{ env.HATCH_VERSION }}
echo "env=$(hatch env find test)" >> "$GITHUB_OUTPUT"
- uses: actions/cache/restore@v4
id: cache
with:
path: ${{ steps.hatch.outputs.env }}
key: ${{ runner.os }}-${{ github.sha }}
- name: Install dependencies
run: |
sudo apt update
sudo apt install ffmpeg # for local Whisper tests
- name: Run
run: hatch run test:integration
- name: Calculate alert data
id: calculator
shell: bash
if: (success() || failure()) && github.ref_name == 'main'
run: |
if [ "${{ job.status }}" = "success" ]; then
echo "alert_type=success" >> "$GITHUB_OUTPUT";
else
echo "alert_type=error" >> "$GITHUB_OUTPUT";
fi
- name: Send event to Datadog
if: (success() || failure()) && github.ref_name == 'main'
uses: masci/datadog@v1
with:
api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
api-url: https://api.datadoghq.eu
events: |
- title: "${{ github.workflow }} workflow"
text: "Job ${{ github.job }} in branch ${{ github.ref_name }}"
alert_type: "${{ steps.calculator.outputs.alert_type }}"
source_type_name: "Github"
host: ${{ github.repository_owner }}
tags:
- "project:${{ github.repository }}"
- "job:${{ github.job }}"
- "run_id:${{ github.run_id }}"
- "workflow:${{ github.workflow }}"
- "branch:${{ github.ref_name }}"
- "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
integration-tests-macos:
name: Integration / macos-latest
needs: unit-tests
runs-on: macos-latest
env:
HAYSTACK_MPS_ENABLED: false
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "${{ env.PYTHON_VERSION }}"
- name: Install Hatch
id: hatch
shell: bash
run: |
pip install hatch==${{ env.HATCH_VERSION }}
echo "env=$(hatch env find test)" >> "$GITHUB_OUTPUT"
- uses: actions/cache/restore@v4
id: cache
with:
path: ${{ steps.hatch.outputs.env }}
key: ${{ runner.os }}-${{ github.sha }}
- name: Install dependencies
run: |
brew install ffmpeg # for local Whisper tests
- name: Run
run: hatch run test:integration-mac
- name: Calculate alert data
id: calculator
shell: bash
if: (success() || failure()) && github.ref_name == 'main'
run: |
if [ "${{ job.status }}" = "success" ]; then
echo "alert_type=success" >> "$GITHUB_OUTPUT";
else
echo "alert_type=error" >> "$GITHUB_OUTPUT";
fi
- name: Send event to Datadog
if: (success() || failure()) && github.ref_name == 'main'
uses: masci/datadog@v1
with:
api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
api-url: https://api.datadoghq.eu
events: |
- title: "${{ github.workflow }} workflow"
text: "Job ${{ github.job }} in branch ${{ github.ref_name }}"
alert_type: "${{ steps.calculator.outputs.alert_type }}"
source_type_name: "Github"
host: ${{ github.repository_owner }}
tags:
- "project:${{ github.repository }}"
- "job:${{ github.job }}"
- "run_id:${{ github.run_id }}"
- "workflow:${{ github.workflow }}"
- "branch:${{ github.ref_name }}"
- "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
integration-tests-windows:
name: Integration / windows-latest
needs: unit-tests
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "${{ env.PYTHON_VERSION }}"
- name: Install Hatch
id: hatch
shell: bash
run: |
pip install hatch==${{ env.HATCH_VERSION }}
echo "env=$(hatch env find test)" >> "$GITHUB_OUTPUT"
- uses: actions/cache/restore@v4
id: cache
with:
path: ${{ steps.hatch.outputs.env }}
key: ${{ runner.os }}-${{ github.sha }}
- name: Run
run: hatch run test:integration-windows
- name: Calculate alert data
id: calculator
shell: bash
if: (success() || failure()) && github.ref_name == 'main'
run: |
if [ "${{ job.status }}" = "success" ]; then
echo "alert_type=success" >> "$GITHUB_OUTPUT";
else
echo "alert_type=error" >> "$GITHUB_OUTPUT";
fi
- name: Send event to Datadog
if: (success() || failure()) && github.ref_name == 'main'
uses: masci/datadog@v1
with:
api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
api-url: https://api.datadoghq.eu
events: |
- title: "${{ github.workflow }} workflow"
text: "Job ${{ github.job }} in branch ${{ github.ref_name }}"
alert_type: "${{ steps.calculator.outputs.alert_type }}"
source_type_name: "Github"
host: ${{ github.repository_owner }}
tags:
- "project:${{ github.repository }}"
- "job:${{ github.job }}"
- "run_id:${{ github.run_id }}"
- "workflow:${{ github.workflow }}"
- "branch:${{ github.ref_name }}"
- "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
trigger-catch-all:
name: Tests completed
# This job will be executed only after all the other tests
# are successful.
# This way we'll be able to mark only this test as required
# and skip it accordingly.
needs:
- integration-tests-linux
- integration-tests-macos
- integration-tests-windows
uses: ./.github/workflows/tests_skipper_workflow.yml
with:
tests_were_skipped: false