Skip to content

text_splitters: Add HTMLSemanticPreservingSplitter #35566

text_splitters: Add HTMLSemanticPreservingSplitter

text_splitters: Add HTMLSemanticPreservingSplitter #35566

Workflow file for this run

---
name: CI
on:
push:
branches: [master]
pull_request:
merge_group:
# If another push to the same PR or branch happens while this workflow is still running,
# cancel the earlier run in favor of the next run.
#
# There's no point in testing an outdated version of the code. GitHub only allows
# a limited number of job runners to be active at the same time, so it's better to cancel
# pointless jobs early so that more useful jobs can run sooner.
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
POETRY_VERSION: "1.8.4"
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.11'
- id: files
uses: Ana06/[email protected]
- id: set-matrix
run: |
python -m pip install packaging requests
python .github/scripts/check_diff.py ${{ steps.files.outputs.all }} >> $GITHUB_OUTPUT
outputs:
lint: ${{ steps.set-matrix.outputs.lint }}
test: ${{ steps.set-matrix.outputs.test }}
extended-tests: ${{ steps.set-matrix.outputs.extended-tests }}
compile-integration-tests: ${{ steps.set-matrix.outputs.compile-integration-tests }}
dependencies: ${{ steps.set-matrix.outputs.dependencies }}
test-doc-imports: ${{ steps.set-matrix.outputs.test-doc-imports }}
test-pydantic: ${{ steps.set-matrix.outputs.test-pydantic }}
lint:
name: cd ${{ matrix.job-configs.working-directory }}
needs: [ build ]
if: ${{ needs.build.outputs.lint != '[]' }}
strategy:
matrix:
job-configs: ${{ fromJson(needs.build.outputs.lint) }}
fail-fast: false
uses: ./.github/workflows/_lint.yml
with:
working-directory: ${{ matrix.job-configs.working-directory }}
python-version: ${{ matrix.job-configs.python-version }}
secrets: inherit
test:
name: cd ${{ matrix.job-configs.working-directory }}
needs: [ build ]
if: ${{ needs.build.outputs.test != '[]' }}
strategy:
matrix:
job-configs: ${{ fromJson(needs.build.outputs.test) }}
fail-fast: false
uses: ./.github/workflows/_test.yml
with:
working-directory: ${{ matrix.job-configs.working-directory }}
python-version: ${{ matrix.job-configs.python-version }}
secrets: inherit
test-pydantic:
name: cd ${{ matrix.job-configs.working-directory }}
needs: [ build ]
if: ${{ needs.build.outputs.test-pydantic != '[]' }}
strategy:
matrix:
job-configs: ${{ fromJson(needs.build.outputs.test-pydantic) }}
fail-fast: false
uses: ./.github/workflows/_test_pydantic.yml
with:
working-directory: ${{ matrix.job-configs.working-directory }}
pydantic-version: ${{ matrix.job-configs.pydantic-version }}
secrets: inherit
test-doc-imports:
needs: [ build ]
if: ${{ needs.build.outputs.test-doc-imports != '[]' }}
strategy:
matrix:
job-configs: ${{ fromJson(needs.build.outputs.test-doc-imports) }}
fail-fast: false
uses: ./.github/workflows/_test_doc_imports.yml
secrets: inherit
with:
python-version: ${{ matrix.job-configs.python-version }}
compile-integration-tests:
name: cd ${{ matrix.job-configs.working-directory }}
needs: [ build ]
if: ${{ needs.build.outputs.compile-integration-tests != '[]' }}
strategy:
matrix:
job-configs: ${{ fromJson(needs.build.outputs.compile-integration-tests) }}
fail-fast: false
uses: ./.github/workflows/_compile_integration_test.yml
with:
working-directory: ${{ matrix.job-configs.working-directory }}
python-version: ${{ matrix.job-configs.python-version }}
secrets: inherit
extended-tests:
name: "cd ${{ matrix.job-configs.working-directory }} / make extended_tests #${{ matrix.job-configs.python-version }}"
needs: [ build ]
if: ${{ needs.build.outputs.extended-tests != '[]' }}
strategy:
matrix:
# note different variable for extended test dirs
job-configs: ${{ fromJson(needs.build.outputs.extended-tests) }}
fail-fast: false
runs-on: ubuntu-latest
timeout-minutes: 20
defaults:
run:
working-directory: ${{ matrix.job-configs.working-directory }}
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.job-configs.python-version }} + Poetry ${{ env.POETRY_VERSION }}
uses: "./.github/actions/poetry_setup"
with:
python-version: ${{ matrix.job-configs.python-version }}
poetry-version: ${{ env.POETRY_VERSION }}
working-directory: ${{ matrix.job-configs.working-directory }}
cache-key: extended
- name: Install dependencies
shell: bash
run: |
echo "Running extended tests, installing dependencies with poetry..."
poetry install --with test
poetry run pip install uv
poetry run uv pip install -r extended_testing_deps.txt
- name: Run extended tests
run: make extended_tests
- name: Ensure the tests did not create any additional files
shell: bash
run: |
set -eu
STATUS="$(git status)"
echo "$STATUS"
# grep will exit non-zero if the target message isn't found,
# and `set -e` above will cause the step to fail.
echo "$STATUS" | grep 'nothing to commit, working tree clean'
ci_success:
name: "CI Success"
needs: [build, lint, test, compile-integration-tests, extended-tests, test-doc-imports, test-pydantic]
if: |
always()
runs-on: ubuntu-latest
env:
JOBS_JSON: ${{ toJSON(needs) }}
RESULTS_JSON: ${{ toJSON(needs.*.result) }}
EXIT_CODE: ${{!contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') && '0' || '1'}}
steps:
- name: "CI Success"
run: |
echo $JOBS_JSON
echo $RESULTS_JSON
echo "Exiting with $EXIT_CODE"
exit $EXIT_CODE