From 32e90ce912c45a9cf09dffa02a5fb3b9275c419b Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Sep 2024 16:34:28 -0400 Subject: [PATCH 01/14] reorganize community deps --- .../community/extended_dependencies/extended_testing_deps.txt | 2 ++ .../other_deps.txt} | 4 ---- libs/community/extended_dependencies/pdf_loader_deps.txt | 4 ++++ 3 files changed, 6 insertions(+), 4 deletions(-) create mode 100644 libs/community/extended_dependencies/extended_testing_deps.txt rename libs/community/{extended_testing_deps.txt => extended_dependencies/other_deps.txt} (95%) create mode 100644 libs/community/extended_dependencies/pdf_loader_deps.txt diff --git a/libs/community/extended_dependencies/extended_testing_deps.txt b/libs/community/extended_dependencies/extended_testing_deps.txt new file mode 100644 index 0000000000000..fd947b8dac4db --- /dev/null +++ b/libs/community/extended_dependencies/extended_testing_deps.txt @@ -0,0 +1,2 @@ +-r pdf_loader_deps.txt +-r other_deps.txt diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_dependencies/other_deps.txt similarity index 95% rename from libs/community/extended_testing_deps.txt rename to libs/community/extended_dependencies/other_deps.txt index d9879fd6aa07c..466fd308d2d40 100644 --- a/libs/community/extended_testing_deps.txt +++ b/libs/community/extended_dependencies/other_deps.txt @@ -54,7 +54,6 @@ openapi-pydantic>=0.3.2,<0.4 oracle-ads>=2.9.1,<3 oracledb>=2.2.0,<3 pandas>=2.0.1,<3 -pdfminer-six>=20221105,<20240706 pgvector>=0.1.6,<0.2 praw>=7.7.1,<8 premai>=0.3.25,<0.4 @@ -62,9 +61,6 @@ psychicapi>=0.8.0,<0.9 pydantic>=2.7.4,<3 py-trello>=0.19.0,<0.20 pyjwt>=2.8.0,<3 -pymupdf>=1.22.3,<2 -pypdf>=3.4.0,<5 -pypdfium2>=4.10.0,<5 pyspark>=3.4.0,<4 rank-bm25>=0.2.2,<0.3 rapidfuzz>=3.1.1,<4 diff --git a/libs/community/extended_dependencies/pdf_loader_deps.txt b/libs/community/extended_dependencies/pdf_loader_deps.txt new file mode 100644 index 0000000000000..63bc45ce53156 --- /dev/null +++ b/libs/community/extended_dependencies/pdf_loader_deps.txt @@ -0,0 +1,4 @@ +pdfminer-six>=20221105,<20240706 +pymupdf>=1.22.3,<2 +pypdf>=3.4.0,<5 +pypdfium2>=4.10.0,<5 From 74a607999215aeecf2b673afb272eb9ad41a43a4 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Sep 2024 16:38:49 -0400 Subject: [PATCH 02/14] move all extended_testing_deps.txt --- .github/scripts/check_diff.py | 4 +++- .github/workflows/check_diffs.yml | 2 +- .../{ => extended_dependencies}/extended_testing_deps.txt | 0 .../{ => extended_dependencies}/extended_testing_deps.txt | 0 .../{ => extended_dependencies}/extended_testing_deps.txt | 0 .../{ => extended_dependencies}/extended_testing_deps.txt | 0 6 files changed, 4 insertions(+), 2 deletions(-) rename libs/core/{ => extended_dependencies}/extended_testing_deps.txt (100%) rename libs/experimental/{ => extended_dependencies}/extended_testing_deps.txt (100%) rename libs/langchain/{ => extended_dependencies}/extended_testing_deps.txt (100%) rename libs/text-splitters/{ => extended_dependencies}/extended_testing_deps.txt (100%) diff --git a/.github/scripts/check_diff.py b/.github/scripts/check_diff.py index fc2ce26f1baf2..ffdee0594f6e5 100644 --- a/.github/scripts/check_diff.py +++ b/.github/scripts/check_diff.py @@ -68,7 +68,9 @@ def dependents_graph() -> dict: # load extended deps from extended_testing_deps.txt package_path = Path(path).parent - extended_requirement_path = package_path / "extended_testing_deps.txt" + extended_requirement_path = ( + package_path / "extended_dependencies" / "extended_testing_deps.txt" + ) if extended_requirement_path.exists(): with open(extended_requirement_path, "r") as f: extended_deps = f.read().splitlines() diff --git a/.github/workflows/check_diffs.yml b/.github/workflows/check_diffs.yml index b5729611645c6..a94cb2e40e8a5 100644 --- a/.github/workflows/check_diffs.yml +++ b/.github/workflows/check_diffs.yml @@ -139,7 +139,7 @@ jobs: echo "Running extended tests, installing dependencies with poetry..." poetry install --with test poetry run pip install uv - poetry run uv pip install -r extended_testing_deps.txt + poetry run uv pip install -r extended_dependencies/extended_testing_deps.txt - name: Run extended tests run: make extended_tests diff --git a/libs/core/extended_testing_deps.txt b/libs/core/extended_dependencies/extended_testing_deps.txt similarity index 100% rename from libs/core/extended_testing_deps.txt rename to libs/core/extended_dependencies/extended_testing_deps.txt diff --git a/libs/experimental/extended_testing_deps.txt b/libs/experimental/extended_dependencies/extended_testing_deps.txt similarity index 100% rename from libs/experimental/extended_testing_deps.txt rename to libs/experimental/extended_dependencies/extended_testing_deps.txt diff --git a/libs/langchain/extended_testing_deps.txt b/libs/langchain/extended_dependencies/extended_testing_deps.txt similarity index 100% rename from libs/langchain/extended_testing_deps.txt rename to libs/langchain/extended_dependencies/extended_testing_deps.txt diff --git a/libs/text-splitters/extended_testing_deps.txt b/libs/text-splitters/extended_dependencies/extended_testing_deps.txt similarity index 100% rename from libs/text-splitters/extended_testing_deps.txt rename to libs/text-splitters/extended_dependencies/extended_testing_deps.txt From fe832bef26f7ce45b1baca535346739bed30f2d8 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Sep 2024 17:30:57 -0400 Subject: [PATCH 03/14] temporarily hijack integration test job to test workflow dispatch off branch --- .github/workflows/_integration_test.yml | 63 +++++++------------------ 1 file changed, 17 insertions(+), 46 deletions(-) diff --git a/.github/workflows/_integration_test.yml b/.github/workflows/_integration_test.yml index 40b0ec5344edd..8451321a68642 100644 --- a/.github/workflows/_integration_test.yml +++ b/.github/workflows/_integration_test.yml @@ -6,10 +6,20 @@ on: working-directory: required: true type: string + default: "libs/community" python-version: required: true type: string description: "Python version to use" + default: "3.11" + extended-deps-file: + required: true + type: choice + description: "File to install extended dependencies from" + options: + - pdf_loader_deps.txt + - extended_testing_deps.txt + - other_deps.txt env: POETRY_VERSION: "1.7.1" @@ -32,61 +42,22 @@ jobs: working-directory: ${{ inputs.working-directory }} cache-key: core - - name: Install dependencies + - name: Install extended dependencies shell: bash - run: poetry install --with test,test_integration + run: | + poetry install --with test + poetry run pip install uv + poetry run uv pip install -r extended_dependencies/${{ inputs.extended-deps-file }} - name: Install deps outside pyproject if: ${{ startsWith(inputs.working-directory, 'libs/community/') }} shell: bash run: poetry run pip install "boto3<2" "google-cloud-aiplatform<2" - - name: 'Authenticate to Google Cloud' - id: 'auth' - uses: google-github-actions/auth@v2 - with: - credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}' - - - name: Run integration tests + - name: Run extended tests shell: bash - env: - AI21_API_KEY: ${{ secrets.AI21_API_KEY }} - FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }} - GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - AZURE_OPENAI_API_VERSION: ${{ secrets.AZURE_OPENAI_API_VERSION }} - AZURE_OPENAI_API_BASE: ${{ secrets.AZURE_OPENAI_API_BASE }} - AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} - AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ secrets.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }} - AZURE_OPENAI_LLM_DEPLOYMENT_NAME: ${{ secrets.AZURE_OPENAI_LLM_DEPLOYMENT_NAME }} - AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME: ${{ secrets.AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME }} - MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }} - TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} - NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }} - GOOGLE_SEARCH_API_KEY: ${{ secrets.GOOGLE_SEARCH_API_KEY }} - GOOGLE_CSE_ID: ${{ secrets.GOOGLE_CSE_ID }} - HUGGINGFACEHUB_API_TOKEN: ${{ secrets.HUGGINGFACEHUB_API_TOKEN }} - EXA_API_KEY: ${{ secrets.EXA_API_KEY }} - NOMIC_API_KEY: ${{ secrets.NOMIC_API_KEY }} - WATSONX_APIKEY: ${{ secrets.WATSONX_APIKEY }} - WATSONX_PROJECT_ID: ${{ secrets.WATSONX_PROJECT_ID }} - PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} - PINECONE_ENVIRONMENT: ${{ secrets.PINECONE_ENVIRONMENT }} - ASTRA_DB_API_ENDPOINT: ${{ secrets.ASTRA_DB_API_ENDPOINT }} - ASTRA_DB_APPLICATION_TOKEN: ${{ secrets.ASTRA_DB_APPLICATION_TOKEN }} - ASTRA_DB_KEYSPACE: ${{ secrets.ASTRA_DB_KEYSPACE }} - ES_URL: ${{ secrets.ES_URL }} - ES_CLOUD_ID: ${{ secrets.ES_CLOUD_ID }} - ES_API_KEY: ${{ secrets.ES_API_KEY }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # for airbyte - MONGODB_ATLAS_URI: ${{ secrets.MONGODB_ATLAS_URI }} - VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }} - COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }} - UPSTAGE_API_KEY: ${{ secrets.UPSTAGE_API_KEY }} run: | - make integration_tests + make test - name: Ensure the tests did not create any additional files shell: bash From 48ca84dcfa46afd1eba69c073405924fc5da65df Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Sep 2024 17:50:23 -0400 Subject: [PATCH 04/14] move some pdf integration tests to extended tests --- .../document_loaders/test_pdf.py | 33 ------------- .../unit_tests/document_loaders/test_pdf.py | 46 +++++++++++++++++++ 2 files changed, 46 insertions(+), 33 deletions(-) create mode 100644 libs/community/tests/unit_tests/document_loaders/test_pdf.py diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py index 462e20d357904..50c9fde29d918 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py @@ -1,4 +1,3 @@ -import re from pathlib import Path from typing import Sequence, Union @@ -11,7 +10,6 @@ PDFMinerPDFasHTMLLoader, PyMuPDFLoader, PyPDFium2Loader, - PyPDFLoader, UnstructuredPDFLoader, ) @@ -86,37 +84,6 @@ def test_pdfminer_pdf_as_html_loader() -> None: assert len(docs) == 1 -def test_pypdf_loader() -> None: - """Test PyPDFLoader.""" - file_path = Path(__file__).parent.parent / "examples/hello.pdf" - loader = PyPDFLoader(str(file_path)) - docs = loader.load() - - assert len(docs) == 1 - - file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PyPDFLoader(str(file_path)) - - docs = loader.load() - assert len(docs) == 16 - - -def test_pypdf_loader_with_layout() -> None: - """Test PyPDFLoader with layout mode.""" - file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = PyPDFLoader(str(file_path), extraction_mode="layout") - - docs = loader.load() - first_page = docs[0].page_content - - expected = ( - Path(__file__).parent.parent / "examples/layout-parser-paper-page-1.txt" - ).read_text(encoding="utf-8") - cleaned_first_page = re.sub(r"\x00", "", first_page) - cleaned_expected = re.sub(r"\x00", "", expected) - assert cleaned_first_page == cleaned_expected - - def test_pypdfium2_loader() -> None: """Test PyPDFium2Loader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" diff --git a/libs/community/tests/unit_tests/document_loaders/test_pdf.py b/libs/community/tests/unit_tests/document_loaders/test_pdf.py new file mode 100644 index 0000000000000..d62363723bd60 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/test_pdf.py @@ -0,0 +1,46 @@ +import re +from pathlib import Path + +import pytest + +from langchain_community.document_loaders import PyPDFLoader + +path_to_simple_pdf = ( + Path(__file__).parent.parent.parent / "integration_tests/examples/hello.pdf" +) +path_to_layout_pdf = ( + Path(__file__).parent.parent + / "document_loaders/sample_documents/layout-parser-paper.pdf" +) +path_to_layout_pdf_txt = ( + Path(__file__).parent.parent.parent + / "integration_tests/examples/layout-parser-paper-page-1.txt" +) + + +@pytest.mark.requires("pypdf") +def test_pypdf_loader() -> None: + """Test PyPDFLoader.""" + loader = PyPDFLoader(str(path_to_simple_pdf)) + docs = loader.load() + + assert len(docs) == 1 + + loader = PyPDFLoader(str(path_to_layout_pdf)) + + docs = loader.load() + assert len(docs) == 16 + + +@pytest.mark.requires("pypdf") +def test_pypdf_loader_with_layout() -> None: + """Test PyPDFLoader with layout mode.""" + loader = PyPDFLoader(str(path_to_layout_pdf), extraction_mode="layout") + + docs = loader.load() + first_page = docs[0].page_content + + expected = path_to_layout_pdf_txt.read_text(encoding="utf-8") + cleaned_first_page = re.sub(r"\x00", "", first_page) + cleaned_expected = re.sub(r"\x00", "", expected) + assert cleaned_first_page == cleaned_expected From 8f786501812fb22fa5c75ae5d9e78796ca7ddd0c Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Sep 2024 17:50:41 -0400 Subject: [PATCH 05/14] temporarily raise assertion error to check that tests run --- libs/community/langchain_community/document_loaders/pdf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index b01ffea8efcaa..72936b21fa0bb 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -250,6 +250,7 @@ def lazy_load( self, ) -> Iterator[Document]: """Lazy load given path as pages.""" + raise AssertionError("Testing!") if self.web_path: blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined] else: From 9ef23df3f826bdaf0cc2f4e6f5b2bb5ded498370 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Sep 2024 17:53:15 -0400 Subject: [PATCH 06/14] Revert "temporarily raise assertion error to check that tests run" This reverts commit 8f786501812fb22fa5c75ae5d9e78796ca7ddd0c. --- libs/community/langchain_community/document_loaders/pdf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index 72936b21fa0bb..b01ffea8efcaa 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -250,7 +250,6 @@ def lazy_load( self, ) -> Iterator[Document]: """Lazy load given path as pages.""" - raise AssertionError("Testing!") if self.web_path: blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined] else: From 58e7175262102d274e10ed0f46bca069c7ea64e1 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Sep 2024 17:54:04 -0400 Subject: [PATCH 07/14] Revert "temporarily hijack integration test job to test workflow dispatch off branch" This reverts commit fe832bef26f7ce45b1baca535346739bed30f2d8. --- .github/workflows/_integration_test.yml | 63 ++++++++++++++++++------- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/.github/workflows/_integration_test.yml b/.github/workflows/_integration_test.yml index 8451321a68642..40b0ec5344edd 100644 --- a/.github/workflows/_integration_test.yml +++ b/.github/workflows/_integration_test.yml @@ -6,20 +6,10 @@ on: working-directory: required: true type: string - default: "libs/community" python-version: required: true type: string description: "Python version to use" - default: "3.11" - extended-deps-file: - required: true - type: choice - description: "File to install extended dependencies from" - options: - - pdf_loader_deps.txt - - extended_testing_deps.txt - - other_deps.txt env: POETRY_VERSION: "1.7.1" @@ -42,22 +32,61 @@ jobs: working-directory: ${{ inputs.working-directory }} cache-key: core - - name: Install extended dependencies + - name: Install dependencies shell: bash - run: | - poetry install --with test - poetry run pip install uv - poetry run uv pip install -r extended_dependencies/${{ inputs.extended-deps-file }} + run: poetry install --with test,test_integration - name: Install deps outside pyproject if: ${{ startsWith(inputs.working-directory, 'libs/community/') }} shell: bash run: poetry run pip install "boto3<2" "google-cloud-aiplatform<2" - - name: Run extended tests + - name: 'Authenticate to Google Cloud' + id: 'auth' + uses: google-github-actions/auth@v2 + with: + credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}' + + - name: Run integration tests shell: bash + env: + AI21_API_KEY: ${{ secrets.AI21_API_KEY }} + FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }} + GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + AZURE_OPENAI_API_VERSION: ${{ secrets.AZURE_OPENAI_API_VERSION }} + AZURE_OPENAI_API_BASE: ${{ secrets.AZURE_OPENAI_API_BASE }} + AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} + AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ secrets.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }} + AZURE_OPENAI_LLM_DEPLOYMENT_NAME: ${{ secrets.AZURE_OPENAI_LLM_DEPLOYMENT_NAME }} + AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME: ${{ secrets.AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME }} + MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }} + TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} + NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }} + GOOGLE_SEARCH_API_KEY: ${{ secrets.GOOGLE_SEARCH_API_KEY }} + GOOGLE_CSE_ID: ${{ secrets.GOOGLE_CSE_ID }} + HUGGINGFACEHUB_API_TOKEN: ${{ secrets.HUGGINGFACEHUB_API_TOKEN }} + EXA_API_KEY: ${{ secrets.EXA_API_KEY }} + NOMIC_API_KEY: ${{ secrets.NOMIC_API_KEY }} + WATSONX_APIKEY: ${{ secrets.WATSONX_APIKEY }} + WATSONX_PROJECT_ID: ${{ secrets.WATSONX_PROJECT_ID }} + PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} + PINECONE_ENVIRONMENT: ${{ secrets.PINECONE_ENVIRONMENT }} + ASTRA_DB_API_ENDPOINT: ${{ secrets.ASTRA_DB_API_ENDPOINT }} + ASTRA_DB_APPLICATION_TOKEN: ${{ secrets.ASTRA_DB_APPLICATION_TOKEN }} + ASTRA_DB_KEYSPACE: ${{ secrets.ASTRA_DB_KEYSPACE }} + ES_URL: ${{ secrets.ES_URL }} + ES_CLOUD_ID: ${{ secrets.ES_CLOUD_ID }} + ES_API_KEY: ${{ secrets.ES_API_KEY }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # for airbyte + MONGODB_ATLAS_URI: ${{ secrets.MONGODB_ATLAS_URI }} + VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }} + COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }} + UPSTAGE_API_KEY: ${{ secrets.UPSTAGE_API_KEY }} run: | - make test + make integration_tests - name: Ensure the tests did not create any additional files shell: bash From d7667da26d7410976dfaa9ed3ae7d8b675b07c97 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Sep 2024 17:54:16 -0400 Subject: [PATCH 08/14] add extended test workflow --- .github/workflows/_extended_test.yml | 72 ++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 .github/workflows/_extended_test.yml diff --git a/.github/workflows/_extended_test.yml b/.github/workflows/_extended_test.yml new file mode 100644 index 0000000000000..fa010f3d54ee6 --- /dev/null +++ b/.github/workflows/_extended_test.yml @@ -0,0 +1,72 @@ +name: Extended tests + +on: + workflow_dispatch: + inputs: + working-directory: + required: true + type: string + default: "libs/community" + python-version: + required: true + type: string + description: "Python version to use" + default: "3.11" + extended-deps-file: + required: true + type: choice + description: "File to install extended dependencies from" + options: + - extended_testing_deps.txt + - pdf_loader_deps.txt + - other_deps.txt + +env: + POETRY_VERSION: "1.7.1" + +jobs: + build: + defaults: + run: + working-directory: ${{ inputs.working-directory }} + runs-on: ubuntu-latest + name: Python ${{ inputs.python-version }} + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ inputs.python-version }} + Poetry ${{ env.POETRY_VERSION }} + uses: "./.github/actions/poetry_setup" + with: + python-version: ${{ inputs.python-version }} + poetry-version: ${{ env.POETRY_VERSION }} + working-directory: ${{ inputs.working-directory }} + cache-key: core + + - name: Install extended dependencies + shell: bash + run: | + poetry install --with test + poetry run pip install uv + poetry run uv pip install -r extended_dependencies/${{ inputs.extended-deps-file }} + + - name: Install deps outside pyproject + if: ${{ startsWith(inputs.working-directory, 'libs/community/') }} + shell: bash + run: poetry run pip install "boto3<2" "google-cloud-aiplatform<2" + + - name: Run extended tests + shell: bash + run: | + make test + + - name: Ensure the tests did not create any additional files + shell: bash + run: | + set -eu + + STATUS="$(git status)" + echo "$STATUS" + + # grep will exit non-zero if the target message isn't found, + # and `set -e` above will cause the step to fail. + echo "$STATUS" | grep 'nothing to commit, working tree clean' From 47065413073ad9960d94d152ef686a470ea09e0b Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Sep 2024 18:53:29 -0400 Subject: [PATCH 09/14] Revert "move some pdf integration tests to extended tests" This reverts commit 48ca84dcfa46afd1eba69c073405924fc5da65df. --- .../document_loaders/test_pdf.py | 33 +++++++++++++ .../unit_tests/document_loaders/test_pdf.py | 46 ------------------- 2 files changed, 33 insertions(+), 46 deletions(-) delete mode 100644 libs/community/tests/unit_tests/document_loaders/test_pdf.py diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py index 50c9fde29d918..462e20d357904 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py @@ -1,3 +1,4 @@ +import re from pathlib import Path from typing import Sequence, Union @@ -10,6 +11,7 @@ PDFMinerPDFasHTMLLoader, PyMuPDFLoader, PyPDFium2Loader, + PyPDFLoader, UnstructuredPDFLoader, ) @@ -84,6 +86,37 @@ def test_pdfminer_pdf_as_html_loader() -> None: assert len(docs) == 1 +def test_pypdf_loader() -> None: + """Test PyPDFLoader.""" + file_path = Path(__file__).parent.parent / "examples/hello.pdf" + loader = PyPDFLoader(str(file_path)) + docs = loader.load() + + assert len(docs) == 1 + + file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" + loader = PyPDFLoader(str(file_path)) + + docs = loader.load() + assert len(docs) == 16 + + +def test_pypdf_loader_with_layout() -> None: + """Test PyPDFLoader with layout mode.""" + file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" + loader = PyPDFLoader(str(file_path), extraction_mode="layout") + + docs = loader.load() + first_page = docs[0].page_content + + expected = ( + Path(__file__).parent.parent / "examples/layout-parser-paper-page-1.txt" + ).read_text(encoding="utf-8") + cleaned_first_page = re.sub(r"\x00", "", first_page) + cleaned_expected = re.sub(r"\x00", "", expected) + assert cleaned_first_page == cleaned_expected + + def test_pypdfium2_loader() -> None: """Test PyPDFium2Loader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" diff --git a/libs/community/tests/unit_tests/document_loaders/test_pdf.py b/libs/community/tests/unit_tests/document_loaders/test_pdf.py deleted file mode 100644 index d62363723bd60..0000000000000 --- a/libs/community/tests/unit_tests/document_loaders/test_pdf.py +++ /dev/null @@ -1,46 +0,0 @@ -import re -from pathlib import Path - -import pytest - -from langchain_community.document_loaders import PyPDFLoader - -path_to_simple_pdf = ( - Path(__file__).parent.parent.parent / "integration_tests/examples/hello.pdf" -) -path_to_layout_pdf = ( - Path(__file__).parent.parent - / "document_loaders/sample_documents/layout-parser-paper.pdf" -) -path_to_layout_pdf_txt = ( - Path(__file__).parent.parent.parent - / "integration_tests/examples/layout-parser-paper-page-1.txt" -) - - -@pytest.mark.requires("pypdf") -def test_pypdf_loader() -> None: - """Test PyPDFLoader.""" - loader = PyPDFLoader(str(path_to_simple_pdf)) - docs = loader.load() - - assert len(docs) == 1 - - loader = PyPDFLoader(str(path_to_layout_pdf)) - - docs = loader.load() - assert len(docs) == 16 - - -@pytest.mark.requires("pypdf") -def test_pypdf_loader_with_layout() -> None: - """Test PyPDFLoader with layout mode.""" - loader = PyPDFLoader(str(path_to_layout_pdf), extraction_mode="layout") - - docs = loader.load() - first_page = docs[0].page_content - - expected = path_to_layout_pdf_txt.read_text(encoding="utf-8") - cleaned_first_page = re.sub(r"\x00", "", first_page) - cleaned_expected = re.sub(r"\x00", "", expected) - assert cleaned_first_page == cleaned_expected From 22bee8147ff9f2a1404d0a279cd0233889a06230 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Sep 2024 19:01:08 -0400 Subject: [PATCH 10/14] add pytest marker --- libs/community/Makefile | 2 +- libs/community/pyproject.toml | 1 + .../tests/integration_tests/document_loaders/test_pdf.py | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/libs/community/Makefile b/libs/community/Makefile index 55b63f009b519..439cc23be821f 100644 --- a/libs/community/Makefile +++ b/libs/community/Makefile @@ -19,7 +19,7 @@ test tests: poetry run pytest --disable-socket --allow-unix-socket $(TEST_FILE) integration_tests: - poetry run pytest $(TEST_FILE) + poetry run pytest -m runs $(TEST_FILE) test_watch: poetry run ptw --disable-socket --allow-unix-socket --snapshot-update --now . -- -vv tests/unit_tests diff --git a/libs/community/pyproject.toml b/libs/community/pyproject.toml index e0968b5b89b96..1447ffd2a85cc 100644 --- a/libs/community/pyproject.toml +++ b/libs/community/pyproject.toml @@ -63,6 +63,7 @@ addopts = "--strict-markers --strict-config --durations=5 --snapshot-warn-unused markers = [ "requires: mark tests as requiring a specific library", "scheduled: mark tests to run in scheduled testing", + "runs: mark tests to run in CI", "compile: mark placeholder test used to compile integration tests without running them", ] asyncio_mode = "auto" diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py index 462e20d357904..bdde35d299810 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py @@ -86,6 +86,8 @@ def test_pdfminer_pdf_as_html_loader() -> None: assert len(docs) == 1 +@pytest.mark.requires("pypdf") +@pytest.mark.runs def test_pypdf_loader() -> None: """Test PyPDFLoader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" @@ -101,6 +103,8 @@ def test_pypdf_loader() -> None: assert len(docs) == 16 +@pytest.mark.requires("pypdf") +@pytest.mark.runs def test_pypdf_loader_with_layout() -> None: """Test PyPDFLoader with layout mode.""" file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" From 5dfc2a89f7ab6f07ed3922f08bcf7e718c63bc94 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Sep 2024 19:01:36 -0400 Subject: [PATCH 11/14] temporarily hijack integration test job to test workflow dispatch off branch --- .github/workflows/_integration_test.yml | 64 +++++++------------------ 1 file changed, 18 insertions(+), 46 deletions(-) diff --git a/.github/workflows/_integration_test.yml b/.github/workflows/_integration_test.yml index 40b0ec5344edd..56035db65eeea 100644 --- a/.github/workflows/_integration_test.yml +++ b/.github/workflows/_integration_test.yml @@ -1,4 +1,4 @@ -name: Integration tests +name: Extended tests on: workflow_dispatch: @@ -6,10 +6,20 @@ on: working-directory: required: true type: string + default: "libs/community" python-version: required: true type: string description: "Python version to use" + default: "3.11" + extended-deps-file: + required: true + type: choice + description: "File to install extended dependencies from" + options: + - extended_testing_deps.txt + - pdf_loader_deps.txt + - other_deps.txt env: POETRY_VERSION: "1.7.1" @@ -32,60 +42,22 @@ jobs: working-directory: ${{ inputs.working-directory }} cache-key: core - - name: Install dependencies + - name: Install extended dependencies shell: bash - run: poetry install --with test,test_integration + run: | + poetry install --with test + poetry run pip install uv + poetry run uv pip install -r extended_dependencies/${{ inputs.extended-deps-file }} - name: Install deps outside pyproject if: ${{ startsWith(inputs.working-directory, 'libs/community/') }} shell: bash run: poetry run pip install "boto3<2" "google-cloud-aiplatform<2" - - name: 'Authenticate to Google Cloud' - id: 'auth' - uses: google-github-actions/auth@v2 - with: - credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}' - - - name: Run integration tests + - name: Run extended tests shell: bash - env: - AI21_API_KEY: ${{ secrets.AI21_API_KEY }} - FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }} - GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - AZURE_OPENAI_API_VERSION: ${{ secrets.AZURE_OPENAI_API_VERSION }} - AZURE_OPENAI_API_BASE: ${{ secrets.AZURE_OPENAI_API_BASE }} - AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} - AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ secrets.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }} - AZURE_OPENAI_LLM_DEPLOYMENT_NAME: ${{ secrets.AZURE_OPENAI_LLM_DEPLOYMENT_NAME }} - AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME: ${{ secrets.AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME }} - MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }} - TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} - NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }} - GOOGLE_SEARCH_API_KEY: ${{ secrets.GOOGLE_SEARCH_API_KEY }} - GOOGLE_CSE_ID: ${{ secrets.GOOGLE_CSE_ID }} - HUGGINGFACEHUB_API_TOKEN: ${{ secrets.HUGGINGFACEHUB_API_TOKEN }} - EXA_API_KEY: ${{ secrets.EXA_API_KEY }} - NOMIC_API_KEY: ${{ secrets.NOMIC_API_KEY }} - WATSONX_APIKEY: ${{ secrets.WATSONX_APIKEY }} - WATSONX_PROJECT_ID: ${{ secrets.WATSONX_PROJECT_ID }} - PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} - PINECONE_ENVIRONMENT: ${{ secrets.PINECONE_ENVIRONMENT }} - ASTRA_DB_API_ENDPOINT: ${{ secrets.ASTRA_DB_API_ENDPOINT }} - ASTRA_DB_APPLICATION_TOKEN: ${{ secrets.ASTRA_DB_APPLICATION_TOKEN }} - ASTRA_DB_KEYSPACE: ${{ secrets.ASTRA_DB_KEYSPACE }} - ES_URL: ${{ secrets.ES_URL }} - ES_CLOUD_ID: ${{ secrets.ES_CLOUD_ID }} - ES_API_KEY: ${{ secrets.ES_API_KEY }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # for airbyte - MONGODB_ATLAS_URI: ${{ secrets.MONGODB_ATLAS_URI }} - VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }} - COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }} - UPSTAGE_API_KEY: ${{ secrets.UPSTAGE_API_KEY }} run: | + make test make integration_tests - name: Ensure the tests did not create any additional files From 0dfe63af2a674fb52c757fa0b92a5c4722ae5b43 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Sep 2024 19:18:11 -0400 Subject: [PATCH 12/14] update --- .github/workflows/_extended_test.yml | 3 +- .github/workflows/_integration_test.yml | 2 +- .../tests/integration_tests/conftest.py | 87 ++++++++++++++++++- .../document_loaders/test_pdf.py | 4 +- 4 files changed, 91 insertions(+), 5 deletions(-) diff --git a/.github/workflows/_extended_test.yml b/.github/workflows/_extended_test.yml index fa010f3d54ee6..2a002dcfb6c37 100644 --- a/.github/workflows/_extended_test.yml +++ b/.github/workflows/_extended_test.yml @@ -45,7 +45,7 @@ jobs: - name: Install extended dependencies shell: bash run: | - poetry install --with test + poetry install --with test,test_integration poetry run pip install uv poetry run uv pip install -r extended_dependencies/${{ inputs.extended-deps-file }} @@ -58,6 +58,7 @@ jobs: shell: bash run: | make test + make integration_tests - name: Ensure the tests did not create any additional files shell: bash diff --git a/.github/workflows/_integration_test.yml b/.github/workflows/_integration_test.yml index 56035db65eeea..2a002dcfb6c37 100644 --- a/.github/workflows/_integration_test.yml +++ b/.github/workflows/_integration_test.yml @@ -45,7 +45,7 @@ jobs: - name: Install extended dependencies shell: bash run: | - poetry install --with test + poetry install --with test,test_integration poetry run pip install uv poetry run uv pip install -r extended_dependencies/${{ inputs.extended-deps-file }} diff --git a/libs/community/tests/integration_tests/conftest.py b/libs/community/tests/integration_tests/conftest.py index 02b518e8695a2..477b3c518559f 100644 --- a/libs/community/tests/integration_tests/conftest.py +++ b/libs/community/tests/integration_tests/conftest.py @@ -1,6 +1,11 @@ -# Getting the absolute path of the current file's directory +from importlib import util import os +from typing import Dict, Sequence + +import pytest +from pytest import Config, Function, Parser +# Getting the absolute path of the current file's directory ABS_PATH = os.path.dirname(os.path.abspath(__file__)) # Getting the absolute path of the project's root directory @@ -17,3 +22,83 @@ def _load_env() -> None: _load_env() + +def pytest_addoption(parser: Parser) -> None: + """Add custom command line options to pytest.""" + parser.addoption( + "--only-extended", + action="store_true", + help="Only run extended tests. Does not allow skipping any extended tests.", + ) + parser.addoption( + "--only-core", + action="store_true", + help="Only run core tests. Never runs any extended tests.", + ) + + +def pytest_collection_modifyitems(config: Config, items: Sequence[Function]) -> None: + """Add implementations for handling custom markers. + + At the moment, this adds support for a custom `requires` marker. + + The `requires` marker is used to denote tests that require one or more packages + to be installed to run. If the package is not installed, the test is skipped. + + The `requires` marker syntax is: + + .. code-block:: python + + @pytest.mark.requires("package1", "package2") + def test_something(): + ... + """ + # Mapping from the name of a package to whether it is installed or not. + # Used to avoid repeated calls to `util.find_spec` + required_pkgs_info: Dict[str, bool] = {} + + only_extended = config.getoption("--only-extended") or False + only_core = config.getoption("--only-core") or False + + if only_extended and only_core: + raise ValueError("Cannot specify both `--only-extended` and `--only-core`.") + + for item in items: + requires_marker = item.get_closest_marker("requires") + if requires_marker is not None: + if only_core: + item.add_marker(pytest.mark.skip(reason="Skipping not a core test.")) + continue + + # Iterate through the list of required packages + required_pkgs = requires_marker.args + for pkg in required_pkgs: + # If we haven't yet checked whether the pkg is installed + # let's check it and store the result. + if pkg not in required_pkgs_info: + try: + installed = util.find_spec(pkg) is not None + except Exception: + installed = False + required_pkgs_info[pkg] = installed + + if not required_pkgs_info[pkg]: + if only_extended: + pytest.fail( + f"Package `{pkg}` is not installed but is required for " + f"extended tests. Please install the given package and " + f"try again.", + ) + + else: + # If the package is not installed, we immediately break + # and mark the test as skipped. + item.add_marker( + pytest.mark.skip(reason=f"Requires pkg: `{pkg}`") + ) + break + else: + if only_extended: + item.add_marker( + pytest.mark.skip(reason="Skipping not an extended test.") + ) diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py index bdde35d299810..e19a084a871df 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py @@ -86,8 +86,8 @@ def test_pdfminer_pdf_as_html_loader() -> None: assert len(docs) == 1 -@pytest.mark.requires("pypdf") @pytest.mark.runs +@pytest.mark.requires("pypdf") def test_pypdf_loader() -> None: """Test PyPDFLoader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" @@ -103,8 +103,8 @@ def test_pypdf_loader() -> None: assert len(docs) == 16 -@pytest.mark.requires("pypdf") @pytest.mark.runs +@pytest.mark.requires("pypdf") def test_pypdf_loader_with_layout() -> None: """Test PyPDFLoader with layout mode.""" file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" From 9786f011068d9e9e520f125f0d6a67240bf976de Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Sep 2024 19:19:22 -0400 Subject: [PATCH 13/14] add comment --- .github/workflows/_integration_test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/_integration_test.yml b/.github/workflows/_integration_test.yml index 2a002dcfb6c37..59607dfe450df 100644 --- a/.github/workflows/_integration_test.yml +++ b/.github/workflows/_integration_test.yml @@ -1,3 +1,6 @@ +# Ignore changes to this file. Hijacking just to allow +# testing of workflow dispatch on new workflow off of branch. + name: Extended tests on: From fdbd9f6eba86849ec8d97703460d8bc61d481665 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Sep 2024 19:20:32 -0400 Subject: [PATCH 14/14] fix --- .github/workflows/_integration_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_integration_test.yml b/.github/workflows/_integration_test.yml index 59607dfe450df..5717f25ad7ffa 100644 --- a/.github/workflows/_integration_test.yml +++ b/.github/workflows/_integration_test.yml @@ -1,7 +1,7 @@ # Ignore changes to this file. Hijacking just to allow # testing of workflow dispatch on new workflow off of branch. -name: Extended tests +name: Integration tests on: workflow_dispatch: