Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

INTPYTHON-416 Make AI/ML testing framework runnable locally #52

Merged
merged 10 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 9 additions & 45 deletions .evergreen/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,40 +37,29 @@ functions:
args: [.evergreen/fetch-secrets.sh]

"fetch repo":
- command: shell.exec
- command: subprocess.exec
type: setup
params:
include_expansions_in_env: [DIR]
working_dir: "src"
script: |
if [ ! -d "${DIR}" ]; then
echo '${REPO_NAME} could not be found' 1>&2
exit 1
fi
# Apply patches to upstream repo if desired.
cd ${DIR}
git clone ${CLONE_URL}
if [ -d "patches" ]; then
cd ${REPO_NAME}
echo "Applying patches."
git apply ../patches/*
fi
binary: bash
args: [.evergreen/fetch-repo.sh]

"execute tests":
- command: subprocess.exec
type: test
params:
add_expansions_to_env: true
working_dir: "src/${DIR}/${REPO_NAME}"
include_expansions_in_env: [DIR]
working_dir: "src"
binary: bash
args:
- ../run.sh
args: [.evergreen/execute-tests.sh]

"setup local atlas":
- command: subprocess.exec
type: setup
retry_on_failure: true
params:
add_expansions_to_env: true
include_expansions_in_env: [DIR]
working_dir: "src"
binary: bash
args:
Expand All @@ -80,7 +69,7 @@ functions:
- command: subprocess.exec
type: setup
params:
add_expansions_to_env: true
include_expansions_in_env: [DIR]
working_dir: "src"
binary: bash
args: [.evergreen/setup-remote.sh]
Expand Down Expand Up @@ -194,10 +183,6 @@ buildvariants:
display_name: LlamaIndex RHEL KV Store
expansions:
DIR: llama-index-python-kvstore
REPO_NAME: llama_index
# TODO - Update CLONE_URL: [PYTHON-4522] [INTPYTHON-326]
CLONE_URL: -b PYTHON-4522 --single-branch https://github.com/shruti-sridhar/llama_index.git
DATABASE: llama_index_test_db
run_on:
- rhel87-small
tasks:
Expand All @@ -209,9 +194,6 @@ buildvariants:
display_name: Semantic-Kernel RHEL Python
expansions:
DIR: semantic-kernel-python
REPO_NAME: semantic-kernel
CLONE_URL: https://github.com/microsoft/semantic-kernel.git
DATABASE: pyMSKTest
run_on:
- rhel87-small
tasks:
Expand All @@ -224,9 +206,6 @@ buildvariants:
display_name: Semantic-Kernel RHEL CSharp
expansions:
DIR: semantic-kernel-csharp
REPO_NAME: semantic-kernel
CLONE_URL: https://github.com/microsoft/semantic-kernel.git
DATABASE: dotnetMSKNearestTest
run_on:
- rhel87-small
tasks:
Expand All @@ -238,9 +217,6 @@ buildvariants:
display_name: Langchain RHEL Python
expansions:
DIR: langchain-python
REPO_NAME: langchain-mongodb
CLONE_URL: https://github.com/langchain-ai/langchain-mongodb.git
DATABASE: langchain_test_db
run_on:
- rhel87-small
tasks:
Expand All @@ -252,9 +228,6 @@ buildvariants:
display_name: Langgraph RHEL Python
expansions:
DIR: langgraph-python
REPO_NAME: langchain-mongodb
CLONE_URL: https://github.com/langchain-ai/langchain-mongodb.git
DATABASE: langgraph-test
run_on:
- rhel87-small
tasks:
Expand All @@ -266,9 +239,6 @@ buildvariants:
display_name: ChatGPT Retrieval Plugin
expansions:
DIR: chatgpt-retrieval-plugin
REPO_NAME: chatgpt-retrieval-plugin
CLONE_URL: https://github.com/openai/chatgpt-retrieval-plugin.git
DATABASE: chatgpt_retrieval_plugin_test_db
run_on:
- rhel87-small
tasks:
Expand All @@ -280,9 +250,6 @@ buildvariants:
display_name: LlamaIndex RHEL Vector Store
expansions:
DIR: llama-index-python-vectorstore
REPO_NAME: llama_index
CLONE_URL: https://github.com/run-llama/llama_index.git
DATABASE: llama_index_test_db
run_on:
- rhel87-small
tasks:
Expand All @@ -295,9 +262,6 @@ buildvariants:
display_name: DocArray RHEL
expansions:
DIR: docarray
REPO_NAME: docarray
CLONE_URL: https://github.com/docarray/docarray.git
DATABASE: docarray_test_db
run_on:
- rhel87-small
tasks:
Expand Down
16 changes: 16 additions & 0 deletions .evergreen/execute-tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
#
# Runs the test suite of the integration selected by $DIR.
#
# Required environment:
#   DIR -- name of the integration subdirectory (relative to the repo root)
#          that holds config.env and run.sh.
#
# The integration's config.env is sourced with allexport enabled so that
# REPO_NAME and the other values it defines are exported to run.sh, which is
# executed from inside the cloned upstream repository.

set -eu

if [ -z "${DIR:-}" ]; then
  echo "DIR must be set to an integration subdirectory" 1>&2
  exit 1
fi

# Resolve paths from this script's location so the caller's working
# directory does not matter.
SCRIPT_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
ROOT_DIR=$(dirname "$SCRIPT_DIR")

# Source the configuration.
cd "${ROOT_DIR}/${DIR}"
set -a
# Use an explicit ./ so bash does not look config.env up on PATH first.
source ./config.env
set +a

cd "${REPO_NAME}"
bash "${ROOT_DIR}/${DIR}/run.sh"
25 changes: 25 additions & 0 deletions .evergreen/fetch-repo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
#
# Clones the upstream repository for the integration selected by $DIR and
# applies any patches found in the integration's patches/ directory.
#
# Required environment:
#   DIR -- path to the integration subdirectory, relative to the current
#          working directory.
# Read from ${DIR}/config.env:
#   REPO_NAME -- directory the clone is expected to create
#   CLONE_URL -- arguments handed to `git clone`

set -eu

if [ ! -d "${DIR}" ]; then
  # Report the directory that is actually missing; REPO_NAME is not known
  # until config.env has been sourced below.
  echo "${DIR} could not be found" 1>&2
  exit 1
fi

cd "${DIR}"

# Source the configuration.
set -a
# Use an explicit ./ so bash does not look config.env up on PATH first.
source ./config.env
set +a

# Remove any previous checkout so the clone does not fail on a re-run.
# The :? guard aborts instead of running rm with an empty path.
rm -rf "${REPO_NAME:?REPO_NAME must be set in config.env}"

# CLONE_URL is intentionally left unquoted: it may contain git options in
# addition to the URL (e.g. "-b <branch> --single-branch <url>").
# shellcheck disable=SC2086
git clone ${CLONE_URL}

# Apply patches to upstream repo if desired.
if [ -d "patches" ]; then
  cd "${REPO_NAME}"
  echo "Applying patches."
  git apply ../patches/*
fi
2 changes: 1 addition & 1 deletion .evergreen/fetch-secrets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
set -eu

# Clone drivers-evergreen-tools.
git clone https://github.com/mongodb-labs/drivers-evergreen-tools
git clone https://github.com/mongodb-labs/drivers-evergreen-tools || true

# Get the secrets for drivers/ai-ml-pipeline-testing.
. drivers-evergreen-tools/.evergreen/secrets_handling/setup-secrets.sh drivers/ai-ml-pipeline-testing
7 changes: 7 additions & 0 deletions .evergreen/provision-atlas.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@ set -eu

. .evergreen/utils.sh

# Source the config, exporting every variable it defines.
pushd "$DIR"
set -a
. ./config.env
# Turn allexport back off; `set +x` would only disable command tracing and
# leave every later assignment in this script implicitly exported.
set +a
popd

setup_local_atlas
scaffold_atlas

Expand Down
7 changes: 7 additions & 0 deletions .evergreen/setup-remote.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@ if [ -z "${DIR:-}" ]; then
exit 1
fi

# Source the config, exporting every variable it defines.
pushd "$DIR"
set -a
. ./config.env
# Turn allexport back off; `set +x` would only disable command tracing and
# leave every later assignment in this script implicitly exported.
set +a
popd

# Get the correct remote URI.
case $DIR in
llama-index-python-kvstore)
Expand Down
13 changes: 6 additions & 7 deletions .evergreen/utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ setup_local_atlas() {
IMAGE=artifactory.corp.mongodb.com/dockerhub/mongodb/mongodb-atlas-local:latest
retry podman pull $IMAGE

CONTAINER_ID=$(podman run --rm -d -e DO_NOT_TRACK=1 -P --health-cmd "/usr/local/bin/runner healthcheck" mongodb/mongodb-atlas-local:latest)
CONTAINER_ID=$(podman run --rm -d -e DO_NOT_TRACK=1 -P --health-cmd "/usr/local/bin/runner healthcheck" $IMAGE)
Jibola marked this conversation as resolved.
Show resolved Hide resolved

echo "waiting for container to become healthy..."
function wait() {
Expand Down Expand Up @@ -104,13 +104,13 @@ setup_local_atlas() {
wait "$CONTAINER_ID"
EXPOSED_PORT=$(podman inspect --format='{{ (index (index .NetworkSettings.Ports "27017/tcp") 0).HostPort }}' "$CONTAINER_ID")
export CONN_STRING="mongodb://127.0.0.1:$EXPOSED_PORT/?directConnection=true"
# shellcheck disable=SC2154
echo "CONN_STRING=mongodb://127.0.0.1:$EXPOSED_PORT/?directConnection=true" > $workdir/src/.evergreen/.local_atlas_uri
SCRIPT_DIR=$(realpath "$(dirname ${BASH_SOURCE[0]})")
echo "CONN_STRING=mongodb://127.0.0.1:$EXPOSED_PORT/?directConnection=true" > $SCRIPT_DIR/.local_atlas_uri
}

fetch_local_atlas_uri() {
# shellcheck disable=SC2154
. $workdir/src/.evergreen/.local_atlas_uri
SCRIPT_DIR=$(realpath "$(dirname ${BASH_SOURCE[0]})")
. $SCRIPT_DIR/.local_atlas_uri

export CONN_STRING=$CONN_STRING
echo "$CONN_STRING"
Expand All @@ -120,8 +120,7 @@ fetch_local_atlas_uri() {
scaffold_atlas() {
PYTHON_BINARY=$(find_python3)

# Should be called from src
EVERGREEN_PATH=$(pwd)/.evergreen
EVERGREEN_PATH=$(realpath "$(dirname ${BASH_SOURCE[0]})")
TARGET_DIR=$(pwd)/$DIR
SCAFFOLD_SCRIPT=$EVERGREEN_PATH/scaffold_atlas.py

Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ xunit-results/
# Miscellaneous
.DS_Store
drivers-evergreen-tools
atlas
.evergreen/.local_atlas_uri

# Secrets
secrets-export.sh
Expand Down
27 changes: 23 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ Each subdirectory is scoped to run only one AI/ML integration's suite of tests f
Within each subdirectory you should expect to have:

- `run.sh` -- A script that should handle any additional library installations and steps for executing the test suite. This script should not populate the Atlas database with any required test data.
- `config.env` -- A file that defines the following environment variables:
- `REPO_NAME` -- The name of the AI/ML framework repository that will get cloned
- `CLONE_URL` -- The GitHub URL to clone into the specified `DIR`
- `DATABASE` -- The optional database where the Atlas CLI will load your index configs
- `database/` -- An optional directory used by `.evergreen/scaffold_atlas.py` to populate a MongoDB database with test data. Only provide this if your tests require pre-populated data.
- `database/{collection}.json` -- An optional JSON file containing one or more MongoDB documents that will be uploaded to `$DATABASE.{collection}` in the local Atlas instance. Only provide this if your tests require pre-populated data.
- `indexConfig.json` -- An optional file containing configuration for a specified Atlas Search Index.
Expand All @@ -40,12 +44,15 @@ The general layout of this repo looks like this:
│ │ └── furthestSearch.json # Populates $DATABASE.furthestSearch
│ ├── indexes # Optional Index definitions directory
│ │ └── indexConfig.json # Optional Search index definition
│ ├── config.env # Configuration file
│ └── run.sh # Script that executes test
|
├── semantic-kernel-python # Folder scoped for one Integration
│ ├── database # Optional database definition
│ │ └── nearestSearch.json # Populates $DATABASE.nearestSearch
│ │ └── furthestSearch.json # Populates $DATABASE.furthestSearch
│ ├── indexConfig.json # Creates Search Index on $DATABASE
│ ├── config.env # Configuration file
│ └── run.sh # Script that executes test
```

Expand All @@ -54,13 +61,28 @@ The general layout of this repo looks like this:
Each test subdirectory will automatically have its own local Atlas deployment. As a result, database and collection names will not conflict between different AI/ML integrations. To connect to your local Atlas using a connection string, `utils.sh` has a `fetch_local_atlas_uri` that you can call from the `run.sh` script within your subdirectory. For example:

```bash
. $workdir/src/.evergreen/utils.sh
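SCRIPT_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
ROOT_DIR=$(dirname "$SCRIPT_DIR")
. "$ROOT_DIR/.evergreen/utils.sh"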

CONN_STRING=$(fetch_local_atlas_uri)
```

This stores the local Atlas URI in the `CONN_STRING` variable. The script can then pass `CONN_STRING` as an environment variable to the test suite.

#### Running tests locally

We can run the tests with a local checkout of the repo.

For example, to run the `docarray` tests using local atlas:

```bash
export DIR=docarray
bash .evergreen/fetch-repo.sh
bash .evergreen/provision-atlas.sh
bash .evergreen/execute-tests.sh
```

Use `.evergreen/setup-remote.sh` instead of `.evergreen/provision-atlas.sh` to test against the remote cluster.
Jibola marked this conversation as resolved.
Show resolved Hide resolved

#### Pre-populating the Local Atlas Deployment

You can pre-populate a test's local Atlas deployment before running the `run.sh` script by providing JSON files in the optional `database` directory of the created subdirectory. The `.evergreen/scaffold_atlas.py` file will search for every JSON file within this database directory and upload the documents to the database named by the `DATABASE` variable defined in the subdirectory's `config.env`. The collection the script uploads to is based on the name of your JSON file:
Expand All @@ -82,9 +104,6 @@ Test execution flow is defined in `.evergreen/config.yml`. The test pipeline's c
- [`expansions`](https://docs.devprod.prod.corp.mongodb.com/evergreen/Project-Configuration/Project-Configuration-Files/#expansions) -- Build variant specific variables. Expansions that need to be maintained as secrets should be stored in [the Evergreen project settings](https://spruce.mongodb.com/project/ai-ml-pipeline-testing/settings/variables) using [variables](https://docs.devprod.prod.corp.mongodb.com/evergreen/Project-Configuration/Project-and-Distro-Settings#variables). Some common expansions needed are:

- `DIR` -- The subdirectory where the tasks will run
- `REPO_NAME` -- The name of the AI/ML framework repository that will get cloned
- `CLONE_URL` -- The Github URL to clone into the specified `DIR`
- `DATABASE` -- The optional database where the Atlas CLI will load your index configs

- `run_on` -- Specified platform to run on. `rhel87-small` should be used by default. Any other distro may fail Atlas CLI setup.
- `tasks` -- Tasks to run. See below for more details
Expand Down
3 changes: 3 additions & 0 deletions chatgpt-retrieval-plugin/config.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Settings for the chatgpt-retrieval-plugin integration; sourced by the .evergreen scripts.
# Name of the AI/ML framework repository that gets cloned.
REPO_NAME=chatgpt-retrieval-plugin
# Passed to `git clone` by .evergreen/fetch-repo.sh.
CLONE_URL="https://github.com/openai/chatgpt-retrieval-plugin.git"
# Optional database where the Atlas CLI loads the index configs.
DATABASE=chatgpt_retrieval_plugin_test_db
10 changes: 5 additions & 5 deletions chatgpt-retrieval-plugin/run.sh
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
#!/bin/sh
#!/bin/bash

# chat-gpt-retrieval-plugin is a poetry run project

set -eu

# Get the MONGODB_URI and OPENAI_API_KEY.
# shellcheck disable=SC2154
. $workdir/src/env.sh
SCRIPT_DIR=$(realpath "$(dirname ${BASH_SOURCE[0]})")
ROOT_DIR=$(dirname $SCRIPT_DIR)
. $ROOT_DIR/env.sh

# shellcheck disable=SC2154
. $workdir/src/.evergreen/utils.sh
. $ROOT_DIR/.evergreen/utils.sh

PYTHON_BINARY=$(find_python3)
$PYTHON_BINARY -c "import sys; print(f'Python version found: {sys.version_info}')"
Expand Down
3 changes: 3 additions & 0 deletions docarray/config.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Settings for the docarray integration; sourced by the .evergreen scripts.
# Name of the AI/ML framework repository that gets cloned.
REPO_NAME=docarray
# Passed to `git clone` by .evergreen/fetch-repo.sh.
CLONE_URL="https://github.com/docarray/docarray.git"
# Optional database where the Atlas CLI loads the index configs.
DATABASE=docarray_test_db
12 changes: 7 additions & 5 deletions docarray/run.sh
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
#!/bin/sh
#!/bin/bash

# Sets up a virtual environment (poetry)
# Runs the mongodb tests of the upstream repo

set -eu

# Get the MONGODB_URI.
# shellcheck disable=SC2154
. $workdir/src/env.sh
SCRIPT_DIR=$(realpath "$(dirname ${BASH_SOURCE[0]})")
ROOT_DIR=$(dirname $SCRIPT_DIR)

. $ROOT_DIR/env.sh

. $ROOT_DIR/.evergreen/utils.sh

# shellcheck disable=SC2154
. $workdir/src/.evergreen/utils.sh
PYTHON_BINARY=$(find_python3)
$PYTHON_BINARY -c "import sys; print(f'Python version found: {sys.version_info}')"

Expand Down
3 changes: 3 additions & 0 deletions langchain-python/config.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Settings for the langchain-python integration; sourced by the .evergreen scripts.
# Name of the AI/ML framework repository that gets cloned.
REPO_NAME=langchain-mongodb
# Passed to `git clone` by .evergreen/fetch-repo.sh.
CLONE_URL="https://github.com/langchain-ai/langchain-mongodb.git"
# Optional database where the Atlas CLI loads the index configs.
DATABASE=langchain_test_db
Loading
Loading