diff --git a/ci/Jenkinsfile b/ci/Jenkinsfile
new file mode 100644
index 0000000..56d3551
--- /dev/null
+++ b/ci/Jenkinsfile
@@ -0,0 +1,121 @@
+// CI pipeline: clean checkout, virtualenv, config rewrite, lint, type check, tests.
+pipeline {
+    agent any
+
+    options{
+        skipDefaultCheckout(true)
+    }
+    environment {
+        // NOTE(review): used below as the withPythonEnv argument, but exporting it
+        // under this name also feeds Python's module search path in every sh step.
+        PYTHONPATH = "${env.WORKSPACE}/.venv/bin"
+    }
+
+
+    stages {
+
+        stage('Checkout') {
+            steps {
+                cleanWs()
+                checkout scm
+            }
+        }
+
+        stage('Create venv'){
+            steps {
+                sh 'python3 -m venv .venv'
+            }
+        }
+
+        stage('Install dependencies'){
+            steps {
+                withPythonEnv(PYTHONPATH){
+                    sh "pip install -e ."
+                }
+            }
+
+        }
+
+        stage('Config'){
+            steps{
+                // Single-quoted, so the shell (not Groovy) expands it; '$env.JENKINS_HOME' is $env plus literal text.
+                sh 'echo $JENKINS_HOME'
+                withPythonEnv(PYTHONPATH){
+                    sh 'python3 ci/modify_config.py'
+                    sh 'rm -rf $JENKINS_HOME/ci_test_data/data/vectordb/ci_test'
+                    sh 'cp -r $JENKINS_HOME/ci_test_data/data/backup_vectordb/ci_test $JENKINS_HOME/ci_test_data/data/vectordb'
+                }
+            }
+        }
+
+        stage('Linting'){
+            steps {
+                withPythonEnv(PYTHONPATH){
+                    sh 'pip install ruff'
+                    catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE'){
+                        sh 'ruff check . --exclude .pyenv-var-lib-jenkins-workspace-capstone_5-.venv-bin --output-format junit -o ruff-report.xml'
+                        sh 'ruff format .'
+                    }
+                }
+            }
+            post {
+                always{
+                    withChecks('Lint Checks'){
+                        junit 'ruff-report.xml'
+                    }
+                }
+            }
+        }
+
+        stage('Static type check'){
+            steps {
+                withPythonEnv(PYTHONPATH){
+                    sh 'pip install mypy'
+                    catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE'){
+                        sh 'python3 -m mypy -p src.grag --junit-xml mypy-report.xml'
+                    }
+                }
+            }
+            post {
+                always{
+                    withChecks('Static Type Checks'){
+                        junit 'mypy-report.xml'
+                    }
+
+                }
+            }
+        }
+
+        stage('Tests'){
+            steps{
+                sh 'echo $USER'
+                sh 'docker pull chromadb/chroma'
+                sh 'docker run -d --name jenkins-chroma -p 8000:8000 chromadb/chroma'
+                withPythonEnv(PYTHONPATH){
+                    sh 'pip install pytest'
+                    sh 'python3 ci/unlock_deeplake.py'
+                    sh 'pytest src --junitxml=pytest-report.xml'
+                }
+            }
+            post {
+                always{
+                    withChecks('Integration Tests'){
+                        junit 'pytest-report.xml'
+                    }
+                    sh 'docker stop jenkins-chroma || true' // best-effort: a failing step here would skip cleanWs
+                    sh 'docker rm jenkins-chroma || true' // container may not exist if 'docker run' failed
+
+                    cleanWs(
+                        cleanWhenNotBuilt: false,
+                        deleteDirs: true,
+                        disableDeferredWipeout: true,
+                        notFailBuild: true,
+                        patterns: [[pattern: '.gitignore', type: 'INCLUDE'],
+                                   [pattern: '.propsfile', type: 'EXCLUDE']]
+                    )
+                }
+            }
+        }
+
+    }
+}
diff --git a/ci/modify_config.py b/ci/modify_config.py
new file mode 100644
index 0000000..759e88d
--- /dev/null
+++ b/ci/modify_config.py
@@ -0,0 +1,17 @@
+"""Rewrite src/config.ini so its paths point at the Jenkins workspace and CI data."""
+import configparser
+import os
+
+from grag.components.utils import get_config
+
+# os.environ raises KeyError outside Jenkins instead of writing 'None' into the paths.
+workspace = os.environ['WORKSPACE']
+jenkins_home = os.environ['JENKINS_HOME']
+
+config: configparser.ConfigParser = get_config()
+config['root']['root_path'] = workspace
+config['data']['data_path'] = f'{jenkins_home}/ci_test_data/data'
+config['llm']['base_dir'] = f'{jenkins_home}/ci_test_models/models'
+
+with open(f'{workspace}/src/config.ini', 'w') as configfile:
+    config.write(configfile)
diff --git a/ci/modify_config_test.py b/ci/modify_config_test.py
new file mode 100644
index 0000000..b93a40a
--- /dev/null
+++ b/ci/modify_config_test.py
@@ -0,0 +1,6 @@
+from grag.components.utils import get_config
+
+config = get_config()
+print(f"{config['root']['root_path']=}")
+print(f"{config['data']['data_path'] = }")
+print(f"{config['llm']['base_dir'] = }")
diff --git a/ci/unlock_deeplake.py b/ci/unlock_deeplake.py
new file mode 100644
index 0000000..d766941
--- /dev/null
+++ b/ci/unlock_deeplake.py
@@ -0,0 +1,19 @@
+"""Delete the Deep Lake lock of the ci_test dataset, if one exists."""
+import os
+import shutil
+from pathlib import Path
+
+# os.environ raises KeyError when JENKINS_HOME is unset; os.getenv returned None,
+# which made Path() fail with a less helpful TypeError.
+jenkins_home = os.environ['JENKINS_HOME']
+
+lock_path = Path(jenkins_home) / 'ci_test_data/data/vectordb/ci_test/dataset_lock.lock'
+
+if lock_path.exists():
+    # shutil.rmtree only removes directories and raises NotADirectoryError on a
+    # regular file, so pick the removal call that matches what is on disk.
+    if lock_path.is_dir():
+        shutil.rmtree(lock_path)
+    else:
+        lock_path.unlink()
+    print('Deleting lock file: {}'.format(lock_path))
diff --git a/cookbook/Basic-RAG/BasicRAG_ingest.py b/cookbook/Basic-RAG/BasicRAG_ingest.py
index e7b38a0..00e8f0b 100644
--- a/cookbook/Basic-RAG/BasicRAG_ingest.py
+++ b/cookbook/Basic-RAG/BasicRAG_ingest.py
@@ -5,9 +5,11 @@
 from grag.components.multivec_retriever import Retriever
 from grag.components.vectordb.deeplake_client import DeepLakeClient
 
-client = DeepLakeClient(collection_name="test")
-retriever = Retriever(vectordb=client)
+# from grag.components.vectordb.chroma_client import ChromaClient
 
-dir_path = Path(__file__).parents[2] / "data/client_test/test/"
+client = DeepLakeClient(collection_name="ci_test")
+# client = ChromaClient(collection_name="ci_test")
+retriever = Retriever(vectordb=client)
 
+dir_path = Path(__file__).parents[2] / "data/test/pdfs/new_papers"
 retriever.ingest(dir_path)
diff --git a/pyproject.toml b/pyproject.toml
index e27f822..9859013 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,7 +35,7 @@ dependencies = [
     "sentence-transformers==2.2.2",
     "instructorembedding>=1.0.1",
     "streamlit>=1.31.1",
-    "unstructured>=0.12.3",
+    "unstructured[pdf]>=0.12.3",
     "pdfplumber>=0.10.3",
     "llama-cpp-python>=0.2.43",
     "tqdm>=4.65.0",
@@ -102,7 +102,7 @@ exclude_lines = [
 [tool.ruff]
 line-length = 88
 indent-width = 4
-extend-exclude = ["tests", "others"]
+extend-exclude = ["tests", "others", "docs", "ci"]
 
 [tool.ruff.lint]
 select = ["E4", "E7", "E9",
"F", "I", "D"] diff --git a/src/grag/components/multivec_retriever.py b/src/grag/components/multivec_retriever.py index 66af4c6..41bff2d 100644 --- a/src/grag/components/multivec_retriever.py +++ b/src/grag/components/multivec_retriever.py @@ -75,7 +75,7 @@ def __init__( self.store = LocalFileStore(self.store_path) self.retriever = MultiVectorRetriever( vectorstore=self.vectordb.langchain_client, - docstore=self.store, # type: ignore + byte_store=self.store, # type: ignore id_key=self.id_key, ) self.splitter = TextSplitter() diff --git a/src/grag/components/vectordb/chroma_client.py b/src/grag/components/vectordb/chroma_client.py index d33e27c..09969ca 100644 --- a/src/grag/components/vectordb/chroma_client.py +++ b/src/grag/components/vectordb/chroma_client.py @@ -44,8 +44,8 @@ class ChromaClient(VectorDB): def __init__( self, - host: int = chroma_conf["host"], - port: int = chroma_conf["port"], + host: str = chroma_conf["host"], + port: str = chroma_conf["port"], collection_name: str = chroma_conf["collection_name"], embedding_type: str = chroma_conf["embedding_type"], embedding_model: str = chroma_conf["embedding_model"], @@ -69,7 +69,7 @@ def __init__( embedding_model=self.embedding_model, embedding_type=self.embedding_type ).embedding_function - self.client = chromadb.HttpClient(host=self.host, port=self.port) + self.client = chromadb.HttpClient(host=self.host, port=self.port) # type: ignore self.collection = self.client.get_or_create_collection( name=self.collection_name ) diff --git a/src/tests/components/multivec_retriever_test.py b/src/tests/components/multivec_retriever_test.py index f2544bf..3f847bd 100644 --- a/src/tests/components/multivec_retriever_test.py +++ b/src/tests/components/multivec_retriever_test.py @@ -1,9 +1,11 @@ import json from grag.components.multivec_retriever import Retriever +from grag.components.vectordb.deeplake_client import DeepLakeClient from langchain_core.documents import Document -retriever = Retriever() # pass test 
collection +client = DeepLakeClient(collection_name="ci_test") +retriever = Retriever(vectordb=client) # pass test collection doc = Document(page_content="Hello worlds", metadata={"source": "bars"}) diff --git a/src/tests/rag/basic_rag_test.py b/src/tests/rag/basic_rag_test.py index b8c2ceb..0b93643 100644 --- a/src/tests/rag/basic_rag_test.py +++ b/src/tests/rag/basic_rag_test.py @@ -4,7 +4,7 @@ from grag.components.vectordb.deeplake_client import DeepLakeClient from grag.rag.basic_rag import BasicRAG -client = DeepLakeClient(collection_name="test") +client = DeepLakeClient(collection_name="ci_test") retriever = Retriever(vectordb=client)