diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml
new file mode 100644
index 0000000..c1f2009
--- /dev/null
+++ b/.github/workflows/deploy-dev.yml
@@ -0,0 +1,85 @@
+name: Deploy to Dev Workspace
+
+on:
+  push:
+    branches:
+      - dev
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install jq and yq
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y jq
+          sudo wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq
+          sudo chmod +x /usr/bin/yq
+
+      - name: Create or Update Pipelines
+        env:
+          DEEPSET_CLOUD_API_KEY: ${{ secrets.DEEPSET_CLOUD_API_KEY }}
+          DEEPSET_CLOUD_WORKSPACE_NAME: "your-dev-workspace-name"
+        run: |
+          for pipeline_dir in pipelines/*/; do
+            pipeline_name=$(basename "$pipeline_dir")
+            indexing_yaml="${pipeline_dir}indexing.yaml"
+            query_yaml="${pipeline_dir}query.yaml"
+
+            if [[ ! -f "$indexing_yaml" || ! -f "$query_yaml" ]]; then
+              echo "Error: Both indexing.yaml and query.yaml must exist in $pipeline_dir"
+              exit 1
+            fi
+
+            indexing_content=$(yq eval -o=json "$indexing_yaml")
+            query_content=$(yq eval -o=json "$query_yaml")
+
+            payload=$(jq -n \
+              --arg name "$pipeline_name" \
+              --arg indexing "$indexing_content" \
+              --arg query "$query_content" \
+              '{
+                "name": $name,
+                "yaml": null,
+                "indexing_yaml": $indexing,
+                "query_yaml": $query,
+                "deepset_cloud_version": "v2"
+              }')
+
+            response=$(curl --silent --show-error --fail \
+              --request POST \
+              --url "https://api.cloud.deepset.ai/api/v1/workspaces/${DEEPSET_CLOUD_WORKSPACE_NAME}/pipelines" \
+              --header "Authorization: Bearer ${DEEPSET_CLOUD_API_KEY}" \
+              --header 'Content-Type: application/json' \
+              --data "$payload")
+
+            echo "Pipeline creation/update response for $pipeline_name: $response"
+
+            if [[ $response == *"error"* ]]; then
+              echo "Pipeline creation/update failed for $pipeline_name"
+              exit 1
+            fi
+          done
+
+      - name: Validate Pipelines
+        env:
+          DEEPSET_CLOUD_API_KEY: ${{ secrets.DEEPSET_CLOUD_API_KEY }}
+          DEEPSET_CLOUD_WORKSPACE_NAME: "your-dev-workspace-name"
+        run: |
+          validation_response=$(curl --silent --show-error --fail \
+            --request POST \
+            --url "https://api.cloud.deepset.ai/api/v1/workspaces/${DEEPSET_CLOUD_WORKSPACE_NAME}/pipeline_validations" \
+            --header "Authorization: Bearer ${DEEPSET_CLOUD_API_KEY}" \
+            --header 'content-type: application/json' \
+            --data '{"deepset_cloud_version": "v2"}')
+
+          echo "Validation response: $validation_response"
+
+          if [[ $validation_response == *"error"* ]]; then
+            echo "Pipeline validation failed"
+            exit 1
+          fi
diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml
new file mode 100644
index 0000000..802a8dc
--- /dev/null
+++ b/.github/workflows/deploy-prod.yml
@@ -0,0 +1,74 @@
+name: Deploy to Prod Workspace
+
+on:
+  push:
+    branches:
+      - main
+  workflow_dispatch:
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install jq
+        run: sudo apt-get install -y jq
+
+      - name: Create or Update Pipelines
+        env:
+          DEEPSET_CLOUD_API_KEY: ${{ secrets.DEEPSET_CLOUD_API_KEY }}
+          DEEPSET_CLOUD_WORKSPACE_NAME: "your-prod-workspace-name"
+        run: |
+          for pipeline in pipelines/*.yaml; do
+            pipeline_content=$(cat "$pipeline" | jq -sR .)
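+            # `jq -sR .` slurps the whole file as one raw string and prints it as a
+            # JSON-escaped string literal (surrounding quotes included), so it can be
+            # interpolated directly into the JSON payload of the request below.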
+            response=$(curl --silent --show-error --fail \
+              --request POST \
+              --url "https://api.cloud.deepset.ai/api/v1/workspaces/${DEEPSET_CLOUD_WORKSPACE_NAME}/pipelines?dry_run=false" \
+              --header "Authorization: Bearer ${DEEPSET_CLOUD_API_KEY}" \
+              --header 'Content-Type: application/json' \
+              --data "{
+                \"yaml_content\": ${pipeline_content},
+                \"deepset_cloud_version\": \"v2\"
+              }")
+
+            echo "Pipeline creation/update response for $pipeline: $response"
+
+            if [[ $response == *"error"* ]]; then
+              echo "Pipeline creation/update failed for $pipeline"
+              exit 1
+            fi
+          done
+
+      - name: Validate Pipelines
+        env:
+          DEEPSET_CLOUD_API_KEY: ${{ secrets.DEEPSET_CLOUD_API_KEY }}
+          DEEPSET_CLOUD_WORKSPACE_NAME: "your-prod-workspace-name"
+        run: |
+          validation_response=$(curl --silent --show-error --fail \
+            --request POST \
+            --url "https://api.cloud.deepset.ai/api/v1/workspaces/${DEEPSET_CLOUD_WORKSPACE_NAME}/pipeline_validations" \
+            --header "Authorization: Bearer ${DEEPSET_CLOUD_API_KEY}" \
+            --header 'content-type: application/json' \
+            --data '{"deepset_cloud_version": "v2"}')
+
+          echo "Validation response: $validation_response"
+
+          if [[ $validation_response == *"error"* ]]; then
+            echo "Pipeline validation failed"
+            exit 1
+          fi
+
+      - name: Install deepset Cloud CLI
+        run: pip install deepset-cloud
+
+      - name: Deploy Pipelines to Prod Workspace
+        env:
+          DEEPSET_CLOUD_API_KEY: ${{ secrets.DEEPSET_CLOUD_API_KEY }}
+          DEEPSET_CLOUD_WORKSPACE_NAME: "your-prod-workspace-name"
+        run: |
+          for pipeline in pipelines/*.yaml; do
+            deepset-cloud pipelines upload \
+              --api-key "$DEEPSET_CLOUD_API_KEY" \
+              --workspace-name "$DEEPSET_CLOUD_WORKSPACE_NAME" \
+              --file "$pipeline"
+          done
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b7a9cf1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,49 @@
+# Python-related files
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual environments
+venv/
+env/
+ENV/
+
+# IDEs and editors
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# Operating system files
+.DS_Store
+Thumbs.db
+
+# Logs
+*.log
+
+# Local configuration files
+*.env
+config.local.yaml
+
+# Temporary files
+*.tmp
+*.bak
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..bcc5dba
--- /dev/null
+++ b/README.md
@@ -0,0 +1,221 @@
+# CI/CD Template with GitHub Actions for deepset Cloud
+
+This repository serves as a template for using **GitHub Actions** as a CI/CD pipeline to manage and deploy pipelines to **deepset Cloud**. It demonstrates how to work with two separate workspaces: **dev** and **prod**.
+
+## Table of Contents
+
+- [Prerequisites](#prerequisites)
+- [Repository Structure](#repository-structure)
+- [Setup Instructions](#setup-instructions)
+- [Usage](#usage)
+  - [Branching Strategy](#branching-strategy)
+  - [Adding or Updating Pipelines](#adding-or-updating-pipelines)
+  - [Triggering Deployments](#triggering-deployments)
+  - [Rollback Procedures](#rollback-procedures)
+- [Security Considerations](#security-considerations)
+- [Extending the Template](#extending-the-template)
+- [Support and Contributions](#support-and-contributions)
+
+---
+
+## Prerequisites
+
+Before using this template, ensure you have the following:
+
+- A **deepset Cloud** account with access to your **dev** and **prod** workspaces.
+- A **GitHub** account with permissions to create repositories and set up GitHub Actions.
+- A basic understanding of **Git** and GitHub workflows.
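+
+**Tip**: Before storing the API key as a GitHub Secret, you can sanity-check it and your workspace name locally. The snippet below is a minimal sketch that reuses the same base URL and Bearer authentication the workflows in this repository use; the exact list endpoint and its response shape are assumptions, so treat it as an illustration rather than an API reference.
+
+```bash
+export DEEPSET_CLOUD_API_KEY="your-api-key"
+export DEEPSET_CLOUD_WORKSPACE_NAME="your-dev-workspace-name"
+
+# Expect HTTP 200 and a JSON body if the key and workspace name are valid.
+curl --silent --show-error --fail \
+  --request GET \
+  --url "https://api.cloud.deepset.ai/api/v1/workspaces/${DEEPSET_CLOUD_WORKSPACE_NAME}/pipelines" \
+  --header "Authorization: Bearer ${DEEPSET_CLOUD_API_KEY}"
+```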
+
+## Repository Structure
+
+```plaintext
+your-repo/
+├── .github/
+│   └── workflows/
+│       ├── deploy-dev.yml
+│       └── deploy-prod.yml
+├── pipelines/
+│   ├── pipeline1.yaml
+│   └── pipeline2.yaml
+└── README.md
+```
+
+- **`.github/workflows/`**: Contains GitHub Actions workflow files for CI/CD.
+- **`pipelines/`**: Stores your pipeline configuration files.
+- **`README.md`**: Provides instructions and information about the repository.
+
+## Setup Instructions
+
+1. **Clone the Repository**:
+
+   ```bash
+   git clone https://github.com/your-username/your-repo.git
+   cd your-repo
+   ```
+
+2. **Set Up GitHub Secrets**:
+
+   - Navigate to your GitHub repository.
+   - Go to **Settings** > **Secrets and variables** > **Actions**.
+   - Click **"New repository secret"** and add the following secret:
+
+     - `DEEPSET_CLOUD_API_KEY`: Your deepset Cloud API key.
+
+   **Note**: The same API key is used for both the dev and prod environments.
+
+3. **Configure Workspace Names**:
+
+   - Update the workspace names in the workflow files:
+
+     - In `.github/workflows/deploy-dev.yml`, replace `"your-dev-workspace-name"` with your actual **dev** workspace name.
+     - In `.github/workflows/deploy-prod.yml`, replace `"your-prod-workspace-name"` with your actual **prod** workspace name.
+
+4. **Install Dependencies (Optional)**:
+
+   - If you plan to run the deepset Cloud CLI locally, install it with:
+
+     ```bash
+     pip install deepset-cloud
+     ```
+
+## Usage
+
+### Branching Strategy
+
+This template uses a simple branching strategy to manage deployments to different environments:
+
+- **Development Branch (`dev`)**:
+
+  - Used for integrating and testing new changes.
+  - Automatically deploys to the **dev** workspace upon push.
+
+- **Production Branch (`main`)**:
+
+  - Contains stable and reviewed code.
+  - Deployment to the **prod** workspace is triggered on push or manually via GitHub Actions.
+
+### Adding or Updating Pipelines
+
+1. **Add or Modify Pipeline Files**:
+
+   - Pipeline configuration files are located in the `pipelines/` directory.
+   - Create new pipeline YAML files or modify existing ones.
+
+   **Example**:
+
+   ```yaml:pipelines/pipeline1.yaml
+   version: "1.0"
+   components:
+     - name: Retriever
+       type: BM25Retriever
+       params:
+         document_store: MyDocumentStore
+
+     - name: Reader
+       type: TransformersReader
+       params:
+         model_name_or_path: deepset/roberta-base-squad2
+
+   pipelines:
+     - name: question-answering
+       nodes:
+         - name: Retriever
+           inputs: [Query]
+         - name: Reader
+           inputs: [Retriever]
+   ```
+
+2. **Commit Changes**:
+
+   ```bash
+   git add pipelines/your-pipeline.yaml
+   git commit -m "Add new pipeline"
+   ```
+
+3. **Push to the Appropriate Branch**:
+
+   - For development:
+
+     ```bash
+     git push origin dev
+     ```
+
+   - For production:
+
+     ```bash
+     git push origin main
+     ```
+
+### Triggering Deployments
+
+- **Automatic Deployment**:
+
+  - **Dev Workspace**: Pushing to the `dev` branch triggers the `deploy-dev.yml` workflow.
+  - **Prod Workspace**: Pushing to the `main` branch triggers the `deploy-prod.yml` workflow.
+
+- **Manual Deployment**:
+
+  - For production, you can manually trigger the workflow:
+
+    1. Go to the **Actions** tab in your GitHub repository.
+    2. Select the **"Deploy to Prod Workspace"** workflow.
+    3. Click **"Run workflow"**.
+
+### Rollback Procedures
+
+If you need to revert to a previous pipeline version:
+
+1. **Revert Changes in Git**:
+
+   ```bash
+   git revert <commit-hash>
+   git push origin dev  # or main, depending on the branch
+   ```
+
+2. **Deployment**:
+
+   - The GitHub Actions workflow will redeploy the pipelines based on the reverted code.
+
+## Security Considerations
+
+- **API Keys**:
+
+  - Store sensitive information like API keys in **GitHub Secrets**.
+  - **Do not** commit sensitive data to the repository.
+
+- **Workspace Names**:
+
+  - Workspace names are specified in the workflow files. Ensure they are correct to avoid deploying to unintended workspaces.
+
+- **Secret Rotation**:
+
+  - Regularly rotate your API keys and update the GitHub Secrets accordingly.
+
+## Extending the Template
+
+- **Add More Environments**:
+
+  - To add environments like staging, copy one of the existing workflows and modify the workspace name and trigger conditions.
+
+- **Validation and Testing**:
+
+  - Incorporate pipeline validation steps before deployment.
+
+    ```yaml
+    - name: Validate Pipelines
+      run: |
+        for pipeline in pipelines/*.yaml; do
+          deepset-cloud pipelines validate --file "$pipeline"
+        done
+    ```
+
+- **Notifications**:
+
+  - Add steps to send deployment notifications via email, Slack, etc.
+
+## Support and Contributions
+
+- **Issues**: If you encounter problems, please open an issue.
+- **Contributions**: Contributions are welcome! Feel free to fork the repository and submit a pull request.
+
+---
diff --git a/pipelines/rag-qa-gpt4/indexing.yaml b/pipelines/rag-qa-gpt4/indexing.yaml
new file mode 100644
index 0000000..ddd9fcc
--- /dev/null
+++ b/pipelines/rag-qa-gpt4/indexing.yaml
@@ -0,0 +1,135 @@
+# If you need help with the YAML format, have a look at https://docs.cloud.deepset.ai/v2.0/docs/create-a-pipeline#create-a-pipeline-using-pipeline-editor.
+# This section defines components that you want to use in your pipelines. Each component must have a name and a type. You can also set the component's parameters here.
+# The name is up to you, you can give your component a friendly name. You then use components' names when specifying the connections in the pipeline.
+# Type is the class path of the component. You can check the type on the component's documentation page.
+components:
+  file_classifier:
+    type: haystack.components.routers.file_type_router.FileTypeRouter
+    init_parameters:
+      mime_types:
+        - text/plain
+        - application/pdf
+        - text/markdown
+        - text/html
+        - application/vnd.openxmlformats-officedocument.wordprocessingml.document
+        - application/vnd.openxmlformats-officedocument.presentationml.presentation
+        - application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
+
+  text_converter:
+    type: haystack.components.converters.txt.TextFileToDocument
+    init_parameters:
+      encoding: utf-8
+
+  pdf_converter:
+    type: haystack.components.converters.pypdf.PyPDFToDocument
+    init_parameters:
+      converter:
+        type: haystack.components.converters.pypdf.DefaultConverter
+
+  markdown_converter:
+    type: haystack.components.converters.markdown.MarkdownToDocument
+    init_parameters:
+      table_to_single_line: false
+
+  html_converter:
+    type: haystack.components.converters.html.HTMLToDocument
+    init_parameters:
+      # A dictionary of keyword arguments to customize how you want to extract content from your HTML files.
+      # For the full list of available arguments, see
+      # the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/corefunctions.html#extract).
+      extraction_kwargs:
+        output_format: txt # Extract text from HTML. You can also choose "markdown"
+        target_language: null # You can define a language (using the ISO 639-1 format) to discard documents that don't match that language.
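+        # For example, setting target_language: en keeps only documents detected as English.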
+        include_tables: true # If true, includes tables in the output
+        include_links: false # If true, keeps links along with their targets
+
+  docx_converter:
+    type: haystack.components.converters.docx.DOCXToDocument
+    init_parameters: {}
+
+  pptx_converter:
+    type: haystack.components.converters.pptx.PPTXToDocument
+    init_parameters: {}
+
+  xlsx_converter:
+    type: deepset_cloud_custom_nodes.converters.xlsx.XLSXToDocument
+    init_parameters: {}
+
+  joiner:
+    type: haystack.components.joiners.document_joiner.DocumentJoiner
+    init_parameters:
+      join_mode: concatenate
+      sort_by_score: false
+
+  joiner_xlsx: # merge split documents with non-split xlsx documents
+    type: haystack.components.joiners.document_joiner.DocumentJoiner
+    init_parameters:
+      join_mode: concatenate
+      sort_by_score: false
+
+  splitter:
+    type: deepset_cloud_custom_nodes.preprocessors.document_splitter.DeepsetDocumentSplitter
+    init_parameters:
+      split_by: word
+      split_length: 250
+      split_overlap: 30
+      respect_sentence_boundary: True
+      language: en
+
+  document_embedder:
+    type: haystack.components.embedders.sentence_transformers_document_embedder.SentenceTransformersDocumentEmbedder
+    init_parameters:
+      model: "intfloat/e5-base-v2"
+
+  writer:
+    type: haystack.components.writers.document_writer.DocumentWriter
+    init_parameters:
+      document_store:
+        type: haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore
+        init_parameters:
+          embedding_dim: 768
+          similarity: cosine
+      policy: OVERWRITE
+
+connections: # Defines how the components are connected
+- sender: file_classifier.text/plain
+  receiver: text_converter.sources
+- sender: file_classifier.application/pdf
+  receiver: pdf_converter.sources
+- sender: file_classifier.text/markdown
+  receiver: markdown_converter.sources
+- sender: file_classifier.text/html
+  receiver: html_converter.sources
+- sender: file_classifier.application/vnd.openxmlformats-officedocument.wordprocessingml.document
+  receiver: docx_converter.sources
+- sender: file_classifier.application/vnd.openxmlformats-officedocument.presentationml.presentation
+  receiver: pptx_converter.sources
+- sender: file_classifier.application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
+  receiver: xlsx_converter.sources
+- sender: text_converter.documents
+  receiver: joiner.documents
+- sender: pdf_converter.documents
+  receiver: joiner.documents
+- sender: markdown_converter.documents
+  receiver: joiner.documents
+- sender: html_converter.documents
+  receiver: joiner.documents
+- sender: docx_converter.documents
+  receiver: joiner.documents
+- sender: pptx_converter.documents
+  receiver: joiner.documents
+- sender: joiner.documents
+  receiver: splitter.documents
+- sender: splitter.documents
+  receiver: joiner_xlsx.documents
+- sender: xlsx_converter.documents
+  receiver: joiner_xlsx.documents
+- sender: joiner_xlsx.documents
+  receiver: document_embedder.documents
+- sender: document_embedder.documents
+  receiver: writer.documents
+
+max_loops_allowed: 100
+
+inputs: # Define the inputs for your pipeline
+  files: "file_classifier.sources" # This component will receive the files to index as input
diff --git a/pipelines/rag-qa-gpt4/query.yaml b/pipelines/rag-qa-gpt4/query.yaml
new file mode 100644
index 0000000..c181214
--- /dev/null
+++ b/pipelines/rag-qa-gpt4/query.yaml
@@ -0,0 +1,132 @@
+# If you need help with the YAML format, have a look at https://docs.cloud.deepset.ai/v2.0/docs/create-a-pipeline#create-a-pipeline-using-pipeline-editor.
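+# Pipeline overview: the query goes to a BM25 retriever and, via a query embedder, to an embedding retriever.
+# Both result sets are merged by a document joiner, reranked, and passed to a prompt builder whose prompt is sent to GPT-4;
+# the answer builder then combines the model replies with the ranked documents into referenced answers.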
+# This section defines components that you want to use in your pipelines. Each component must have a name and a type. You can also set the component's parameters here.
+# The name is up to you, you can give your component a friendly name. You then use components' names when specifying the connections in the pipeline.
+# Type is the class path of the component. You can check the type on the component's documentation page.
+components:
+  bm25_retriever: # Selects the most similar documents from the document store
+    type: haystack_integrations.components.retrievers.opensearch.bm25_retriever.OpenSearchBM25Retriever
+    init_parameters:
+      document_store:
+        type: haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore
+        init_parameters:
+          use_ssl: True
+          verify_certs: False
+          hosts:
+            - ${OPENSEARCH_HOST}
+          http_auth:
+            - "${OPENSEARCH_USER}"
+            - "${OPENSEARCH_PASSWORD}"
+          embedding_dim: 768
+          similarity: cosine
+      top_k: 20 # The number of results to return
+
+  query_embedder:
+    type: haystack.components.embedders.sentence_transformers_text_embedder.SentenceTransformersTextEmbedder
+    init_parameters:
+      model: "intfloat/e5-base-v2"
+
+  embedding_retriever: # Selects the most similar documents from the document store
+    type: haystack_integrations.components.retrievers.opensearch.embedding_retriever.OpenSearchEmbeddingRetriever
+    init_parameters:
+      document_store:
+        type: haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore
+        init_parameters:
+          use_ssl: True
+          verify_certs: False
+          hosts:
+            - ${OPENSEARCH_HOST}
+          http_auth:
+            - "${OPENSEARCH_USER}"
+            - "${OPENSEARCH_PASSWORD}"
+          embedding_dim: 768
+          similarity: cosine
+      top_k: 20 # The number of results to return
+
+  document_joiner:
+    type: haystack.components.joiners.document_joiner.DocumentJoiner
+    init_parameters:
+      join_mode: concatenate
+
+  ranker:
+    type: haystack.components.rankers.transformers_similarity.TransformersSimilarityRanker
+    init_parameters:
+      model: "intfloat/simlm-msmarco-reranker"
+      top_k: 8
+      model_kwargs:
+        torch_dtype: "torch.float16"
+
+  prompt_builder:
+    type: haystack.components.builders.prompt_builder.PromptBuilder
+    init_parameters:
+      template: |-
+        You are a technical expert.
+        You answer questions truthfully based on provided documents.
+        If the answer exists in several documents, summarize them.
+        Ignore documents that don't contain the answer to the question.
+        Only answer based on the documents provided. Don't make things up.
+        If no information related to the question can be found in the document, say so.
+        Always use references in the form [NUMBER OF DOCUMENT] when using information from a document, e.g. [3] for Document[3].
+        Never name the documents, only enter a number in square brackets as a reference.
+        The reference must only refer to the number that comes in square brackets after the document.
+        Otherwise, do not use brackets in your answer and reference ONLY the number of the document without mentioning the word document.
+        These are the documents:
+        {% for document in documents %}
+        Document[{{ loop.index }}]:
+        {{ document.content }}
+        {% endfor %}
+
+        Question: {{ question }}
+        Answer:
+
+  llm:
+    type: haystack.components.generators.openai.OpenAIGenerator
+    init_parameters:
+      api_key: {"type": "env_var", "env_vars": ["OPENAI_API_KEY"], "strict": False}
+      model: "gpt-4-turbo"
+      generation_kwargs:
+        max_tokens: 650
+        temperature: 0.0
+        seed: 0
+
+  answer_builder:
+    type: deepset_cloud_custom_nodes.augmenters.deepset_answer_builder.DeepsetAnswerBuilder
+    init_parameters:
+      reference_pattern: acm
+
+connections: # Defines how the components are connected
+- sender: bm25_retriever.documents
+  receiver: document_joiner.documents
+- sender: query_embedder.embedding
+  receiver: embedding_retriever.query_embedding
+- sender: embedding_retriever.documents
+  receiver: document_joiner.documents
+- sender: document_joiner.documents
+  receiver: ranker.documents
+- sender: ranker.documents
+  receiver: prompt_builder.documents
+- sender: ranker.documents
+  receiver: answer_builder.documents
+- sender: prompt_builder.prompt
+  receiver: llm.prompt
+- sender: prompt_builder.prompt
+  receiver: answer_builder.prompt
+- sender: llm.replies
+  receiver: answer_builder.replies
+
+max_loops_allowed: 100
+
+inputs: # Define the inputs for your pipeline
+  query: # These components will receive the query as input
+  - "bm25_retriever.query"
+  - "query_embedder.text"
+  - "ranker.query"
+  - "prompt_builder.question"
+  - "answer_builder.query"
+
+  filters: # These components will receive a potential query filter as input
+  - "bm25_retriever.filters"
+  - "embedding_retriever.filters"
+
+outputs: # Defines the output of your pipeline
+  documents: "ranker.documents" # The output of the pipeline is the retrieved documents
+  answers: "answer_builder.answers" # The output of the pipeline is the generated answers