From 4a9214f165d8fe87ba3a6c87aa36408c4a61a01e Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk
Date: Tue, 10 Dec 2024 21:48:48 +0100
Subject: [PATCH] draft1

---
 docs/docs/tutorials/cookbooks/RAG-QA-docs.mdx | 482 ++++++++++++++++++
 1 file changed, 482 insertions(+)
 create mode 100644 docs/docs/tutorials/cookbooks/RAG-QA-docs.mdx

diff --git a/docs/docs/tutorials/cookbooks/RAG-QA-docs.mdx b/docs/docs/tutorials/cookbooks/RAG-QA-docs.mdx
new file mode 100644
index 0000000000..b5bd3bb7b2
--- /dev/null
+++ b/docs/docs/tutorials/cookbooks/RAG-QA-docs.mdx
@@ -0,0 +1,482 @@
---
title: "RAG Q&A over Documentation"
---

:::info Open in Github
The code for this tutorial is available [here](https://github.com/Agenta-AI/agenta/tree/main/examples/custom_workflows/rag-docs-qa).
:::

In this tutorial, we'll build a Q&A system for our documentation using RAG (Retrieval-Augmented Generation). Our AI assistant will answer user queries by retrieving relevant sections from our documentation and using them as context when calling a Large Language Model (LLM).

By the end, we will have:

- A playground for testing different embeddings, adjusting top_k values (the number of context chunks to include), and experimenting with various prompts and models
- LLM-as-a-judge and RAG context relevancy evaluations for our Q&A application
- A deployed application that we can invoke directly or whose configuration we can export and run elsewhere

You can try our playground by creating a free account at [https://cloud.agenta.ai](https://cloud.agenta.ai) and opening the demo.

## Our stack

- **Agenta** for playground, evaluation, observability, and deployment.
- **[LiteLLM](https://github.com/BerriAI/litellm)** for interacting with language models and embeddings.
- **[Qdrant](https://qdrant.tech/)** as our vector database for storing and querying document embeddings.

## Ingestion pipeline

The first step is to process our documentation and store it in a vector database for retrieval. Let's look at how we ingest the documentation into Qdrant.

```python title="ingest.py"
import os
import uuid
from datetime import datetime
from typing import Dict, List

import frontmatter
import tqdm
from litellm import embedding
from qdrant_client import QdrantClient, models

# COLLECTION_NAME and the helpers calculate_doc_hash, get_doc_url, and
# get_all_docs are defined in the complete script linked above.

OPENAI_EMBEDDING_DIM = 1536  # For text-embedding-ada-002
COHERE_EMBEDDING_DIM = 1024  # For embed-english-v3.0

qdrant_client = QdrantClient(
    url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY")
)


def chunk_text(text: str, max_chunk_size: int = 1500) -> List[str]:
    """
    Split text into chunks based on paragraphs and size.
    Tries to maintain context by keeping paragraphs together when possible.
    """
    # Split by double newlines to preserve paragraph structure
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]

    chunks = []
    current_chunk = []
    current_size = 0

    for paragraph in paragraphs:
        paragraph_size = len(paragraph)

        # If a single paragraph is too large, split it by sentences
        if paragraph_size > max_chunk_size:
            sentences = [s.strip() + "." for s in paragraph.split(".") if s.strip()]
            for sentence in sentences:
                if len(sentence) > max_chunk_size:
                    # If even a sentence is too long, split it by chunks
                    for i in range(0, len(sentence), max_chunk_size):
                        chunks.append(sentence[i : i + max_chunk_size])
                elif current_size + len(sentence) > max_chunk_size:
                    # Start new chunk
                    chunks.append(" ".join(current_chunk))
                    current_chunk = [sentence]
                    current_size = len(sentence)
                else:
                    current_chunk.append(sentence)
                    current_size += len(sentence)
        # If adding this paragraph would exceed the limit, start a new chunk
        elif current_size + paragraph_size > max_chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [paragraph]
            current_size = paragraph_size
        else:
            current_chunk.append(paragraph)
            current_size += paragraph_size

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


def process_doc(file_path: str, docs_path: str, docs_base_url: str) -> List[Dict]:
    """Process a single document into chunks with metadata."""
    with open(file_path, "r", encoding="utf-8") as f:
        # Parse frontmatter and content
        post = frontmatter.load(f)
        content = post.content

    # Calculate document hash
    doc_hash = calculate_doc_hash(content)

    # Get document URL
    doc_url = get_doc_url(file_path, docs_path, docs_base_url)

    # Create base metadata
    metadata = {
        "title": post.get("title", ""),
        "url": doc_url,
        "file_path": file_path,
        "last_updated": datetime.utcnow().isoformat(),
        "doc_hash": doc_hash,
    }

    # Chunk the content
    chunks = chunk_text(content)

    return [
        {"content": chunk, "metadata": metadata, "doc_hash": doc_hash}
        for chunk in chunks
    ]


def get_embeddings(text: str) -> Dict[str, List[float]]:
    """Get embeddings using both OpenAI and Cohere models via LiteLLM."""
    # Get OpenAI embedding
    openai_response = embedding(model="text-embedding-ada-002", input=[text])
    openai_embedding = openai_response["data"][0]["embedding"]

    # Get Cohere embedding
    cohere_response = embedding(
        model="cohere/embed-english-v3.0",
        input=[text],
        input_type="search_document",  # Specific to Cohere v3 models
    )
    cohere_embedding = cohere_response["data"][0]["embedding"]

    return {"openai": openai_embedding, "cohere": cohere_embedding}


def setup_qdrant_collection():
    """Create or recreate the vector collection."""
    # Delete if exists
    try:
        qdrant_client.delete_collection(COLLECTION_NAME)
    except Exception:
        pass

    # Create collection with two vector types
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config={
            "openai": models.VectorParams(
                size=OPENAI_EMBEDDING_DIM, distance=models.Distance.COSINE
            ),
            "cohere": models.VectorParams(
                size=COHERE_EMBEDDING_DIM, distance=models.Distance.COSINE
            ),
        },
    )


def upsert_chunks(chunks: List[Dict]):
    """Upsert document chunks to the vector store."""
    for i, chunk in enumerate(chunks):
        # Get both embeddings using LiteLLM
        embeddings = get_embeddings(chunk["content"])

        # Create payload
        payload = {**chunk["metadata"], "content": chunk["content"], "chunk_index": i}

        # Upsert to Qdrant
        qdrant_client.upsert(
            collection_name=COLLECTION_NAME,
            points=[
                models.PointStruct(
                    # Use a deterministic per-chunk ID; the document hash alone
                    # would make chunks of the same document overwrite each other
                    id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{chunk['doc_hash']}-{i}")),
                    payload=payload,
                    vector=embeddings,  # Contains both 'openai' and 'cohere' embeddings
                )
            ],
        )


def main():
    # Get environment variables
    docs_path = os.getenv("DOCS_PATH")
    docs_base_url = os.getenv("DOCS_BASE_URL")
os.getenv("DOCS_BASE_URL") + + if not docs_path or not docs_base_url: + raise ValueError("DOCS_PATH and DOCS_BASE_URL must be set in .env file") + + # Create fresh collection + setup_qdrant_collection() + + # Process all documents + all_docs = get_all_docs(docs_path) + for doc_path in tqdm.tqdm(all_docs): + print(f"Processing {doc_path}") + chunks = process_doc(doc_path, docs_path, docs_base_url) + upsert_chunks(chunks) +``` + +This script performs the following steps: + +1. **Loads documentation files:** Reads all `.mdx` files from the documentation directory. +2. **Processes documents:** Chunks the text, adds metadata (e.g. the url where the page where to be found).. +3. **Generates embeddings:** Generate embeddings for each chunk using both OpenAI and Cohere models. We use both because we would like to compare them in the playground. +4. **Stores embeddings in Qdrant:** Upserts the embeddings into a Qdrant collection for later retrieval. We use named vectors to save multiple embeddings for the same document. + +To run the ingestion pipeline, you need first to create a collection in Qdrant and then set the following environment variables: + +- `QDRANT_URL`: The URL of your Qdrant instance. +- `QDRANT_API_KEY`: The API key for your Qdrant instance. +- `DOCS_PATH`: The folder containing the documentation (in our case it's under `agenta/docs/docs`). +- `DOCS_BASE_URL`: The base URL where the documentation can be found (in our case it's `https://docs.agenta.ai`). + +:::info +The complete script with a setup readme is available [here](https://github.com/Agenta-AI/agenta/tree/main/examples/custom_workflows/rag-docs-qa). +::: + +## Querying the Assistant + +Now that we have ingested the documentation into the Qdrant vector database, let's create the query logic for our assistant. Parts related to the Agenta integrations are highlighted. + +```python title="query.py" +#highlight-start +import agenta as ag +from pydantic import BaseModel, Field +from typing import Annotated +from agenta.sdk.assets import supported_llm_models +#highlight-end + +system_prompt = """ +You are a helpful assistant that answers questions based on the documentation. 
+""" +user_prompt = """ +Here is the query: {query} + +Here is the context: {context} +""" +#highlight-start +ag.init() +#highlight-end + +#highlight-start +litellm.callbacks = [ag.callbacks.litellm_handler()] +#highlight-end + +# Initialize Qdrant client +qdrant_client = QdrantClient( + url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY") +) + +#highlight-start +# We define here the configuration that will be used by the playground +class Config(BaseModel): + system_prompt: str = Field(default=system_prompt) + user_prompt: str = Field(default=user_prompt) + embedding_model: Annotated[str, ag.MultipleChoice(["openai", "cohere"])] = Field( + default="openai" + ) + llm_model: Annotated[str, ag.MultipleChoice(choices=supported_llm_models)] = Field( + default="gpt-3.5-turbo" + ) + top_k: int = Field(default=10, ge=1, le=25) + rerank_top_k: int = Field(default=3, ge=1, le=10) + use_rerank: bool = Field(default=True) +#highlight-end + + +def get_embeddings(text: str, model: str) -> Dict[str, List[float]]: + """Get embeddings using both OpenAI and Cohere models via LiteLLM.""" + if model == "openai": + return embedding(model="text-embedding-ada-002", input=[text])["data"][0]["embedding"] + elif model == "cohere": + return embedding( + model="cohere/embed-english-v3.0", + input=[text], + input_type="search_query", # Use search_query for queries + )["data"][0]["embedding"] + + raise ValueError(f"Unknown model: {model}") + +#highlight-next-line +@ag.instrument() +def search_docs( + query: str, collection_name: str = os.getenv("COLLECTION_NAME", "docs_collection") +) -> List[Dict]: + """ + Search the documentation using embeddings. + + Args: + query: The search query + collection_name: Name of the Qdrant collection to search + + Returns: + List of dictionaries containing matched documents and their metadata + """ + + #highlight-start + # Get embeddings for the query + config = ag.ConfigManager.get_from_route(Config) + #highlight-end + # Search using embeddings + results = qdrant_client.query_points( + collection_name=collection_name, + query=get_embeddings(query, config.embedding_model), + using=config.embedding_model, + limit=config.top_k, + ) + # Format results + formatted_results = [] + for result in results.points: + formatted_result = { + "content": result.payload["content"], + "metadata": { + "title": result.payload["title"], + "url": result.payload["url"], + "score": result.score, + }, + } + formatted_results.append(formatted_result) + + return formatted_results + +#highlight-next-line +@ag.instrument() +def llm(query: str, results: List[Dict]): + #highlight-next-line + config = ag.ConfigManager.get_from_route(Config) + context = [] + for i, result in enumerate(results, 1): + score = result["metadata"].get("rerank_score", result["metadata"]["score"]) + item = f"Result {i} (Score: {score:.3f})\n" + item += f"Title: {result['metadata']['title']}\n" + item += f"URL: {result['metadata']['url']}\n" + item += f"Content: {result['content']}\n" + item += "-" * 80 + "\n" + context.append(item) + #highlight-start + # We store the context in the trace so that it can be used for evaluation + ag.tracing.store_internals({"context": context}) + #highlight-end + response = completion( + model=config.llm_model, + messages=[ + {"role": "system", "content": config.system_prompt}, + { + "role": "user", + "content": config.user_prompt.format( + query=query, context="".join(context) + ), + }, + ], + ) + return response.choices[0].message.content + +#highlight-next-line +@ag.instrument() +def 
### Instrumentation with Agenta

We use Agenta's `@ag.instrument()` decorator to instrument the functions. This allows us to trace inputs, outputs, and internal variables for better observability and debugging.

Additionally, we store internal variables using `ag.tracing.store_internals()`, which makes them available for evaluation (we use the stored context for the RAG relevancy evaluation below).

## Configuration

We define a `Config` class using Pydantic to manage the configuration of our assistant. This includes the system and user prompts, the models to use, and other parameters.

```python
from pydantic import BaseModel, Field
from typing import Annotated
import agenta as ag
from agenta.sdk.assets import supported_llm_models

class Config(BaseModel):
    system_prompt: str = Field(default=system_prompt)
    user_prompt: str = Field(default=user_prompt)
    embedding_model: Annotated[str, ag.MultipleChoice(["openai", "cohere"])] = Field(
        default="openai"
    )
    llm_model: Annotated[str, ag.MultipleChoice(choices=supported_llm_models)] = Field(
        default="gpt-3.5-turbo"
    )
    top_k: int = Field(default=10, ge=1, le=25)
    rerank_top_k: int = Field(default=3, ge=1, le=10)
    use_rerank: bool = Field(default=True)
```

This configuration allows us to experiment with different models and parameters easily.
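Since `Config` is a regular Pydantic model, you can also instantiate it directly to check defaults and bounds before opening the playground. A quick, illustrative example:

```python
# Defaults declared in the schema
config = Config()
print(config.llm_model, config.embedding_model, config.top_k)  # gpt-3.5-turbo openai 10

# Override a few parameters, much like the playground does when you tweak them
config = Config(embedding_model="cohere", top_k=5, use_rerank=False)
print(config.embedding_model, config.top_k, config.use_rerank)  # cohere 5 False

# Out-of-range values are rejected by the Field constraints:
# Config(top_k=100) raises a ValidationError because top_k must be <= 25.
```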
## Adding to the Playground

With Agenta, we can serve our application and add it to a playground for interactive testing and parameter tuning.

**[Instructions on adding to the playground will be added here.]**

## Evaluating the Assistant

To ensure our assistant provides accurate and relevant answers, we'll use evaluators to assess its performance.

### RAG Relevancy Evaluator

We use the RAG Relevancy evaluator as described in [Agenta's evaluation documentation](#). (Placeholder for documentation link.)

**Configuration:**

- **Question key:** `trace.generate.inputs.query`
- **Answer key:** `trace.generate.outputs`
- **Contexts key:** `trace.generate.llm.internals.context`

This evaluator measures how relevant the assistant's answers are with respect to the retrieved context.

### LLM-as-a-Judge Evaluator

We also set up an LLM-as-a-Judge evaluator to rate the quality of the assistant's responses.

**[Placeholder for the prompt used in the evaluator.]**

## Deploying the Assistant

Once satisfied with the assistant's performance, we can deploy it as an API endpoint using Agenta.

**[Deployment instructions will be added here.]**

## Conclusion

In this tutorial, we've:

- **Built** a RAG-based Q&A assistant over our documentation.
- **Ingested and processed** documentation into a vector database.
- **Handled user queries** by retrieving relevant context and generating answers.
- **Instrumented our code** for observability with Agenta.
- **Configured and used evaluators** to assess performance.
- **Prepared the assistant for deployment**.

By following these steps, you can create powerful AI assistants that provide accurate information based on your documentation.

---

**Note:** Sections marked as placeholders will be completed later.