Merge branch 'explodinggradients:main' into ayulockin/cache
ayulockin authored Dec 9, 2024
2 parents b3ebb1f + 57c6cbf commit 929deb8
Showing 14 changed files with 974 additions and 40 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -64,9 +64,10 @@ pip install git+https://github.com/explodinggradients/ragas

### Evaluate your RAG with Ragas metrics

This is 4 main lines:
This is 5 main lines:

```python
from ragas import evaluate
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness
from langchain_openai.chat_models import ChatOpenAI
from ragas.llms import LangchainLLMWrapper
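# --- hedged sketch, not part of this commit ----------------------------------
# The README snippet is cut off in this diff view; the lines below are only an
# illustrative continuation of the imports above. `eval_dataset` and the
# "gpt-4o" model name are placeholders, not taken from the repository.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
results = evaluate(
    dataset=eval_dataset,  # an EvaluationDataset prepared elsewhere
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness()],
    llm=evaluator_llm,
)
```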
41 changes: 38 additions & 3 deletions docs/howtos/customizations/customize_models.md
@@ -70,11 +70,12 @@ import google.auth
from langchain_google_vertexai import ChatVertexAI, VertexAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_core.outputs import LLMResult, ChatGeneration

config = {
"project_id": "<your-project-id>",
"chat_model_id": "gemini-1.0-pro-002",
"embedding_model_id": "textembedding-gecko",
"chat_model_id": "gemini-1.5-pro-002",
"embedding_model_id": "text-embedding-005",
}

# authenticate to GCP
@@ -89,7 +90,41 @@ vertextai_embeddings = VertexAIEmbeddings(
credentials=creds, model_name=config["embedding_model_id"]
)

vertextai_llm = LangchainLLMWrapper(vertextai_llm)
# Create a custom is_finished_parser to capture Gemini generation completion signals
def gemini_is_finished_parser(response: LLMResult) -> bool:
is_finished_list = []
for g in response.flatten():
resp = g.generations[0][0]

# Check generation_info first
if resp.generation_info is not None:
finish_reason = resp.generation_info.get("finish_reason")
if finish_reason is not None:
is_finished_list.append(
finish_reason in ["STOP", "MAX_TOKENS"]
)
continue

# Check response_metadata as fallback
if isinstance(resp, ChatGeneration) and resp.message is not None:
metadata = resp.message.response_metadata
if metadata.get("finish_reason"):
is_finished_list.append(
metadata["finish_reason"] in ["STOP", "MAX_TOKENS"]
)
elif metadata.get("stop_reason"):
is_finished_list.append(
metadata["stop_reason"] in ["STOP", "MAX_TOKENS"]
)

# If no finish reason found, default to True
if not is_finished_list:
is_finished_list.append(True)

return all(is_finished_list)


vertextai_llm = LangchainLLMWrapper(vertextai_llm, is_finished_parser=gemini_is_finished_parser)
vertextai_embeddings = LangchainEmbeddingsWrapper(vertextai_embeddings)
```
Yay! Now you are ready to use ragas with Google VertexAI endpoints.
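As a hedged usage sketch (not part of this commit), the two wrapped objects can then be handed to a ragas evaluation run; `eval_dataset` below is a placeholder for a dataset prepared elsewhere:

```python
from ragas import evaluate
from ragas.metrics import Faithfulness

results = evaluate(
    dataset=eval_dataset,             # placeholder EvaluationDataset
    metrics=[Faithfulness()],
    llm=vertextai_llm,                # LangchainLLMWrapper from above
    embeddings=vertextai_embeddings,  # LangchainEmbeddingsWrapper from above
)
```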
7 changes: 5 additions & 2 deletions src/ragas/callbacks.py
@@ -133,12 +133,15 @@ def __str__(self):

def parse_run_traces(
traces: t.Dict[str, ChainRun],
parent_run_id: t.Optional[str] = None,
) -> t.List[t.Dict[str, t.Any]]:

root_traces = [
chain_trace
for chain_trace in traces.values()
if chain_trace.parent_run_id is None
if chain_trace.parent_run_id == parent_run_id
]

if len(root_traces) > 1:
raise ValueError(
"Multiple root traces found! This is a bug on our end, please file an issue and we will fix it ASAP :)"
@@ -159,7 +162,7 @@ def parse_run_traces(
prompt_traces = {}
for i, prompt_uuid in enumerate(metric_trace.children):
prompt_trace = traces[prompt_uuid]
prompt_traces[f"{i}_{prompt_trace.name}"] = {
prompt_traces[f"{prompt_trace.name}"] = {
"input": prompt_trace.inputs.get("data", {}),
"output": prompt_trace.outputs.get("output", {}),
}
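A hedged illustration of what this change means for consumers of `EvaluationResult.traces` (the exact nesting is inferred from this function, not verified elsewhere): only chains whose `parent_run_id` matches the supplied run id are treated as roots, and prompt-level traces are now keyed by the prompt name alone rather than an `{index}_{name}` composite.

```python
# Hypothetical shape of one row in EvaluationResult.traces after this change
# (metric and prompt names are illustrative):
row_trace = {
    "faithfulness": {                      # keyed by metric name
        "statement_generator_prompt": {    # prompt name alone, no "0_" prefix
            "input": {"question": "..."},       # prompt_trace.inputs.get("data", {})
            "output": {"statements": ["..."]},  # prompt_trace.outputs.get("output", {})
        },
    },
}
```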
4 changes: 2 additions & 2 deletions src/ragas/config.py
@@ -5,7 +5,7 @@
from ragas.embeddings import BaseRagasEmbeddings
from ragas.llms import BaseRagasLLM
from ragas.losses import Loss
from ragas.optimizers import Optimizer
from ragas.optimizers import GeneticOptimizer, Optimizer

DEFAULT_OPTIMIZER_CONFIG = {"max_steps": 100}

@@ -20,7 +20,7 @@ class DemonstrationConfig(BaseModel):
class InstructionConfig(BaseModel):
enabled: bool = True
loss: t.Optional[Loss] = None
optimizer: Optimizer
optimizer: Optimizer = GeneticOptimizer()
optimizer_config: t.Dict[str, t.Any] = Field(
default_factory=lambda: DEFAULT_OPTIMIZER_CONFIG
)
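A hedged sketch of what the new default means for callers, assuming the fields truncated from this diff also have defaults:

```python
from ragas.config import InstructionConfig

# With the new default, `optimizer` no longer has to be supplied explicitly;
# a GeneticOptimizer instance is used when the field is omitted.
cfg = InstructionConfig()
print(type(cfg.optimizer).__name__)  # expected: "GeneticOptimizer"
```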
12 changes: 11 additions & 1 deletion src/ragas/dataset_schema.py
@@ -6,6 +6,7 @@
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass, field
from uuid import UUID

import numpy as np
from datasets import Dataset as HFDataset
@@ -43,6 +44,13 @@ def get_features(self) -> t.List[str]:
"""
return list(self.to_dict().keys())

def to_string(self) -> str:
"""
Get the string representation of the sample.
"""
sample_dict = self.to_dict()
return "".join(f"\n{key}:\n\t{val}\n" for key, val in sample_dict.items())


class SingleTurnSample(BaseSample):
"""
@@ -378,6 +386,7 @@ class EvaluationResult:
cost_cb: t.Optional[CostCallbackHandler] = None
traces: t.List[t.Dict[str, t.Any]] = field(default_factory=list)
ragas_traces: t.Dict[str, ChainRun] = field(default_factory=dict, repr=False)
run_id: t.Optional[UUID] = None

def __post_init__(self):
# transform scores from list of dicts to dict of lists
@@ -395,7 +404,8 @@ def __post_init__(self):
values.append(value + 1e-10)

# parse the traces
self.traces = parse_run_traces(self.ragas_traces)
run_id = str(self.run_id) if self.run_id is not None else None
self.traces = parse_run_traces(self.ragas_traces, run_id)

def __repr__(self) -> str:
score_strs = [f"'{k}': {v:0.4f}" for k, v in self._repr_dict.items()]
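A hedged usage sketch for the new `to_string()` helper; the `SingleTurnSample` field names below are assumed from ragas' standard sample schema, not taken from this diff:

```python
from ragas.dataset_schema import SingleTurnSample

sample = SingleTurnSample(
    user_input="What is the capital of France?",
    response="Paris is the capital of France.",
)

# to_string() joins a "<field>:\n\t<value>" block for every populated field.
print(sample.to_string())
```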
9 changes: 8 additions & 1 deletion src/ragas/evaluation.py
@@ -1,11 +1,13 @@
from __future__ import annotations

import typing as t
from uuid import UUID

from datasets import Dataset
from langchain_core.callbacks import BaseCallbackHandler, BaseCallbackManager
from langchain_core.embeddings import Embeddings as LangchainEmbeddings
from langchain_core.language_models import BaseLanguageModel as LangchainLLM
from tqdm.auto import tqdm

from ragas._analytics import track_was_completed
from ragas.callbacks import ChainType, RagasTracer, new_group
@@ -59,12 +61,14 @@ def evaluate(
embeddings: t.Optional[BaseRagasEmbeddings | LangchainEmbeddings] = None,
callbacks: Callbacks = None,
in_ci: bool = False,
run_config: RunConfig = RunConfig(),
run_config: t.Optional[RunConfig] = None,
token_usage_parser: t.Optional[TokenUsageParser] = None,
raise_exceptions: bool = False,
column_map: t.Optional[t.Dict[str, str]] = None,
show_progress: bool = True,
batch_size: t.Optional[int] = None,
_run_id: t.Optional[UUID] = None,
_pbar: t.Optional[tqdm] = None,
) -> EvaluationResult:
"""
Run the evaluation on the dataset with different metrics
@@ -146,6 +150,7 @@ def evaluate(
"""
column_map = column_map or {}
callbacks = callbacks or []
run_config = run_config or RunConfig()

if helicone_config.is_enabled:
import uuid
@@ -226,6 +231,7 @@ def evaluate(
run_config=run_config,
show_progress=show_progress,
batch_size=batch_size,
pbar=_pbar,
)

# Ragas Callbacks
@@ -333,6 +339,7 @@ def evaluate(
cost_cb,
),
ragas_traces=tracer.traces,
run_id=_run_id,
)
if not evaluation_group_cm.ended:
evaluation_rm.on_chain_end({"scores": result.scores})
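The `run_config` signature change follows the standard Python pattern for avoiding a shared default instance; a minimal, generic illustration (not ragas code):

```python
from __future__ import annotations


class Config:
    def __init__(self) -> None:
        self.flags: list[str] = []


def risky(cfg: Config = Config()) -> Config:
    # The default Config() is created once, at function definition time, so
    # every call without an argument shares (and mutates) the same object.
    cfg.flags.append("run")
    return cfg


def safe(cfg: Config | None = None) -> Config:
    # A fresh Config is created per call, mirroring `run_config or RunConfig()`.
    cfg = cfg or Config()
    cfg.flags.append("run")
    return cfg


print(len(risky().flags), len(risky().flags))  # 1 2 -> state leaks across calls
print(len(safe().flags), len(safe().flags))    # 1 1
```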
38 changes: 24 additions & 14 deletions src/ragas/executor.py
@@ -84,6 +84,7 @@ class Executor:
batch_size: t.Optional[int] = None
run_config: t.Optional[RunConfig] = field(default=None, repr=False)
_nest_asyncio_applied: bool = field(default=False, repr=False)
pbar: t.Optional[tqdm] = None

def wrap_callable_with_index(
self, callable: t.Callable, counter: int
@@ -130,21 +131,22 @@ async def _process_jobs(self) -> t.List[t.Any]:
results = []

if not self.batch_size:
with tqdm(
total=len(self.jobs),
desc=self.desc,
disable=not self.show_progress,
) as pbar:
# Create coroutines
coroutines = [
afunc(*args, **kwargs) for afunc, args, kwargs, _ in self.jobs
]
for future in await as_completed(coroutines, max_workers):
result = await future
results.append(result)
pbar.update(1)
# Use external progress bar if provided, otherwise create one
if self.pbar is None:
with tqdm(
total=len(self.jobs),
desc=self.desc,
disable=not self.show_progress,
) as internal_pbar:
await self._process_coroutines(
self.jobs, internal_pbar, results, max_workers
)
else:
await self._process_coroutines(
self.jobs, self.pbar, results, max_workers
)

return results
return results

# With batching, show nested progress bars
batches = batched(self.jobs, self.batch_size) # generator of job tuples
@@ -182,6 +184,14 @@ async def _process_jobs(self) -> t.List[t.Any]:

return results

async def _process_coroutines(self, jobs, pbar, results, max_workers):
"""Helper function to process coroutines and update the progress bar."""
coroutines = [afunc(*args, **kwargs) for afunc, args, kwargs, _ in jobs]
for future in await as_completed(coroutines, max_workers):
result = await future
results.append(result)
pbar.update(1)

def results(self) -> t.List[t.Any]:
"""
Execute all submitted jobs and return their results. The results are returned in the order of job submission.
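A hedged sketch of the new hook (the `Executor` constructor arguments and `submit` signature below are assumed, not verified against this commit): a caller such as `evaluate()` can own a single tqdm bar, pass it in via `_pbar`, and have the executor update it instead of creating its own.

```python
import asyncio

from tqdm.auto import tqdm
from ragas.executor import Executor


async def fake_metric(i: int) -> int:
    # stand-in for an async metric call
    await asyncio.sleep(0)
    return i


with tqdm(total=10, desc="evaluating") as shared_pbar:
    executor = Executor(desc="evaluating", show_progress=True, pbar=shared_pbar)
    for i in range(10):
        executor.submit(fake_metric, i)
    results = executor.results()  # updates shared_pbar as jobs complete
```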
20 changes: 12 additions & 8 deletions src/ragas/llms/base.py
@@ -134,7 +134,7 @@ def __init__(
def is_finished(self, response: LLMResult) -> bool:
"""
Parse the response to check if the LLM finished by checking the finish_reason
or stop_reason.
or stop_reason. Supports OpenAI and Vertex AI models.
"""
if self.is_finished_parser is not None:
return self.is_finished_parser(response)
@@ -145,30 +145,34 @@ def is_finished(self, response: LLMResult) -> bool:
resp = g.generations[0][0]
if resp.generation_info is not None:
# generation_info is provided - so we parse that

# OpenAI uses "stop" to indicate that the generation is finished
# and is stored in 'finish_reason' key in generation_info
if resp.generation_info.get("finish_reason") is not None:
finish_reason = resp.generation_info.get("finish_reason")
if finish_reason is not None:
# OpenAI uses "stop"
# Vertex AI uses "STOP" or "MAX_TOKENS"
is_finished_list.append(
resp.generation_info.get("finish_reason") == "stop"
finish_reason in ["stop", "STOP", "MAX_TOKENS"]
)

# provide more conditions here
# https://github.com/explodinggradients/ragas/issues/1548

# if generation_info is empty, we parse the response_metadata
# this is less reliable

elif (
isinstance(resp, ChatGeneration)
and t.cast(ChatGeneration, resp).message is not None
):
resp_message: BaseMessage = t.cast(ChatGeneration, resp).message
if resp_message.response_metadata.get("finish_reason") is not None:
finish_reason = resp_message.response_metadata.get("finish_reason")
is_finished_list.append(
resp_message.response_metadata.get("finish_reason") == "stop"
finish_reason in ["stop", "STOP", "MAX_TOKENS"]
)
elif resp_message.response_metadata.get("stop_reason") is not None:
stop_reason = resp_message.response_metadata.get("stop_reason")
is_finished_list.append(
resp_message.response_metadata.get("stop_reason") == "end_turn"
stop_reason in ["end_turn", "STOP", "MAX_TOKENS"]
)
# default to True
else:
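For providers whose finish reasons are still not covered, the escape hatch shown earlier in this commit applies; a minimal hedged sketch (`chat_model` is a placeholder for any LangChain chat model):

```python
from langchain_core.outputs import LLMResult
from ragas.llms import LangchainLLMWrapper


def always_finished(_: LLMResult) -> bool:
    # Trust the provider; skip finish_reason/stop_reason inspection entirely.
    return True


# `chat_model` is a placeholder, e.g. any LangChain chat model instance.
llm = LangchainLLMWrapper(chat_model, is_finished_parser=always_finished)
```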
14 changes: 14 additions & 0 deletions src/ragas/losses.py
@@ -1,6 +1,9 @@
import typing as t
from abc import ABC, abstractmethod

from pydantic import GetCoreSchemaHandler
from pydantic_core import CoreSchema, core_schema


class Loss(ABC):
"""
@@ -11,6 +14,17 @@ class Loss(ABC):
def __call__(self, predicted: t.List, actual: t.List) -> float:
raise NotImplementedError

@classmethod
def __get_pydantic_core_schema__(
cls, source_type: t.Any, handler: GetCoreSchemaHandler
) -> CoreSchema:
"""
Define how Pydantic generates a schema for Loss.
"""
return core_schema.no_info_after_validator_function(
cls, core_schema.is_instance_schema(cls) # The validator function
)


class MSELoss(Loss):
"""
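A hedged illustration of why the core-schema hook is added (assuming `MSELoss` is instantiable, as its presence above suggests): it lets a `Loss` instance be used directly as a field on a Pydantic model, which is how `InstructionConfig.loss` is typed in `src/ragas/config.py`.

```python
from pydantic import BaseModel

from ragas.losses import Loss, MSELoss


class MyConfig(BaseModel):
    loss: Loss  # validated via the is_instance_schema(Loss) hook above


cfg = MyConfig(loss=MSELoss())
print(cfg.loss)
```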