added VertexStringEvaluator (langchain-ai#251)

jzaldi · May 23, 2024 · 0649c09 · 0649c09
1 parent 7586813
commit 0649c09
Show file tree

Hide file tree

Showing 9 changed files with 528 additions and 0 deletions.
diff --git a/libs/vertexai/langchain_google_vertexai/__init__.py b/libs/vertexai/langchain_google_vertexai/__init__.py
@@ -10,6 +10,7 @@
 from langchain_google_vertexai.chains import create_structured_runnable
 from langchain_google_vertexai.chat_models import ChatVertexAI
 from langchain_google_vertexai.embeddings import VertexAIEmbeddings
+from langchain_google_vertexai.evaluators.evaluation import VertexStringEvaluator
 from langchain_google_vertexai.functions_utils import (
     PydanticFunctionsOutputParser,
 )
@@ -68,4 +69,5 @@
     "VertexAIImageGeneratorChat",
     "VertexAIModelGarden",
     "VertexAIVisualQnAChat",
+    "VertexStringEvaluator",
 ]
diff --git a/libs/vertexai/langchain_google_vertexai/evaluators/__init__.py b/libs/vertexai/langchain_google_vertexai/evaluators/__init__.py
diff --git a/libs/vertexai/langchain_google_vertexai/evaluators/_core.py b/libs/vertexai/langchain_google_vertexai/evaluators/_core.py
@@ -0,0 +1,179 @@
+"""Interfaces to be implemented by general evaluators.
+
+Remove once these interfaces have been moved to lc-core.
+"""
+from __future__ import annotations
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Any, Optional, Union
+from warnings import warn
+
+from langchain_core.runnables.config import run_in_executor
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+
+class _EvalArgsMixin:
+    """Validate the optional ``input`` and ``reference`` evaluation arguments.
+
+    The ``requires_input`` and ``requires_reference`` properties declare which
+    optional arguments an evaluator needs; both are ``False`` here and may be
+    overridden by subclasses.
+    """
+
+    @property
+    def requires_reference(self) -> bool:
+        """Tell whether a reference label must be supplied; ``False`` here."""
+        return False
+
+    @property
+    def requires_input(self) -> bool:
+        """Tell whether an input string must be supplied; ``False`` here."""
+        return False
+
+    @property
+    def _skip_input_warning(self) -> str:
+        """Message emitted when an input is passed but is not required."""
+        message = f"Ignoring input in {self.__class__.__name__}, as it is not expected."
+        return message
+
+    @property
+    def _skip_reference_warning(self) -> str:
+        """Message emitted when a reference is passed but is not required."""
+        message = (
+            f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
+        )
+        return message
+
+    def _check_evaluation_args(
+        self,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+    ) -> None:
+        """Validate ``reference`` and ``input`` against the evaluator's needs.
+
+        The input is checked first, then the reference. A missing required
+        argument raises; a supplied argument that is not required only
+        triggers a warning.
+
+        Args:
+            reference (Optional[str], optional): The reference label.
+            input (Optional[str], optional): The input string.
+        Raises:
+            ValueError: If an input string is required but ``input`` is None,
+                or if a reference label is required but ``reference`` is None.
+        """
+        # Input: absent-but-required is an error, present-but-unneeded a warning.
+        if input is None:
+            if self.requires_input:
+                raise ValueError(
+                    f"{self.__class__.__name__} requires an input string."
+                )
+        elif not self.requires_input:
+            warn(self._skip_input_warning)
+
+        # Reference: same rule as for the input.
+        if reference is None:
+            if self.requires_reference:
+                raise ValueError(
+                    f"{self.__class__.__name__} requires a reference string."
+                )
+        elif not self.requires_reference:
+            warn(self._skip_reference_warning)
+
+
+class StringEvaluator(_EvalArgsMixin, ABC):
+    """Abstract evaluator that scores a prediction against an optional input
+    and/or reference label.
+
+    Subclasses implement ``_evaluate_strings``. The public ``evaluate_strings``
+    and ``aevaluate_strings`` methods validate the optional arguments and then
+    delegate to the private hooks.
+    """
+
+    @property
+    def evaluation_name(self) -> str:
+        """The name of the evaluation: the concrete class name."""
+        name = self.__class__.__name__
+        return name
+
+    @property
+    def requires_reference(self) -> bool:
+        """Whether this evaluator requires a reference label; ``False`` here."""
+        return False
+
+    def evaluate_strings(
+        self,
+        *,
+        prediction: str,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Check the arguments, then evaluate the prediction synchronously.
+
+        Args:
+            prediction (str): The LLM or chain prediction to evaluate.
+            reference (Optional[str], optional): The reference label to
+                evaluate against.
+            input (Optional[str], optional): The input to consider during
+                evaluation.
+            **kwargs: Additional keyword arguments, forwarded unchanged to
+                ``_evaluate_strings``.
+        Returns:
+            dict: The evaluation results returned by ``_evaluate_strings``.
+        """
+        self._check_evaluation_args(reference=reference, input=input)
+        forwarded = {"prediction": prediction, "reference": reference, "input": input}
+        return self._evaluate_strings(**forwarded, **kwargs)
+
+    async def aevaluate_strings(
+        self,
+        *,
+        prediction: str,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Check the arguments, then evaluate the prediction asynchronously.
+
+        Args:
+            prediction (str): The LLM or chain prediction to evaluate.
+            reference (Optional[str], optional): The reference label to
+                evaluate against.
+            input (Optional[str], optional): The input to consider during
+                evaluation.
+            **kwargs: Additional keyword arguments, forwarded unchanged to
+                ``_aevaluate_strings``.
+        Returns:
+            dict: The evaluation results returned by ``_aevaluate_strings``.
+        """
+        self._check_evaluation_args(reference=reference, input=input)
+        forwarded = {"prediction": prediction, "reference": reference, "input": input}
+        return await self._aevaluate_strings(**forwarded, **kwargs)
+
+    @abstractmethod
+    def _evaluate_strings(
+        self,
+        *,
+        prediction: Union[str, Any],
+        reference: Optional[Union[str, Any]] = None,
+        input: Optional[Union[str, Any]] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Evaluate Chain or LLM output, based on optional input and label.
+
+        This is the hook concrete evaluators must implement.
+
+        Args:
+            prediction (str): The LLM or chain prediction to evaluate.
+            reference (Optional[str], optional): The reference label to
+                evaluate against.
+            input (Optional[str], optional): The input to consider during
+                evaluation.
+            **kwargs: Additional keyword arguments, including callbacks,
+                tags, etc.
+        Returns:
+            dict: The evaluation results containing the score or value.
+                It is recommended that the dictionary contain the following keys:
+                     - score: the score of the evaluation, if applicable.
+                     - value: the string value of the evaluation, if applicable.
+                     - reasoning: the reasoning for the evaluation, if applicable.
+        """
+
+    async def _aevaluate_strings(
+        self,
+        *,
+        prediction: Union[str, Any],
+        reference: Optional[Union[str, Any]] = None,
+        input: Optional[Union[str, Any]] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Asynchronously evaluate Chain or LLM output.
+
+        The default implementation hands the synchronous ``_evaluate_strings``
+        to ``run_in_executor`` (with ``None`` as its first argument) and awaits
+        the result.
+
+        Args:
+            prediction (str): The LLM or chain prediction to evaluate.
+            reference (Optional[str], optional): The reference label to
+                evaluate against.
+            input (Optional[str], optional): The input to consider during
+                evaluation.
+            **kwargs: Additional keyword arguments, including callbacks,
+                tags, etc.
+        Returns:
+            dict: The evaluation results containing the score or value.
+                It is recommended that the dictionary contain the following keys:
+                     - score: the score of the evaluation, if applicable.
+                     - value: the string value of the evaluation, if applicable.
+                     - reasoning: the reasoning for the evaluation, if applicable.
+        """
+        forwarded = {"prediction": prediction, "reference": reference, "input": input}
+        return await run_in_executor(
+            None,
+            self._evaluate_strings,
+            **forwarded,
+            **kwargs,
+        )