diff --git a/libs/vertexai/langchain_google_vertexai/__init__.py b/libs/vertexai/langchain_google_vertexai/__init__.py
index fef80927..86b138e3 100644
--- a/libs/vertexai/langchain_google_vertexai/__init__.py
+++ b/libs/vertexai/langchain_google_vertexai/__init__.py
@@ -10,6 +10,7 @@
 from langchain_google_vertexai.chains import create_structured_runnable
 from langchain_google_vertexai.chat_models import ChatVertexAI
 from langchain_google_vertexai.embeddings import VertexAIEmbeddings
+from langchain_google_vertexai.evaluators.evaluation import VertexStringEvaluator
 from langchain_google_vertexai.functions_utils import (
     PydanticFunctionsOutputParser,
 )
@@ -68,4 +69,5 @@
     "VertexAIImageGeneratorChat",
     "VertexAIModelGarden",
     "VertexAIVisualQnAChat",
+    "VertexStringEvaluator",
 ]
diff --git a/libs/vertexai/langchain_google_vertexai/evaluators/__init__.py b/libs/vertexai/langchain_google_vertexai/evaluators/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/libs/vertexai/langchain_google_vertexai/evaluators/_core.py b/libs/vertexai/langchain_google_vertexai/evaluators/_core.py
new file mode 100644
index 00000000..69957c45
--- /dev/null
+++ b/libs/vertexai/langchain_google_vertexai/evaluators/_core.py
@@ -0,0 +1,179 @@
+"""Interfaces to be implemented by general evaluators.
+Remove once these interfaces are moved to langchain-core.
+"""
+from __future__ import annotations
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Any, Optional, Union
+from warnings import warn
+
+from langchain_core.runnables.config import run_in_executor
+
+logger = logging.getLogger(__name__)
+
+
+class _EvalArgsMixin:
+    """Mixin for checking evaluation arguments."""
+
+    @property
+    def requires_reference(self) -> bool:
+        """Whether this evaluator requires a reference label."""
+        return False
+
+    @property
+    def requires_input(self) -> bool:
+        """Whether this evaluator requires an input string."""
+        return False
+
+    @property
+    def _skip_input_warning(self) -> str:
+        """Warning to show when input is ignored."""
+        return f"Ignoring input in {self.__class__.__name__}, as it is not expected."
+
+    @property
+    def _skip_reference_warning(self) -> str:
+        """Warning to show when reference is ignored."""
+        return (
+            f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
+        )
+
+    def _check_evaluation_args(
+        self,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+    ) -> None:
+        """Check if the evaluation arguments are valid.
+
+        Args:
+            reference (Optional[str], optional): The reference label.
+            input (Optional[str], optional): The input string.
+        Raises:
+            ValueError: If the evaluator requires an input string but none is provided,
+                or if the evaluator requires a reference label but none is provided.
+ """ + if self.requires_input and input is None: + raise ValueError(f"{self.__class__.__name__} requires an input string.") + elif input is not None and not self.requires_input: + warn(self._skip_input_warning) + if self.requires_reference and reference is None: + raise ValueError(f"{self.__class__.__name__} requires a reference string.") + elif reference is not None and not self.requires_reference: + warn(self._skip_reference_warning) + + +class StringEvaluator(_EvalArgsMixin, ABC): + """Grade, tag, or otherwise evaluate predictions relative to their inputs + and/or reference labels.""" + + @property + def evaluation_name(self) -> str: + """The name of the evaluation.""" + return self.__class__.__name__ + + @property + def requires_reference(self) -> bool: + """Whether this evaluator requires a reference label.""" + return False + + @abstractmethod + def _evaluate_strings( + self, + *, + prediction: Union[str, Any], + reference: Optional[Union[str, Any]] = None, + input: Optional[Union[str, Any]] = None, + **kwargs: Any, + ) -> dict: + """Evaluate Chain or LLM output, based on optional input and label. + + Args: + prediction (str): The LLM or chain prediction to evaluate. + reference (Optional[str], optional): The reference label to evaluate against. + input (Optional[str], optional): The input to consider during evaluation. + **kwargs: Additional keyword arguments, including callbacks, tags, etc. + Returns: + dict: The evaluation results containing the score or value. + It is recommended that the dictionary contain the following keys: + - score: the score of the evaluation, if applicable. + - value: the string value of the evaluation, if applicable. + - reasoning: the reasoning for the evaluation, if applicable. + """ # noqa: E501 + + async def _aevaluate_strings( + self, + *, + prediction: Union[str, Any], + reference: Optional[Union[str, Any]] = None, + input: Optional[Union[str, Any]] = None, + **kwargs: Any, + ) -> dict: + """Asynchronously evaluate Chain or LLM output, based on optional input and label. + + Args: + prediction (str): The LLM or chain prediction to evaluate. + reference (Optional[str], optional): The reference label to evaluate against. + input (Optional[str], optional): The input to consider during evaluation. + **kwargs: Additional keyword arguments, including callbacks, tags, etc. + Returns: + dict: The evaluation results containing the score or value. + It is recommended that the dictionary contain the following keys: + - score: the score of the evaluation, if applicable. + - value: the string value of the evaluation, if applicable. + - reasoning: the reasoning for the evaluation, if applicable. + """ # noqa: E501 + return await run_in_executor( + None, + self._evaluate_strings, + prediction=prediction, + reference=reference, + input=input, + **kwargs, + ) + + def evaluate_strings( + self, + *, + prediction: str, + reference: Optional[str] = None, + input: Optional[str] = None, + **kwargs: Any, + ) -> dict: + """Evaluate Chain or LLM output, based on optional input and label. + + Args: + prediction (str): The LLM or chain prediction to evaluate. + reference (Optional[str], optional): The reference label to evaluate against. + input (Optional[str], optional): The input to consider during evaluation. + **kwargs: Additional keyword arguments, including callbacks, tags, etc. + Returns: + dict: The evaluation results containing the score or value. 
+ """ # noqa: E501 + self._check_evaluation_args(reference=reference, input=input) + return self._evaluate_strings( + prediction=prediction, reference=reference, input=input, **kwargs + ) + + async def aevaluate_strings( + self, + *, + prediction: str, + reference: Optional[str] = None, + input: Optional[str] = None, + **kwargs: Any, + ) -> dict: + """Asynchronously evaluate Chain or LLM output, based on optional input and label. + + Args: + prediction (str): The LLM or chain prediction to evaluate. + reference (Optional[str], optional): The reference label to evaluate against. + input (Optional[str], optional): The input to consider during evaluation. + **kwargs: Additional keyword arguments, including callbacks, tags, etc. + Returns: + dict: The evaluation results containing the score or value. + """ # noqa: E501 + self._check_evaluation_args(reference=reference, input=input) + return await self._aevaluate_strings( + prediction=prediction, reference=reference, input=input, **kwargs + ) diff --git a/libs/vertexai/langchain_google_vertexai/evaluators/evaluation.py b/libs/vertexai/langchain_google_vertexai/evaluators/evaluation.py new file mode 100644 index 00000000..e868a64e --- /dev/null +++ b/libs/vertexai/langchain_google_vertexai/evaluators/evaluation.py @@ -0,0 +1,213 @@ +from typing import Any, Dict, List, Optional, Sequence + +import proto # type: ignore[import-untyped] +from google.api_core.client_options import ClientOptions +from google.cloud.aiplatform.constants import base as constants +from google.cloud.aiplatform_v1beta1 import ( + EvaluationServiceAsyncClient, + EvaluationServiceClient, +) +from google.cloud.aiplatform_v1beta1.types import ( + EvaluateInstancesRequest, + EvaluateInstancesResponse, +) + +from langchain_google_vertexai._utils import ( + get_client_info, + get_user_agent, +) +from langchain_google_vertexai.evaluators._core import StringEvaluator + +_METRICS = [ + "bleu", + "exact_match", + "rouge", + "coherence", + "fluency", + "safety", + "groundedness", + "fulfillment", + "summarization_quality", + "summarization_helpfulness", + "summarization_verbosity", + "question_answering_quality", + "question_answering_relevance", + "question_answering_correctness", +] +_METRICS_INPUTS = { + "rouge1": {"rouge_type": "rouge1"}, + "rouge2": {"rouge_type": "rouge2"}, + "rougeL": {"rouge_type": "rougeL"}, + "rougeLsum": {"rouge_type": "rougeLsum"}, +} +_METRICS_ATTRS = { + "safety": ["prediction"], + "coherence": ["prediction"], + "fluency": ["prediction"], + "groundedness": ["context", "prediction"], + "fulfillment": ["prediction", "instruction"], + "summarization_quality": ["prediction", "instruction", "context"], + "summarization_helpfulness": ["prediction", "context"], + "summarization_verbosity": ["prediction", "context"], + "question_answering_quality": ["prediction", "context", "instruction"], + "question_answering_relevance": ["prediction", "instruction"], + "question_answering_correctness": ["prediction", "instruction"], +} +_METRICS_OPTIONAL_ATTRS = { + "summarization_quality": ["reference"], + "summarization_helpfulness": ["reference", "instruction"], + "summarization_verbosity": ["reference", "instruction"], + "question_answering_quality": ["reference"], + "question_answering_relevance": ["reference", "context"], + "question_answering_correctness": ["reference", "context"], +} +# a client supports multiple instances per request for these metrics +_METRICS_MULTIPLE_INSTANCES = ["bleu", "exact_match", "rouge"] + + +def _format_metric(metric: str) -> str: + if 
+    if metric.startswith("rouge"):
+        return "rouge"
+    return metric
+
+
+def _format_instance(instance: Dict[str, str], metric: str) -> Dict[str, str]:
+    attrs = _METRICS_ATTRS.get(metric, ["prediction", "reference"])
+    result = {a: instance[a] for a in attrs}
+    for attr in _METRICS_OPTIONAL_ATTRS.get(metric, []):
+        if attr in instance:
+            result[attr] = instance[attr]
+    return result
+
+
+def _prepare_request(
+    instances: Sequence[Dict[str, str]], metric: str, location: str
+) -> EvaluateInstancesRequest:
+    request = EvaluateInstancesRequest()
+    metric_input: Dict[str, Any] = {"metric_spec": _METRICS_INPUTS.get(metric, {})}
+    if _format_metric(metric) not in _METRICS_MULTIPLE_INSTANCES:
+        if len(instances) > 1:
+            raise ValueError(
+                f"Metric {metric} supports only a single instance per request, "
+                f"got {len(instances)}!"
+            )
+        metric_input["instance"] = _format_instance(instances[0], metric=metric)
+    else:
+        metric_input["instances"] = [
+            _format_instance(i, metric=metric) for i in instances
+        ]
+    setattr(request, f"{_format_metric(metric)}_input", metric_input)
+    request.location = location
+    return request
+
+
+def _parse_response(
+    response: EvaluateInstancesResponse, metric: str
+) -> List[Dict[str, Any]]:
+    metric = _format_metric(metric)
+    result = proto.Message.to_dict(response)
+    if metric in _METRICS_MULTIPLE_INSTANCES:
+        return result[f"{metric}_results"][f"{metric}_metric_values"]
+    return [result[f"{metric}_result"]]
+
+
+class VertexStringEvaluator(StringEvaluator):
+    """Evaluate predicted strings with the Vertex AI evaluation service."""
+
+    def __init__(self, metric: str, project_id: str, location: str = "us-central1"):
+        self._metric = metric
+        client_options = ClientOptions(
+            api_endpoint=f"{location}-{constants.PREDICTION_API_BASE_PATH}"
+        )
+        self._client = EvaluationServiceClient(
+            client_options=client_options,
+            client_info=get_client_info(module=self._user_agent),
+        )
+        self._async_client = EvaluationServiceAsyncClient(
+            client_options=client_options,
+            client_info=get_client_info(module=self._user_agent),
+        )
+        self._location = self._client.common_location_path(project_id, location)
+        if _format_metric(metric) not in _METRICS:
+            raise ValueError(f"Metric {metric} is not supported yet!")
+
+    def _evaluate_strings(
+        self,
+        *,
+        prediction: str,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        request = self._prepare_request(prediction, reference, input, **kwargs)
+        response = self._client.evaluate_instances(request)
+        return _parse_response(response, metric=self._metric)[0]
+
+    @property
+    def _user_agent(self) -> str:
+        """Gets the User Agent."""
+        _, user_agent = get_user_agent(f"{type(self).__name__}_{self._metric}")
+        return user_agent
+
+    def evaluate(
+        self,
+        examples: Sequence[Dict[str, str]],
+        predictions: Sequence[Dict[str, str]],
+        *,
+        question_key: str = "context",
+        answer_key: str = "reference",
+        prediction_key: str = "prediction",
+        instruction_key: str = "instruction",
+        **kwargs: Any,
+    ) -> List[dict]:
+        instances: List[dict] = []
+        for example, prediction in zip(examples, predictions):
+            row = {"prediction": prediction[prediction_key]}
+            if answer_key in example:
+                row["reference"] = example[answer_key]
+            if question_key in example:
+                row["context"] = example[question_key]
+            if instruction_key in example:
+                row["instruction"] = example[instruction_key]
+            instances.append(row)
+
+        if self._metric in _METRICS_MULTIPLE_INSTANCES:
+            request = _prepare_request(
+                instances, metric=self._metric, location=self._location
+            )
+            response = self._client.evaluate_instances(request)
+            return _parse_response(response, metric=self._metric)
+        else:
+            return [self._evaluate_strings(**i) for i in instances]
+
+    def _prepare_request(
+        self,
+        prediction: str,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+        **kwargs: Any,
+    ) -> EvaluateInstancesRequest:
+        instance = {"prediction": prediction}
+        if reference:
+            instance["reference"] = reference
+        if input:
+            instance["context"] = input
+        if "instruction" in kwargs:
+            instance["instruction"] = kwargs["instruction"]
+        if "context" in kwargs:
+            instance["context"] = kwargs["context"]
+        return _prepare_request(
+            [instance], metric=self._metric, location=self._location
+        )
+
+    async def _aevaluate_strings(
+        self,
+        *,
+        prediction: str,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        request = self._prepare_request(prediction, reference, input, **kwargs)
+        response = await self._async_client.evaluate_instances(request)
+        return _parse_response(response, metric=self._metric)[0]
diff --git a/libs/vertexai/pyproject.toml b/libs/vertexai/pyproject.toml
index 0d705d5c..ce5c2aeb 100644
--- a/libs/vertexai/pyproject.toml
+++ b/libs/vertexai/pyproject.toml
@@ -36,6 +36,9 @@
 numexpr = "^2.8.6"
 google-api-python-client = "^2.117.0"
 langchain-core = { git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/core" }
 
+[tool.codespell]
+ignore-words-list = "rouge"
+
 [tool.poetry.group.codespell]
 optional = true
diff --git a/libs/vertexai/tests/integration_tests/test_chat_models.py b/libs/vertexai/tests/integration_tests/test_chat_models.py
index b3857830..de6a5b23 100644
--- a/libs/vertexai/tests/integration_tests/test_chat_models.py
+++ b/libs/vertexai/tests/integration_tests/test_chat_models.py
@@ -524,6 +524,7 @@ class MyModel(BaseModel):
 
 
 @pytest.mark.release
+@pytest.mark.xfail(reason="flaky")
 def test_chat_vertexai_gemini_function_calling_with_multiple_parts() -> None:
     @tool
     def search(
diff --git a/libs/vertexai/tests/integration_tests/test_evaluation.py b/libs/vertexai/tests/integration_tests/test_evaluation.py
new file mode 100644
index 00000000..ff35de77
--- /dev/null
+++ b/libs/vertexai/tests/integration_tests/test_evaluation.py
@@ -0,0 +1,51 @@
+import os
+
+import pytest
+
+from langchain_google_vertexai import VertexStringEvaluator
+
+
+@pytest.mark.release
+def test_evaluate() -> None:
+    evaluator = VertexStringEvaluator(
+        metric="bleu", project_id=os.environ["PROJECT_ID"]
+    )
+    result = evaluator.evaluate(
+        examples=[
+            {"reference": "This is a test."},
+            {"reference": "This is another test."},
+        ],
+        predictions=[
+            {"prediction": "This is a test."},
+            {"prediction": "This is another one."},
+        ],
+    )
+    assert len(result) == 2
+    assert result[0]["score"] == 1.0
+    assert result[1]["score"] < 1.0
+
+
+@pytest.mark.release
+def test_evaluate_strings() -> None:
+    evaluator = VertexStringEvaluator(
+        metric="safety", project_id=os.environ["PROJECT_ID"]
+    )
+    result = evaluator._evaluate_strings(prediction="This is a test")
+    assert isinstance(result, dict)
+    assert "score" in result
+    assert "explanation" in result
+
+
+@pytest.mark.release
+async def test_aevaluate_strings() -> None:
+    evaluator = VertexStringEvaluator(
+        metric="question_answering_quality", project_id=os.environ["PROJECT_ID"]
+    )
+    result = await evaluator._aevaluate_strings(
+        prediction="London",
+        input="What is the capital of Great Britain?",
+        instruction="Be concise",
+    )
+    assert isinstance(result, dict)
assert "score" in result + assert "explanation" in result diff --git a/libs/vertexai/tests/unit_tests/test_evaluation.py b/libs/vertexai/tests/unit_tests/test_evaluation.py new file mode 100644 index 00000000..80cc1ca1 --- /dev/null +++ b/libs/vertexai/tests/unit_tests/test_evaluation.py @@ -0,0 +1,78 @@ +from unittest.mock import MagicMock, patch + +from google.cloud.aiplatform_v1beta1.types import ( + EvaluateInstancesRequest, + EvaluateInstancesResponse, +) + +from langchain_google_vertexai import VertexStringEvaluator +from langchain_google_vertexai.evaluators.evaluation import _prepare_request + + +def test_prepare_request_rouge() -> None: + instances = [ + {"prediction": "test1", "reference": "test2"}, + {"prediction": "test3", "reference": "test4"}, + ] + request = _prepare_request( + instances, metric="rougeL", location="project/123/location/moon1" + ) + expected = EvaluateInstancesRequest( + rouge_input={"metric_spec": {"rouge_type": "rougeL"}, "instances": instances}, + location="project/123/location/moon1", + ) + assert expected == request + + +def test_prepare_request_coherence() -> None: + instance = {"prediction": "test1"} + request = _prepare_request( + [instance], metric="coherence", location="project/123/location/moon1" + ) + expected = EvaluateInstancesRequest( + coherence_input={"metric_spec": {}, "instance": instance}, + location="project/123/location/moon1", + ) + assert expected == request + + +def test_prepare_request_question_answering_correctness() -> None: + instance = {"prediction": "test1", "instruction": "test2", "context": "test3"} + request = _prepare_request( + [instance], + metric="question_answering_correctness", + location="project/123/location/moon1", + ) + expected = EvaluateInstancesRequest( + question_answering_correctness_input={"metric_spec": {}, "instance": instance}, + location="project/123/location/moon1", + ) + assert expected == request + + +def test_evaluate(): + with patch( + "langchain_google_vertexai.evaluators.evaluation.EvaluationServiceClient" + ) as mc: + with patch( + "langchain_google_vertexai.evaluators.evaluation.EvaluationServiceAsyncClient" + ) as amc: + evaluator = VertexStringEvaluator( + metric="bleu", project_id="test", location="moon1" + ) + mc.assert_called_once() + amc.assert_called_once() + evaluator._location = "test" + + mock_evaluate = MagicMock() + mock_evaluate.return_value = EvaluateInstancesResponse( + bleu_results={"bleu_metric_values": [{"score": 1.0}, {"score": 0.5}]} + ) + mc.return_value.evaluate_instances = mock_evaluate + + result = evaluator.evaluate( + examples=[{"reference": "test1"}, {"reference": "test2"}], + predictions=[{"prediction": "test3"}, {"prediction": "test4"}], + ) + mock_evaluate.assert_called_once() + assert result == [{"score": 1.0}, {"score": 0.5}] diff --git a/libs/vertexai/tests/unit_tests/test_imports.py b/libs/vertexai/tests/unit_tests/test_imports.py index 4bbf5852..d6874da6 100644 --- a/libs/vertexai/tests/unit_tests/test_imports.py +++ b/libs/vertexai/tests/unit_tests/test_imports.py @@ -30,6 +30,7 @@ "VertexAIImageGeneratorChat", "VertexAIModelGarden", "VertexAIVisualQnAChat", + "VertexStringEvaluator", ]