diff --git a/libs/vertexai/langchain_google_vertexai/__init__.py b/libs/vertexai/langchain_google_vertexai/__init__.py
index fef80927..86b138e3 100644
--- a/libs/vertexai/langchain_google_vertexai/__init__.py
+++ b/libs/vertexai/langchain_google_vertexai/__init__.py
@@ -10,6 +10,7 @@
 from langchain_google_vertexai.chains import create_structured_runnable
 from langchain_google_vertexai.chat_models import ChatVertexAI
 from langchain_google_vertexai.embeddings import VertexAIEmbeddings
+from langchain_google_vertexai.evaluators.evaluation import VertexStringEvaluator
 from langchain_google_vertexai.functions_utils import (
     PydanticFunctionsOutputParser,
 )
@@ -68,4 +69,5 @@
     "VertexAIImageGeneratorChat",
     "VertexAIModelGarden",
     "VertexAIVisualQnAChat",
+    "VertexStringEvaluator",
 ]
diff --git a/libs/vertexai/langchain_google_vertexai/evaluators/__init__.py b/libs/vertexai/langchain_google_vertexai/evaluators/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/libs/vertexai/langchain_google_vertexai/evaluators/_core.py b/libs/vertexai/langchain_google_vertexai/evaluators/_core.py
new file mode 100644
index 00000000..69957c45
--- /dev/null
+++ b/libs/vertexai/langchain_google_vertexai/evaluators/_core.py
@@ -0,0 +1,179 @@
+"""Interfaces to be implemented by general evaluators.
+Remove once these interfaces are moved to langchain-core.
+"""
+from __future__ import annotations
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Any, Optional, Union
+from warnings import warn
+
+from langchain_core.runnables.config import run_in_executor
+
+logger = logging.getLogger(__name__)
+
+
+class _EvalArgsMixin:
+    """Mixin for checking evaluation arguments."""
+
+    @property
+    def requires_reference(self) -> bool:
+        """Whether this evaluator requires a reference label."""
+        return False
+
+    @property
+    def requires_input(self) -> bool:
+        """Whether this evaluator requires an input string."""
+        return False
+
+    @property
+    def _skip_input_warning(self) -> str:
+        """Warning to show when input is ignored."""
+        return f"Ignoring input in {self.__class__.__name__}, as it is not expected."
+
+    @property
+    def _skip_reference_warning(self) -> str:
+        """Warning to show when reference is ignored."""
+        return (
+            f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
+        )
+
+    def _check_evaluation_args(
+        self,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+    ) -> None:
+        """Check if the evaluation arguments are valid.
+
+        Args:
+            reference (Optional[str], optional): The reference label.
+            input (Optional[str], optional): The input string.
+        Raises:
+            ValueError: If the evaluator requires an input string but none is provided,
+                or if the evaluator requires a reference label but none is provided.
+ """ + if self.requires_input and input is None: + raise ValueError(f"{self.__class__.__name__} requires an input string.") + elif input is not None and not self.requires_input: + warn(self._skip_input_warning) + if self.requires_reference and reference is None: + raise ValueError(f"{self.__class__.__name__} requires a reference string.") + elif reference is not None and not self.requires_reference: + warn(self._skip_reference_warning) + + +class StringEvaluator(_EvalArgsMixin, ABC): + """Grade, tag, or otherwise evaluate predictions relative to their inputs + and/or reference labels.""" + + @property + def evaluation_name(self) -> str: + """The name of the evaluation.""" + return self.__class__.__name__ + + @property + def requires_reference(self) -> bool: + """Whether this evaluator requires a reference label.""" + return False + + @abstractmethod + def _evaluate_strings( + self, + *, + prediction: Union[str, Any], + reference: Optional[Union[str, Any]] = None, + input: Optional[Union[str, Any]] = None, + **kwargs: Any, + ) -> dict: + """Evaluate Chain or LLM output, based on optional input and label. + + Args: + prediction (str): The LLM or chain prediction to evaluate. + reference (Optional[str], optional): The reference label to evaluate against. + input (Optional[str], optional): The input to consider during evaluation. + **kwargs: Additional keyword arguments, including callbacks, tags, etc. + Returns: + dict: The evaluation results containing the score or value. + It is recommended that the dictionary contain the following keys: + - score: the score of the evaluation, if applicable. + - value: the string value of the evaluation, if applicable. + - reasoning: the reasoning for the evaluation, if applicable. + """ # noqa: E501 + + async def _aevaluate_strings( + self, + *, + prediction: Union[str, Any], + reference: Optional[Union[str, Any]] = None, + input: Optional[Union[str, Any]] = None, + **kwargs: Any, + ) -> dict: + """Asynchronously evaluate Chain or LLM output, based on optional input and label. + + Args: + prediction (str): The LLM or chain prediction to evaluate. + reference (Optional[str], optional): The reference label to evaluate against. + input (Optional[str], optional): The input to consider during evaluation. + **kwargs: Additional keyword arguments, including callbacks, tags, etc. + Returns: + dict: The evaluation results containing the score or value. + It is recommended that the dictionary contain the following keys: + - score: the score of the evaluation, if applicable. + - value: the string value of the evaluation, if applicable. + - reasoning: the reasoning for the evaluation, if applicable. + """ # noqa: E501 + return await run_in_executor( + None, + self._evaluate_strings, + prediction=prediction, + reference=reference, + input=input, + **kwargs, + ) + + def evaluate_strings( + self, + *, + prediction: str, + reference: Optional[str] = None, + input: Optional[str] = None, + **kwargs: Any, + ) -> dict: + """Evaluate Chain or LLM output, based on optional input and label. + + Args: + prediction (str): The LLM or chain prediction to evaluate. + reference (Optional[str], optional): The reference label to evaluate against. + input (Optional[str], optional): The input to consider during evaluation. + **kwargs: Additional keyword arguments, including callbacks, tags, etc. + Returns: + dict: The evaluation results containing the score or value. 
+ """ # noqa: E501 + self._check_evaluation_args(reference=reference, input=input) + return self._evaluate_strings( + prediction=prediction, reference=reference, input=input, **kwargs + ) + + async def aevaluate_strings( + self, + *, + prediction: str, + reference: Optional[str] = None, + input: Optional[str] = None, + **kwargs: Any, + ) -> dict: + """Asynchronously evaluate Chain or LLM output, based on optional input and label. + + Args: + prediction (str): The LLM or chain prediction to evaluate. + reference (Optional[str], optional): The reference label to evaluate against. + input (Optional[str], optional): The input to consider during evaluation. + **kwargs: Additional keyword arguments, including callbacks, tags, etc. + Returns: + dict: The evaluation results containing the score or value. + """ # noqa: E501 + self._check_evaluation_args(reference=reference, input=input) + return await self._aevaluate_strings( + prediction=prediction, reference=reference, input=input, **kwargs + ) diff --git a/libs/vertexai/langchain_google_vertexai/evaluators/evaluation.py b/libs/vertexai/langchain_google_vertexai/evaluators/evaluation.py new file mode 100644 index 00000000..e868a64e --- /dev/null +++ b/libs/vertexai/langchain_google_vertexai/evaluators/evaluation.py @@ -0,0 +1,213 @@ +from typing import Any, Dict, List, Optional, Sequence + +import proto # type: ignore[import-untyped] +from google.api_core.client_options import ClientOptions +from google.cloud.aiplatform.constants import base as constants +from google.cloud.aiplatform_v1beta1 import ( + EvaluationServiceAsyncClient, + EvaluationServiceClient, +) +from google.cloud.aiplatform_v1beta1.types import ( + EvaluateInstancesRequest, + EvaluateInstancesResponse, +) + +from langchain_google_vertexai._utils import ( + get_client_info, + get_user_agent, +) +from langchain_google_vertexai.evaluators._core import StringEvaluator + +_METRICS = [ + "bleu", + "exact_match", + "rouge", + "coherence", + "fluency", + "safety", + "groundedness", + "fulfillment", + "summarization_quality", + "summarization_helpfulness", + "summarization_verbosity", + "question_answering_quality", + "question_answering_relevance", + "question_answering_correctness", +] +_METRICS_INPUTS = { + "rouge1": {"rouge_type": "rouge1"}, + "rouge2": {"rouge_type": "rouge2"}, + "rougeL": {"rouge_type": "rougeL"}, + "rougeLsum": {"rouge_type": "rougeLsum"}, +} +_METRICS_ATTRS = { + "safety": ["prediction"], + "coherence": ["prediction"], + "fluency": ["prediction"], + "groundedness": ["context", "prediction"], + "fulfillment": ["prediction", "instruction"], + "summarization_quality": ["prediction", "instruction", "context"], + "summarization_helpfulness": ["prediction", "context"], + "summarization_verbosity": ["prediction", "context"], + "question_answering_quality": ["prediction", "context", "instruction"], + "question_answering_relevance": ["prediction", "instruction"], + "question_answering_correctness": ["prediction", "instruction"], +} +_METRICS_OPTIONAL_ATTRS = { + "summarization_quality": ["reference"], + "summarization_helpfulness": ["reference", "instruction"], + "summarization_verbosity": ["reference", "instruction"], + "question_answering_quality": ["reference"], + "question_answering_relevance": ["reference", "context"], + "question_answering_correctness": ["reference", "context"], +} +# a client supports multiple instances per request for these metrics +_METRICS_MULTIPLE_INSTANCES = ["bleu", "exact_match", "rouge"] + + +def _format_metric(metric: str) -> str: + if 
+    if metric.startswith("rouge"):
+        return "rouge"
+    return metric
+
+
+def _format_instance(instance: Dict[str, str], metric: str) -> Dict[str, str]:
+    attrs = _METRICS_ATTRS.get(metric, ["prediction", "reference"])
+    result = {a: instance[a] for a in attrs}
+    for attr in _METRICS_OPTIONAL_ATTRS.get(metric, []):
+        if attr in instance:
+            result[attr] = instance[attr]
+    return result
+
+
+def _prepare_request(
+    instances: Sequence[Dict[str, str]], metric: str, location: str
+) -> EvaluateInstancesRequest:
+    request = EvaluateInstancesRequest()
+    metric_input: Dict[str, Any] = {"metric_spec": _METRICS_INPUTS.get(metric, {})}
+    if _format_metric(metric) not in _METRICS_MULTIPLE_INSTANCES:
+        if len(instances) > 1:
+            raise ValueError(
+                f"Metric {metric} supports only a single instance per request, "
+                f"got {len(instances)}!"
+            )
+        metric_input["instance"] = _format_instance(instances[0], metric=metric)
+    else:
+        metric_input["instances"] = [
+            _format_instance(i, metric=metric) for i in instances
+        ]
+    setattr(request, f"{_format_metric(metric)}_input", metric_input)
+    request.location = location
+    return request
+
+
+def _parse_response(
+    response: EvaluateInstancesResponse, metric: str
+) -> List[Dict[str, Any]]:
+    metric = _format_metric(metric)
+    result = proto.Message.to_dict(response)
+    if metric in _METRICS_MULTIPLE_INSTANCES:
+        return result[f"{metric}_results"][f"{metric}_metric_values"]
+    return [result[f"{metric}_result"]]
+
+
+class VertexStringEvaluator(StringEvaluator):
+    """Evaluate predicted strings with the Vertex AI evaluation service."""
+
+    def __init__(self, metric: str, project_id: str, location: str = "us-central1"):
+        self._metric = metric
+        client_options = ClientOptions(
+            api_endpoint=f"{location}-{constants.PREDICTION_API_BASE_PATH}"
+        )
+        self._client = EvaluationServiceClient(
+            client_options=client_options,
+            client_info=get_client_info(module=self._user_agent),
+        )
+        self._async_client = EvaluationServiceAsyncClient(
+            client_options=client_options,
+            client_info=get_client_info(module=self._user_agent),
+        )
+        self._location = self._client.common_location_path(project_id, location)
+        if _format_metric(metric) not in _METRICS:
+            raise ValueError(f"Metric {metric} is not supported yet!")
+
+    def _evaluate_strings(
+        self,
+        *,
+        prediction: str,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        request = self._prepare_request(prediction, reference, input, **kwargs)
+        response = self._client.evaluate_instances(request)
+        return _parse_response(response, metric=self._metric)[0]
+
+    @property
+    def _user_agent(self) -> str:
+        """Gets the User Agent."""
+        _, user_agent = get_user_agent(f"{type(self).__name__}_{self._metric}")
+        return user_agent
+
+    def evaluate(
+        self,
+        examples: Sequence[Dict[str, str]],
+        predictions: Sequence[Dict[str, str]],
+        *,
+        question_key: str = "context",
+        answer_key: str = "reference",
+        prediction_key: str = "prediction",
+        instruction_key: str = "instruction",
+        **kwargs: Any,
+    ) -> List[dict]:
+        instances: List[dict] = []
+        for example, prediction in zip(examples, predictions):
+            row = {"prediction": prediction[prediction_key]}
+            if answer_key in example:
+                row["reference"] = example[answer_key]
+            if question_key in example:
+                row["context"] = example[question_key]
+            if instruction_key in example:
+                row["instruction"] = example[instruction_key]
+            instances.append(row)
+
+        if self._metric in _METRICS_MULTIPLE_INSTANCES:
+            request = _prepare_request(
+                instances, metric=self._metric, location=self._location
+            )
+            response = self._client.evaluate_instances(request)
+            return _parse_response(response, metric=self._metric)
+        else:
+            return [self._evaluate_strings(**i) for i in instances]
+
+    def _prepare_request(
+        self,
+        prediction: str,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+        **kwargs: Any,
+    ) -> EvaluateInstancesRequest:
+        instance = {"prediction": prediction}
+        if reference:
+            instance["reference"] = reference
+        if input:
+            instance["context"] = input
+        if "instruction" in kwargs:
+            instance["instruction"] = kwargs["instruction"]
+        if "context" in kwargs:
+            instance["context"] = kwargs["context"]
+        return _prepare_request(
+            [instance], metric=self._metric, location=self._location
+        )
+
+    async def _aevaluate_strings(
+        self,
+        *,
+        prediction: str,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        request = self._prepare_request(prediction, reference, input, **kwargs)
+        response = await self._async_client.evaluate_instances(request)
+        return _parse_response(response, metric=self._metric)[0]
diff --git a/libs/vertexai/pyproject.toml b/libs/vertexai/pyproject.toml
index 0d705d5c..ce5c2aeb 100644
--- a/libs/vertexai/pyproject.toml
+++ b/libs/vertexai/pyproject.toml
@@ -36,6 +36,9 @@
 numexpr = "^2.8.6"
 google-api-python-client = "^2.117.0"
 langchain-core = { git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/core" }
 
+[tool.codespell]
+ignore-words-list = "rouge"
+
 [tool.poetry.group.codespell]
 optional = true
diff --git a/libs/vertexai/tests/integration_tests/test_chat_models.py b/libs/vertexai/tests/integration_tests/test_chat_models.py
index b3857830..de6a5b23 100644
--- a/libs/vertexai/tests/integration_tests/test_chat_models.py
+++ b/libs/vertexai/tests/integration_tests/test_chat_models.py
@@ -524,6 +524,7 @@ class MyModel(BaseModel):
 
 
 @pytest.mark.release
+@pytest.mark.xfail(reason="flaky")
 def test_chat_vertexai_gemini_function_calling_with_multiple_parts() -> None:
     @tool
     def search(
diff --git a/libs/vertexai/tests/integration_tests/test_evaluation.py b/libs/vertexai/tests/integration_tests/test_evaluation.py
new file mode 100644
index 00000000..ff35de77
--- /dev/null
+++ b/libs/vertexai/tests/integration_tests/test_evaluation.py
@@ -0,0 +1,51 @@
+import os
+
+import pytest
+
+from langchain_google_vertexai import VertexStringEvaluator
+
+
+@pytest.mark.release
+def test_evaluate() -> None:
+    evaluator = VertexStringEvaluator(
+        metric="bleu", project_id=os.environ["PROJECT_ID"]
+    )
+    result = evaluator.evaluate(
+        examples=[
+            {"reference": "This is a test."},
+            {"reference": "This is another test."},
+        ],
+        predictions=[
+            {"prediction": "This is a test."},
+            {"prediction": "This is another one."},
+        ],
+    )
+    assert len(result) == 2
+    assert result[0]["score"] == 1.0
+    assert result[1]["score"] < 1.0
+
+
+@pytest.mark.release
+def test_evaluate_strings() -> None:
+    evaluator = VertexStringEvaluator(
+        metric="safety", project_id=os.environ["PROJECT_ID"]
+    )
+    result = evaluator._evaluate_strings(prediction="This is a test")
+    assert isinstance(result, dict)
+    assert "score" in result
+    assert "explanation" in result
+
+
+@pytest.mark.release
+async def test_aevaluate_strings() -> None:
+    evaluator = VertexStringEvaluator(
+        metric="question_answering_quality", project_id=os.environ["PROJECT_ID"]
+    )
+    result = await evaluator._aevaluate_strings(
+        prediction="London",
+        input="What is the capital of Great Britain?",
+        instruction="Be concise",
+    )
+    assert isinstance(result, dict)
assert "score" in result + assert "explanation" in result diff --git a/libs/vertexai/tests/unit_tests/test_evaluation.py b/libs/vertexai/tests/unit_tests/test_evaluation.py new file mode 100644 index 00000000..80cc1ca1 --- /dev/null +++ b/libs/vertexai/tests/unit_tests/test_evaluation.py @@ -0,0 +1,78 @@ +from unittest.mock import MagicMock, patch + +from google.cloud.aiplatform_v1beta1.types import ( + EvaluateInstancesRequest, + EvaluateInstancesResponse, +) + +from langchain_google_vertexai import VertexStringEvaluator +from langchain_google_vertexai.evaluators.evaluation import _prepare_request + + +def test_prepare_request_rouge() -> None: + instances = [ + {"prediction": "test1", "reference": "test2"}, + {"prediction": "test3", "reference": "test4"}, + ] + request = _prepare_request( + instances, metric="rougeL", location="project/123/location/moon1" + ) + expected = EvaluateInstancesRequest( + rouge_input={"metric_spec": {"rouge_type": "rougeL"}, "instances": instances}, + location="project/123/location/moon1", + ) + assert expected == request + + +def test_prepare_request_coherence() -> None: + instance = {"prediction": "test1"} + request = _prepare_request( + [instance], metric="coherence", location="project/123/location/moon1" + ) + expected = EvaluateInstancesRequest( + coherence_input={"metric_spec": {}, "instance": instance}, + location="project/123/location/moon1", + ) + assert expected == request + + +def test_prepare_request_question_answering_correctness() -> None: + instance = {"prediction": "test1", "instruction": "test2", "context": "test3"} + request = _prepare_request( + [instance], + metric="question_answering_correctness", + location="project/123/location/moon1", + ) + expected = EvaluateInstancesRequest( + question_answering_correctness_input={"metric_spec": {}, "instance": instance}, + location="project/123/location/moon1", + ) + assert expected == request + + +def test_evaluate(): + with patch( + "langchain_google_vertexai.evaluators.evaluation.EvaluationServiceClient" + ) as mc: + with patch( + "langchain_google_vertexai.evaluators.evaluation.EvaluationServiceAsyncClient" + ) as amc: + evaluator = VertexStringEvaluator( + metric="bleu", project_id="test", location="moon1" + ) + mc.assert_called_once() + amc.assert_called_once() + evaluator._location = "test" + + mock_evaluate = MagicMock() + mock_evaluate.return_value = EvaluateInstancesResponse( + bleu_results={"bleu_metric_values": [{"score": 1.0}, {"score": 0.5}]} + ) + mc.return_value.evaluate_instances = mock_evaluate + + result = evaluator.evaluate( + examples=[{"reference": "test1"}, {"reference": "test2"}], + predictions=[{"prediction": "test3"}, {"prediction": "test4"}], + ) + mock_evaluate.assert_called_once() + assert result == [{"score": 1.0}, {"score": 0.5}] diff --git a/libs/vertexai/tests/unit_tests/test_imports.py b/libs/vertexai/tests/unit_tests/test_imports.py index 4bbf5852..d6874da6 100644 --- a/libs/vertexai/tests/unit_tests/test_imports.py +++ b/libs/vertexai/tests/unit_tests/test_imports.py @@ -30,6 +30,7 @@ "VertexAIImageGeneratorChat", "VertexAIModelGarden", "VertexAIVisualQnAChat", + "VertexStringEvaluator", ]