[Python] Add beta evaluate() (#542)
- [X] base `evaluate` API (usage sketch below)
- [X] wrapper to support off-the-shelf (OTS) evaluators from LangChain
- [X] add examples
hinthornw authored Mar 27, 2024
1 parent 660d29c commit 545b7fa
Showing 15 changed files with 2,093 additions and 63 deletions.
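
The checklist's new `evaluate()` entry point lives in `langsmith.evaluation` (see the `evaluation/__init__.py` diff below). Below is a rough, hedged sketch of the intended call pattern. The target function, evaluator, and dataset name are illustrative assumptions, not taken from this commit.

```python
from langsmith.evaluation import evaluate


def my_app(inputs: dict) -> dict:
    # Hypothetical system under test: echo the question upper-cased.
    return {"output": inputs["question"].upper()}


def exact_match(run, example) -> dict:
    # Custom evaluators may return a plain dict; when no "key" is supplied,
    # the evaluator's function name is used (see the client.py change below).
    return {"score": int(run.outputs["output"] == example.outputs["answer"])}


results = evaluate(
    my_app,
    data="my-dataset",  # assumed: name of an existing LangSmith dataset
    evaluators=[exact_match],
    experiment_prefix="beta-evaluate-demo",
)
```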
15 changes: 14 additions & 1 deletion .github/actions/python-integration-tests/action.yml

@@ -10,6 +10,9 @@ inputs:
   openai-api-key:
     description: "OpenAI API key"
     required: false
+  anthropic-api-key:
+    description: "Anthropic API key"
+    required: false
 runs:
   using: "composite"
   steps:
@@ -30,7 +33,7 @@ runs:
     - name: Install dependencies
       run: |
         poetry install --with dev
-        poetry run pip install -U langchain
+        poetry run pip install -U langchain langchain_anthropic tiktoken rapidfuzz
       shell: bash
       working-directory: python

@@ -42,3 +45,13 @@ runs:
       run: make integration_tests_fast
       shell: bash
       working-directory: python
+
+    - name: Run doctest
+      env:
+        LANGCHAIN_TRACING_V2: "true"
+        LANGCHAIN_API_KEY: ${{ inputs.langchain-api-key }}
+        OPENAI_API_KEY: ${{ inputs.openai-api-key }}
+        ANTHROPIC_API_KEY: ${{ inputs.anthropic-api-key }}
+      run: make doctest
+      shell: bash
+      working-directory: python
1 change: 1 addition & 0 deletions .github/workflows/integration_tests.yml

@@ -49,6 +49,7 @@ jobs:
           python-version: 3.11
           langchain-api-key: ${{ secrets.LANGCHAIN_API_KEY }}
           openai-api-key: ${{ secrets.OPENAI_API_KEY }}
+          anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }}

   js_integration_test:
     name: JS Integration Test
3 changes: 3 additions & 0 deletions python/Makefile

@@ -12,6 +12,9 @@ integration_tests:
 integration_tests_fast:
 	poetry run pytest -n auto --durations=10 -v --cov=langsmith --cov-report=term-missing --cov-report=html --cov-config=.coveragerc tests/integration_tests

+doctest:
+	poetry run pytest -n auto --durations=10 --doctest-modules langsmith
+
 lint:
 	poetry run ruff .
 	poetry run mypy .
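
The new `doctest` Makefile target runs pytest's `--doctest-modules` collector over the `langsmith` package, executing any interactive examples embedded in docstrings; that is presumably why the CI step above exports real API keys. A toy illustration of what this collects (the function below is hypothetical, not from the codebase):

```python
def add(a: int, b: int) -> int:
    """Add two integers.

    The >>> example is executed and checked by --doctest-modules.

    >>> add(2, 3)
    5
    """
    return a + b
```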
3 changes: 2 additions & 1 deletion python/langsmith/beta/__init__.py

@@ -1,5 +1,6 @@
 """Beta functionality prone to change."""

 from langsmith.beta._evals import compute_test_metrics, convert_runs_to_test
+from langsmith.beta._utils import warn_beta

-__all__ = ["convert_runs_to_test", "compute_test_metrics"]
+__all__ = ["convert_runs_to_test", "compute_test_metrics", "warn_beta"]
30 changes: 24 additions & 6 deletions python/langsmith/client.py

@@ -49,11 +49,12 @@
 from langsmith import env as ls_env
 from langsmith import schemas as ls_schemas
 from langsmith import utils as ls_utils
-from langsmith.evaluation import evaluator as ls_evaluator

 if TYPE_CHECKING:
     import pandas as pd  # type: ignore

+    from langsmith.evaluation import evaluator as ls_evaluator
+
 logger = logging.getLogger(__name__)
 _urllib3_logger = logging.getLogger("urllib3.connectionpool")

@@ -728,7 +729,9 @@ def request_with_retries(
                 args = list(e.args)
                 msg = args[1] if len(args) > 1 else ""
                 msg = msg.replace("session", "session (project)")
-                emsg = "\n".join([args[0]] + [msg] + args[2:])
+                emsg = "\n".join(
+                    [str(args[0])] + [msg] + [str(arg) for arg in args[2:]]
+                )
                 raise ls_utils.LangSmithError(
                     f"Failed to {request_method} {url} in LangSmith API. {emsg}"
                 ) from e
@@ -3144,11 +3147,20 @@ def _resolve_example_id(
     def _select_eval_results(
         self,
         results: Union[ls_evaluator.EvaluationResult, ls_evaluator.EvaluationResults],
+        *,
+        fn_name: Optional[str] = None,
     ) -> List[ls_evaluator.EvaluationResult]:
+        from langsmith.evaluation import evaluator as ls_evaluator  # noqa: F811
+
         if isinstance(results, ls_evaluator.EvaluationResult):
             results_ = [results]
-        elif isinstance(results, dict) and "results" in results:
-            results_ = cast(List[ls_evaluator.EvaluationResult], results["results"])
+        elif isinstance(results, dict):
+            if "results" in results:
+                results_ = cast(List[ls_evaluator.EvaluationResult], results["results"])
+            else:
+                results_ = [
+                    ls_evaluator.EvaluationResult(**{"key": fn_name, **results})
+                ]
         else:
             raise TypeError(
                 f"Invalid evaluation result type {type(results)}."
@@ -3208,15 +3220,20 @@ def _log_evaluation_feedback(
         evaluator_response: Union[
             ls_evaluator.EvaluationResult, ls_evaluator.EvaluationResults
         ],
-        run: ls_schemas.Run,
+        run: Optional[ls_schemas.Run] = None,
         source_info: Optional[Dict[str, Any]] = None,
+        project_id: Optional[ID_TYPE] = None,
     ) -> List[ls_evaluator.EvaluationResult]:
         results = self._select_eval_results(evaluator_response)
         for res in results:
             source_info_ = source_info or {}
             if res.evaluator_info:
                 source_info_ = {**res.evaluator_info, **source_info_}
-            run_id_ = res.target_run_id if res.target_run_id else run.id
+            run_id_ = None
+            if res.target_run_id:
+                run_id_ = res.target_run_id
+            elif run is not None:
+                run_id_ = run.id
             self.create_feedback(
                 run_id_,
                 res.key,
@@ -3227,6 +3244,7 @@
                 source_info=source_info_,
                 source_run_id=res.source_run_id,
                 feedback_source_type=ls_schemas.FeedbackSourceType.MODEL,
+                project_id=project_id,
             )
         return results

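
For context on the `_select_eval_results` change above: an evaluator can now hand back results in any of the three shapes sketched below, and a bare dict without a "key" is wrapped into an `EvaluationResult` keyed by the evaluator's function name. Only the accepted shapes come from the diff; the evaluator bodies are made-up examples.

```python
from langsmith.evaluation import EvaluationResult


def as_object(run, example):
    # A single EvaluationResult object.
    return EvaluationResult(key="correctness", score=1.0)


def as_results_dict(run, example):
    # An EvaluationResults-style dict holding several results.
    return {
        "results": [
            EvaluationResult(key="correctness", score=1.0),
            EvaluationResult(key="conciseness", score=0.5),
        ]
    }


def as_bare_dict(run, example):
    # No "key" here: the new fn_name fallback can record this feedback
    # under the evaluator's function name, e.g. "as_bare_dict".
    return {"score": 1.0, "comment": "matched the reference answer"}
```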
5 changes: 5 additions & 0 deletions python/langsmith/evaluation/__init__.py

@@ -1,11 +1,13 @@
 """Evaluation Helpers."""

+from langsmith.evaluation._runner import evaluate, evaluate_existing
 from langsmith.evaluation.evaluator import (
     EvaluationResult,
     EvaluationResults,
     RunEvaluator,
     run_evaluator,
 )
+from langsmith.evaluation.integrations._langchain import LangChainStringEvaluator
 from langsmith.evaluation.string_evaluator import StringEvaluator

 __all__ = [
@@ -14,4 +16,7 @@
     "EvaluationResults",
     "RunEvaluator",
     "StringEvaluator",
+    "evaluate",
+    "evaluate_existing",
+    "LangChainStringEvaluator",
 ]
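
The newly exported `LangChainStringEvaluator` wraps LangChain's off-the-shelf string evaluators (the `langchain_anthropic`, `tiktoken`, and `rapidfuzz` installs added to CI above support them). A hedged sketch of plugging one into `evaluate()`, assuming the wrapper accepts any name that `langchain.evaluation.load_evaluator` accepts; the target and dataset are placeholders:

```python
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Wrap LangChain's "string_distance" evaluator (backed by rapidfuzz).
string_distance = LangChainStringEvaluator("string_distance")

evaluate(
    lambda inputs: {"output": inputs["question"]},  # placeholder target
    data="my-dataset",  # assumed: name of an existing LangSmith dataset
    evaluators=[string_distance],
)
```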