From 545b7fae958a8730a59e2282d585b918b47ee6d2 Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Tue, 26 Mar 2024 18:35:14 -0700 Subject: [PATCH] [Python] Add beta `evaluate()` (#542) - [X] base `evaluate` api - [X] support wrapper to support OTS evaluators from langchain - [X] add examples --- .../python-integration-tests/action.yml | 15 +- .github/workflows/integration_tests.yml | 1 + python/Makefile | 3 + python/langsmith/beta/__init__.py | 3 +- python/langsmith/client.py | 30 +- python/langsmith/evaluation/__init__.py | 5 + .../langsmith/evaluation/_name_generation.py | 727 +++++++++++++ python/langsmith/evaluation/_runner.py | 990 ++++++++++++++++++ python/langsmith/evaluation/evaluator.py | 4 + .../evaluation/integrations/__init__.py | 8 + .../evaluation/integrations/_langchain.py | 243 +++++ python/langsmith/run_helpers.py | 65 +- python/poetry.lock | 13 +- python/pyproject.toml | 3 +- python/tests/integration_tests/test_client.py | 46 - 15 files changed, 2093 insertions(+), 63 deletions(-) create mode 100644 python/langsmith/evaluation/_name_generation.py create mode 100644 python/langsmith/evaluation/_runner.py create mode 100644 python/langsmith/evaluation/integrations/__init__.py create mode 100644 python/langsmith/evaluation/integrations/_langchain.py diff --git a/.github/actions/python-integration-tests/action.yml b/.github/actions/python-integration-tests/action.yml index d8a06a5b4..a107bc56c 100644 --- a/.github/actions/python-integration-tests/action.yml +++ b/.github/actions/python-integration-tests/action.yml @@ -10,6 +10,9 @@ inputs: openai-api-key: description: "OpenAI API key" required: false + anthropic-api-key: + description: "Anthropic API key" + required: false runs: using: "composite" steps: @@ -30,7 +33,7 @@ runs: - name: Install dependencies run: | poetry install --with dev - poetry run pip install -U langchain + poetry run pip install -U langchain langchain_anthropic tiktoken rapidfuzz shell: 
bash working-directory: python @@ -42,3 +45,13 @@ runs: run: make integration_tests_fast shell: bash working-directory: python + + - name: Run doctest + env: + LANGCHAIN_TRACING_V2: "true" + LANGCHAIN_API_KEY: ${{ inputs.langchain-api-key }} + OPENAI_API_KEY: ${{ inputs.openai-api-key }} + ANTHROPIC_API_KEY: ${{ inputs.anthropic-api-key }} + run: make doctest + shell: bash + working-directory: python diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 23de6b5ea..fda330b7e 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -49,6 +49,7 @@ jobs: python-version: 3.11 langchain-api-key: ${{ secrets.LANGCHAIN_API_KEY }} openai-api-key: ${{ secrets.OPENAI_API_KEY }} + anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }} js_integration_test: name: JS Integration Test diff --git a/python/Makefile b/python/Makefile index 5e448866b..a50228ccc 100644 --- a/python/Makefile +++ b/python/Makefile @@ -12,6 +12,9 @@ integration_tests: integration_tests_fast: poetry run pytest -n auto --durations=10 -v --cov=langsmith --cov-report=term-missing --cov-report=html --cov-config=.coveragerc tests/integration_tests +doctest: + poetry run pytest -n auto --durations=10 --doctest-modules langsmith + lint: poetry run ruff . poetry run mypy . 
diff --git a/python/langsmith/beta/__init__.py b/python/langsmith/beta/__init__.py index f9e152fd3..9240296a3 100644 --- a/python/langsmith/beta/__init__.py +++ b/python/langsmith/beta/__init__.py @@ -1,5 +1,6 @@ """Beta functionality prone to change.""" from langsmith.beta._evals import compute_test_metrics, convert_runs_to_test +from langsmith.beta._utils import warn_beta -__all__ = ["convert_runs_to_test", "compute_test_metrics"] +__all__ = ["convert_runs_to_test", "compute_test_metrics", "warn_beta"] diff --git a/python/langsmith/client.py b/python/langsmith/client.py index 0fe79c8fa..0280d390a 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -49,11 +49,12 @@ from langsmith import env as ls_env from langsmith import schemas as ls_schemas from langsmith import utils as ls_utils -from langsmith.evaluation import evaluator as ls_evaluator if TYPE_CHECKING: import pandas as pd # type: ignore + from langsmith.evaluation import evaluator as ls_evaluator + logger = logging.getLogger(__name__) _urllib3_logger = logging.getLogger("urllib3.connectionpool") @@ -728,7 +729,9 @@ def request_with_retries( args = list(e.args) msg = args[1] if len(args) > 1 else "" msg = msg.replace("session", "session (project)") - emsg = "\n".join([args[0]] + [msg] + args[2:]) + emsg = "\n".join( + [str(args[0])] + [msg] + [str(arg) for arg in args[2:]] + ) raise ls_utils.LangSmithError( f"Failed to {request_method} {url} in LangSmith API. 
{emsg}" ) from e @@ -3144,11 +3147,20 @@ def _resolve_example_id( def _select_eval_results( self, results: Union[ls_evaluator.EvaluationResult, ls_evaluator.EvaluationResults], + *, + fn_name: Optional[str] = None, ) -> List[ls_evaluator.EvaluationResult]: + from langsmith.evaluation import evaluator as ls_evaluator # noqa: F811 + if isinstance(results, ls_evaluator.EvaluationResult): results_ = [results] - elif isinstance(results, dict) and "results" in results: - results_ = cast(List[ls_evaluator.EvaluationResult], results["results"]) + elif isinstance(results, dict): + if "results" in results: + results_ = cast(List[ls_evaluator.EvaluationResult], results["results"]) + else: + results_ = [ + ls_evaluator.EvaluationResult(**{"key": fn_name, **results}) + ] else: raise TypeError( f"Invalid evaluation result type {type(results)}." @@ -3208,15 +3220,20 @@ def _log_evaluation_feedback( evaluator_response: Union[ ls_evaluator.EvaluationResult, ls_evaluator.EvaluationResults ], - run: ls_schemas.Run, + run: Optional[ls_schemas.Run] = None, source_info: Optional[Dict[str, Any]] = None, + project_id: Optional[ID_TYPE] = None, ) -> List[ls_evaluator.EvaluationResult]: results = self._select_eval_results(evaluator_response) for res in results: source_info_ = source_info or {} if res.evaluator_info: source_info_ = {**res.evaluator_info, **source_info_} - run_id_ = res.target_run_id if res.target_run_id else run.id + run_id_ = None + if res.target_run_id: + run_id_ = res.target_run_id + elif run is not None: + run_id_ = run.id self.create_feedback( run_id_, res.key, @@ -3227,6 +3244,7 @@ def _log_evaluation_feedback( source_info=source_info_, source_run_id=res.source_run_id, feedback_source_type=ls_schemas.FeedbackSourceType.MODEL, + project_id=project_id, ) return results diff --git a/python/langsmith/evaluation/__init__.py b/python/langsmith/evaluation/__init__.py index 64c65c134..1dd2ecbed 100644 --- a/python/langsmith/evaluation/__init__.py +++ 
b/python/langsmith/evaluation/__init__.py @@ -1,11 +1,13 @@ """Evaluation Helpers.""" +from langsmith.evaluation._runner import evaluate, evaluate_existing from langsmith.evaluation.evaluator import ( EvaluationResult, EvaluationResults, RunEvaluator, run_evaluator, ) +from langsmith.evaluation.integrations._langchain import LangChainStringEvaluator from langsmith.evaluation.string_evaluator import StringEvaluator __all__ = [ @@ -14,4 +16,7 @@ "EvaluationResults", "RunEvaluator", "StringEvaluator", + "evaluate", + "evaluate_existing", + "LangChainStringEvaluator", ] diff --git a/python/langsmith/evaluation/_name_generation.py b/python/langsmith/evaluation/_name_generation.py new file mode 100644 index 000000000..191c74632 --- /dev/null +++ b/python/langsmith/evaluation/_name_generation.py @@ -0,0 +1,727 @@ +import random + +adjectives = [ + "abandoned", + "aching", + "advanced", + "ample", + "artistic", + "back", + "best", + "bold", + "brief", + "clear", + "cold", + "complicated", + "cooked", + "crazy", + "crushing", + "damp", + "dear", + "definite", + "dependable", + "diligent", + "drab", + "earnest", + "elderly", + "enchanted", + "essential", + "excellent", + "extraneous", + "fixed", + "flowery", + "formal", + "fresh", + "frosty", + "giving", + "glossy", + "healthy", + "helpful", + "impressionable", + "kind", + "large", + "left", + "long", + "loyal", + "mealy", + "memorable", + "monthly", + "new", + "notable", + "only", + "ordinary", + "passionate", + "perfect", + "pertinent", + "proper", + "puzzled", + "reflecting", + "respectful", + "roasted", + "scholarly", + "shiny", + "slight", + "sparkling", + "spotless", + "stupendous", + "sunny", + "tart", + "terrific", + "timely", + "unique", + "upbeat", + "vacant", + "virtual", + "warm", + "weary", + "whispered", + "worthwhile", + "yellow", +] + +nouns = [ + "account", + "acknowledgment", + "address", + "advertising", + "airplane", + "animal", + "appointment", + "arrival", + "artist", + "attachment", + "attitude", + 
"availability", + "backpack", + "bag", + "balance", + "bass", + "bean", + "beauty", + "bibliography", + "bill", + "bite", + "blossom", + "boat", + "book", + "box", + "boy", + "bread", + "bridge", + "broccoli", + "building", + "butter", + "button", + "cabbage", + "cake", + "camera", + "camp", + "candle", + "candy", + "canvas", + "car", + "card", + "carrot", + "cart", + "case", + "cat", + "chain", + "chair", + "chalk", + "chance", + "change", + "channel", + "character", + "charge", + "charm", + "chart", + "check", + "cheek", + "cheese", + "chef", + "cherry", + "chicken", + "child", + "church", + "circle", + "class", + "clay", + "click", + "clock", + "cloth", + "cloud", + "clove", + "club", + "coach", + "coal", + "coast", + "coat", + "cod", + "coffee", + "collar", + "color", + "comb", + "comfort", + "comic", + "committee", + "community", + "company", + "comparison", + "competition", + "condition", + "connection", + "control", + "cook", + "copper", + "copy", + "corn", + "cough", + "country", + "cover", + "crate", + "crayon", + "cream", + "creator", + "crew", + "crown", + "current", + "curtain", + "curve", + "cushion", + "dad", + "daughter", + "day", + "death", + "debt", + "decision", + "deer", + "degree", + "design", + "desire", + "desk", + "detail", + "development", + "digestion", + "dime", + "dinner", + "direction", + "dirt", + "discovery", + "discussion", + "disease", + "disgust", + "distance", + "distribution", + "division", + "doctor", + "dog", + "door", + "drain", + "drawer", + "dress", + "drink", + "driving", + "dust", + "ear", + "earth", + "edge", + "education", + "effect", + "egg", + "end", + "energy", + "engine", + "error", + "event", + "example", + "exchange", + "existence", + "expansion", + "experience", + "expert", + "eye", + "face", + "fact", + "fall", + "family", + "farm", + "father", + "fear", + "feeling", + "field", + "finger", + "fire", + "fish", + "flag", + "flight", + "floor", + "flower", + "fold", + "food", + "football", + "force", + "form", + 
"frame", + "friend", + "frog", + "fruit", + "fuel", + "furniture", + "game", + "garden", + "gate", + "girl", + "glass", + "glove", + "goat", + "gold", + "government", + "grade", + "grain", + "grass", + "green", + "grip", + "group", + "growth", + "guide", + "guitar", + "hair", + "hall", + "hand", + "harbor", + "harmony", + "hat", + "head", + "health", + "heart", + "heat", + "hill", + "history", + "hobbies", + "hole", + "hope", + "horn", + "horse", + "hospital", + "hour", + "house", + "humor", + "idea", + "impulse", + "income", + "increase", + "industry", + "ink", + "insect", + "instrument", + "insurance", + "interest", + "invention", + "iron", + "island", + "jelly", + "jet", + "jewel", + "join", + "judge", + "juice", + "jump", + "kettle", + "key", + "kick", + "kiss", + "kitten", + "knee", + "knife", + "knowledge", + "land", + "language", + "laugh", + "law", + "lead", + "learning", + "leather", + "leg", + "lettuce", + "level", + "library", + "lift", + "light", + "limit", + "line", + "linen", + "lip", + "liquid", + "list", + "look", + "loss", + "love", + "lunch", + "machine", + "man", + "manager", + "map", + "marble", + "mark", + "market", + "mass", + "match", + "meal", + "measure", + "meat", + "meeting", + "memory", + "metal", + "middle", + "milk", + "mind", + "mine", + "minute", + "mist", + "mitten", + "mom", + "money", + "monkey", + "month", + "moon", + "morning", + "mother", + "motion", + "mountain", + "mouth", + "muscle", + "music", + "nail", + "name", + "nation", + "neck", + "need", + "news", + "night", + "noise", + "note", + "number", + "nut", + "observation", + "offer", + "oil", + "operation", + "opinion", + "orange", + "order", + "organization", + "ornament", + "oven", + "page", + "pail", + "pain", + "paint", + "pan", + "pancake", + "paper", + "parcel", + "parent", + "part", + "passenger", + "paste", + "payment", + "peace", + "pear", + "pen", + "pencil", + "person", + "pest", + "pet", + "picture", + "pie", + "pin", + "pipe", + "pizza", + "place", + "plane", + 
"plant", + "plastic", + "plate", + "play", + "pleasure", + "plot", + "plough", + "pocket", + "point", + "poison", + "police", + "pollution", + "popcorn", + "porter", + "position", + "pot", + "potato", + "powder", + "power", + "price", + "print", + "process", + "produce", + "product", + "profit", + "property", + "prose", + "protest", + "pull", + "pump", + "punishment", + "purpose", + "push", + "quarter", + "question", + "quiet", + "quill", + "quilt", + "quince", + "rabbit", + "rail", + "rain", + "range", + "rat", + "rate", + "ray", + "reaction", + "reading", + "reason", + "record", + "regret", + "relation", + "religion", + "representative", + "request", + "respect", + "rest", + "reward", + "rhythm", + "rice", + "river", + "road", + "roll", + "room", + "root", + "rose", + "route", + "rub", + "rule", + "run", + "sack", + "sail", + "salt", + "sand", + "scale", + "scarecrow", + "scarf", + "scene", + "scent", + "school", + "science", + "scissors", + "screw", + "sea", + "seat", + "secretary", + "seed", + "selection", + "self", + "sense", + "servant", + "shade", + "shake", + "shame", + "shape", + "sheep", + "sheet", + "shelf", + "ship", + "shirt", + "shock", + "shoe", + "shop", + "show", + "side", + "sign", + "silk", + "sink", + "sister", + "size", + "sky", + "sleep", + "smash", + "smell", + "smile", + "smoke", + "snail", + "snake", + "sneeze", + "snow", + "soap", + "society", + "sock", + "soda", + "sofa", + "son", + "song", + "sort", + "sound", + "soup", + "space", + "spark", + "speed", + "sponge", + "spoon", + "spray", + "spring", + "spy", + "square", + "stamp", + "star", + "start", + "statement", + "station", + "steam", + "steel", + "stem", + "step", + "stew", + "stick", + "stitch", + "stocking", + "stomach", + "stone", + "stop", + "store", + "story", + "stove", + "stranger", + "straw", + "stream", + "street", + "stretch", + "string", + "structure", + "substance", + "sugar", + "suggestion", + "suit", + "summer", + "sun", + "support", + "surprise", + "sweater", + "swim", 
+ "system", + "table", + "tail", + "talk", + "tank", + "taste", + "tax", + "tea", + "teaching", + "team", + "tendency", + "test", + "texture", + "theory", + "thing", + "thought", + "thread", + "throat", + "thumb", + "thunder", + "ticket", + "time", + "tin", + "title", + "toad", + "toe", + "tooth", + "toothpaste", + "touch", + "town", + "toy", + "trade", + "train", + "transport", + "tray", + "treatment", + "tree", + "trick", + "trip", + "trouble", + "trousers", + "truck", + "tub", + "turkey", + "turn", + "twist", + "umbrella", + "uncle", + "underwear", + "unit", + "use", + "vacation", + "value", + "van", + "vase", + "vegetable", + "veil", + "vein", + "verse", + "vessel", + "view", + "visitor", + "voice", + "volcano", + "walk", + "wall", + "war", + "wash", + "waste", + "watch", + "water", + "wave", + "wax", + "way", + "wealth", + "weather", + "week", + "weight", + "wheel", + "whip", + "whistle", + "window", + "wine", + "wing", + "winter", + "wire", + "wish", + "woman", + "wood", + "wool", + "word", + "work", + "worm", + "wound", + "wrist", + "writer", + "yard", + "yoke", + "zebra", + "zinc", + "zipper", + "zone", +] + + +def random_name() -> str: + """Generate a random name.""" + adjective = random.choice(adjectives) + noun = random.choice(nouns) + number = random.randint(1, 100) + return f"{adjective}-{noun}-{number}" diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py new file mode 100644 index 000000000..c9563caf4 --- /dev/null +++ b/python/langsmith/evaluation/_runner.py @@ -0,0 +1,990 @@ +"""V2 Evaluation Interface.""" + +from __future__ import annotations + +import collections +import concurrent.futures as cf +import datetime +import functools +import itertools +import logging +import threading +import uuid +from contextvars import copy_context +from typing import ( + Callable, + DefaultDict, + Dict, + Generator, + Iterable, + Iterator, + List, + Optional, + Sequence, + Tuple, + Union, + cast, +) + +from requests import 
HTTPError +from typing_extensions import TypedDict + +import langsmith +from langsmith import beta as ls_beta +from langsmith import env as ls_env +from langsmith import run_helpers as rh +from langsmith import run_trees, schemas +from langsmith import utils as ls_utils +from langsmith.evaluation.evaluator import ( + EvaluationResult, + EvaluationResults, + RunEvaluator, + run_evaluator, +) +from langsmith.evaluation.integrations import LangChainStringEvaluator + +logger = logging.getLogger(__name__) + +TARGET_T = Callable[[dict], dict] +# Data format: dataset-name, dataset_id, or examples +DATA_T = Union[str, uuid.UUID, Iterable[schemas.Example]] +# Summary evaluator runs over the whole dataset +# and reports aggregate metric(s) +SUMMARY_EVALUATOR_T = Callable[ + [Sequence[schemas.Run], Sequence[schemas.Example]], + Union[EvaluationResult, EvaluationResults], +] +# Row-level evaluator +EVALUATOR_T = Union[ + RunEvaluator, + Callable[[schemas.Run, Optional[schemas.Example]], EvaluationResult], +] + + +@ls_beta.warn_beta +def evaluate( + target: TARGET_T, + /, + data: DATA_T, + evaluators: Optional[Sequence[EVALUATOR_T]] = None, + summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None, + metadata: Optional[dict] = None, + experiment_prefix: Optional[str] = None, + max_concurrency: Optional[int] = None, + client: Optional[langsmith.Client] = None, + blocking: bool = True, +) -> ExperimentResults: + r"""Evaluate a target system or function on a given dataset. + + Args: + target (TARGET_T): The target system or function to evaluate. + data (DATA_T): The dataset to evaluate on. Can be a dataset name, a list of + examples, or a generator of examples. + evaluators (Optional[Sequence[EVALUATOR_T]]): A list of evaluators to run + on each example. Defaults to None. + summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): A list of summary + evaluators to run on the entire dataset. Defaults to None. 
+ metadata (Optional[dict]): Metadata to attach to the experiment. + Defaults to None. + experiment_prefix (Optional[str]): A prefix to provide for your experiment name. + Defaults to None. + max_concurrency (Optional[int]): The maximum number of concurrent + evaluations to run. Defaults to None. + client (Optional[langsmith.Client]): The LangSmith client to use. + Defaults to None. + blocking (bool): Whether to block until the evaluation is complete. + Defaults to True. + + Returns: + ExperimentResults: The results of the evaluation. + + Examples: + Prepare the dataset: + + >>> from typing import Sequence + >>> from langsmith import Client + >>> from langsmith.evaluation import evaluate + >>> from langsmith.schemas import Example, Run + >>> client = Client() + >>> client.clone_public_dataset( + ... "https://smith.langchain.com/public/419dcab2-1d66-4b94-8901-0357ead390df/d" + ... ) + >>> dataset_name = "Evaluate Examples" + + Basic usage: + + >>> def accuracy(run: Run, example: Example): + ... # Row-level evaluator for accuracy. + ... pred = run.outputs["output"] + ... expected = example.outputs["answer"] + ... return {"score": expected.lower() == pred.lower()} + ... + >>> def precision(runs: Sequence[Run], examples: Sequence[Example]): + ... # Experiment-level evaluator for precision. + ... # TP / (TP + FP) + ... predictions = [run.outputs["output"].lower() for run in runs] + ... expected = [example.outputs["answer"].lower() for example in examples] + ... # yes and no are the only possible answers + ... tp = sum([p == e for p, e in zip(predictions, expected) if p == "yes"]) + ... fp = sum([p == "yes" and e == "no" for p, e in zip(predictions, expected)]) + ... return {"score": tp / (tp + fp)} + ... + >>> def predict(inputs: dict) -> dict: + ... # This can be any function or just an API call to your app. + ... return {"output": "Yes"} + ... + >>> results = evaluate( + ... predict, + ... data=dataset_name, + ... evaluators=[accuracy], + ... 
summary_evaluators=[precision], + ... ) # doctest: +ELLIPSIS + View the evaluation results for experiment:... + + Evaluating over only a subset of the examples + + >>> experiment_name = results.experiment_name + >>> examples = client.list_examples(dataset_name=dataset_name, limit=5) + >>> results = evaluate( + ... predict, + ... data=examples, + ... evaluators=[accuracy], + ... summary_evaluators=[precision], + ... experiment_prefix="My Experiment", + ... ) # doctest: +ELLIPSIS + View the evaluation results for experiment:... + + Streaming each prediction to more easily + eagerly debug. + + >>> results = evaluate( + ... predict, + ... data=dataset_name, + ... evaluators=[accuracy], + ... summary_evaluators=[precision], + ... blocking=False, + ... ) # doctest: +ELLIPSIS + View the evaluation results for experiment:... + >>> for i, result in enumerate(results): # doctest: +ELLIPSIS + ... pass + + Using the `evaluate` API with an off-the-shelf LangChain evaluator: + + >>> from langsmith.evaluation import LangChainStringEvaluator + >>> def prepare_criteria_data(run: Run, example: Example): + ... return { + ... "prediction": run.outputs["output"], + ... "reference": example.outputs["answer"], + ... "input": str(example.inputs), + ... } + ... + >>> results = evaluate( + ... predict, + ... data=dataset_name, + ... evaluators=[ + ... accuracy, + ... LangChainStringEvaluator("embedding_distance"), + ... LangChainStringEvaluator( + ... "labeled_criteria", + ... config={ + ... "criteria": { + ... "usefulness": "The prediction is useful if it is correct" + ... " and/or asks a useful followup question." + ... }, + ... }, + ... prepare_data=prepare_criteria_data + ... ), + ... ], + ... summary_evaluators=[precision], + ... ) # doctest: +ELLIPSIS + View the evaluation results for experiment:... + + Evaluating a LangChain object: + + >>> from langchain_core.runnables import chain as as_runnable + >>> @as_runnable + ... def nested_predict(inputs): + ... 
return {"output": "Yes"} + ... + >>> @as_runnable + ... def lc_predict(inputs): + ... return nested_predict.invoke(inputs) + ... + >>> results = evaluate( + ... lc_predict.invoke, + ... data=dataset_name, + ... evaluators=[accuracy], + ... summary_evaluators=[precision], + ... ) # doctest: +ELLIPSIS + View the evaluation results for experiment:... + """ # noqa: E501 + return _evaluate( + target, + data=data, + evaluators=evaluators, + summary_evaluators=summary_evaluators, + metadata=metadata, + experiment_prefix=experiment_prefix, + max_concurrency=max_concurrency, + client=client, + blocking=blocking, + ) + + +@ls_beta.warn_beta +def evaluate_existing( + experiment: Union[str, uuid.UUID], + /, + data: DATA_T, + evaluators: Optional[Sequence[EVALUATOR_T]] = None, + summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None, + metadata: Optional[dict] = None, + max_concurrency: Optional[int] = None, + client: Optional[langsmith.Client] = None, + load_nested: bool = False, + blocking: bool = True, +) -> ExperimentResults: + r"""Evaluate existing experiment runs. + + Args: + experiment (Union[str, uuid.UUID]): The identifier of the experiment to evaluate. + data (DATA_T): The data to use for evaluation. + evaluators (Optional[Sequence[EVALUATOR_T]]): Optional sequence of evaluators to use for individual run evaluation. + summary_evaluators (Optional[Sequence[SUMMARY_EVALUATOR_T]]): Optional sequence of evaluators + to apply over the entire dataset. + metadata (Optional[dict]): Optional metadata to include in the evaluation results. + max_concurrency (Optional[int]): Optional maximum number of concurrent evaluations. + client (Optional[langsmith.Client]): Optional Langsmith client to use for evaluation. + load_nested: Whether to load all child runs for the experiment. + Default is to only load the top-level root runs. + blocking (bool): Whether to block until evaluation is complete. + + Returns: + ExperimentResults: The evaluation results. 
+ + Examples: + >>> from langsmith.evaluation import evaluate, evaluate_existing + >>> dataset_name = "Evaluate Examples" + >>> def predict(inputs: dict) -> dict: + ... # This can be any function or just an API call to your app. + ... return {"output": "Yes"} + ... + >>> # First run inference on the dataset + ... results = evaluate( + ... predict, + ... data=dataset_name, + ... ) # doctest: +ELLIPSIS + View the evaluation results for experiment:... + >>> # Then apply evaluators to the experiment + ... def accuracy(run: Run, example: Example): + ... # Row-level evaluator for accuracy. + ... pred = run.outputs["output"] + ... expected = example.outputs["answer"] + ... return {"score": expected.lower() == pred.lower()} + ... + >>> def precision(runs: Sequence[Run], examples: Sequence[Example]): + ... # Experiment-level evaluator for precision. + ... # TP / (TP + FP) + ... predictions = [run.outputs["output"].lower() for run in runs] + ... expected = [example.outputs["answer"].lower() for example in examples] + ... # yes and no are the only possible answers + ... tp = sum([p == e for p, e in zip(predictions, expected) if p == "yes"]) + ... fp = sum([p == "yes" and e == "no" for p, e in zip(predictions, expected)]) + ... return {"score": tp / (tp + fp)} + >>> experiment_name = results.experiment_name # Can use the returned experiment name + >>> experiment_name = "My Experiment:64e6e91" # Or manually specify + >>> results = evaluate_existing( + ... experiment_name, + ... data=dataset_name, + ... summary_evaluators=[precision], + ... ) # doctest: +ELLIPSIS + View the evaluation results for experiment:... 
+ """ # noqa: E501 + client = client or langsmith.Client() + runs = _load_traces(experiment, client, load_nested=load_nested) + return _evaluate( + runs, + data=data, + evaluators=evaluators, + summary_evaluators=summary_evaluators, + metadata=metadata, + max_concurrency=max_concurrency, + client=client, + blocking=blocking, + ) + + +class ExperimentResultRow(TypedDict): + run: schemas.Run + example: schemas.Example + evaluation_results: EvaluationResults + + +class ExperimentResults: + """Represents the results of an evaluate() call. + + This class provides an iterator interface to iterate over the experiment results + as they become available. It also provides methods to access the experiment name, + the number of results, and to wait for the results to be processed. + + Methods: + experiment_name() -> str: Returns the name of the experiment. + wait() -> None: Waits for the experiment data to be processed. + """ + + def __init__( + self, + experiment_manager: _ExperimentManager, + ): + self._manager = experiment_manager + self._results: List[ExperimentResultRow] = [] + self._lock = threading.RLock() + self._thread = threading.Thread( + target=lambda: self._process_data(self._manager) + ) + self._thread.start() + + @property + def experiment_name(self) -> str: + return self._manager.experiment_name + + def __iter__(self) -> Iterator[ExperimentResultRow]: + processed_count = 0 + while True: + with self._lock: + if processed_count < len(self._results): + yield self._results[processed_count] + processed_count += 1 + elif not self._thread.is_alive(): + break + + def _process_data(self, manager: _ExperimentManager) -> None: + tqdm = _load_tqdm() + results = manager.get_results() + for item in tqdm(results): + with self._lock: + self._results.append(item) + summary_scores = manager.get_summary_scores() + with self._lock: + self._summary_results = summary_scores + + def __len__(self) -> int: + return len(self._results) + + def __repr__(self) -> str: + return f"{type(self).__name__}(experiment_name={self.experiment_name!r})" + + def
wait(self) -> None: + """Wait for the evaluation runner to complete. + + This method blocks the current thread until the evaluation runner has + finished its execution. + """ + self._thread.join() + + +## Private API + + +def _is_callable(target: Union[TARGET_T, Iterable[schemas.Run]]) -> bool: + return callable(target) or (hasattr(target, "invoke") and callable(target.invoke)) + + +def _evaluate( + target: Union[TARGET_T, Iterable[schemas.Run]], + /, + data: DATA_T, + evaluators: Optional[Sequence[EVALUATOR_T]] = None, + summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None, + metadata: Optional[dict] = None, + experiment_prefix: Optional[str] = None, + max_concurrency: Optional[int] = None, + client: Optional[langsmith.Client] = None, + blocking: bool = True, +) -> ExperimentResults: + # Initialize the experiment manager. + manager = _ExperimentManager( + data, + client=client, + metadata=metadata, + experiment_prefix=experiment_prefix, + # If provided, we don't need to create a new experiment. + runs=None if _is_callable(target) else cast(Iterable[schemas.Run], target), + # Create or resolve the experiment. + ).start() + if _is_callable(target): + # Add predictions to the experiment. + manager = manager.with_predictions( + cast(TARGET_T, target), max_concurrency=max_concurrency + ) + if evaluators: + # Apply evaluators to the predictions. + manager = manager.with_evaluators(evaluators, max_concurrency=max_concurrency) + if summary_evaluators: + # Apply the experiment-level summary evaluators. + manager = manager.with_summary_evaluators(summary_evaluators) + # Start consuming the results. + results = ExperimentResults(manager) + if blocking: + # Wait for the evaluation to complete. 
+ results.wait() + return results + + +def _is_uuid(value: str) -> bool: + try: + uuid.UUID(value) + return True + except ValueError: + return False + + +def _load_traces( + project: Union[str, uuid.UUID], client: langsmith.Client, load_nested: bool = False +) -> Iterable[schemas.Run]: + """Load nested traces for a given project.""" + execution_order = None if load_nested else 1 + if isinstance(project, uuid.UUID) or _is_uuid(project): + runs = client.list_runs(project_id=project, execution_order=execution_order) + else: + runs = client.list_runs(project_name=project, execution_order=execution_order) + if not load_nested: + return runs + + treemap: DefaultDict[uuid.UUID, List[schemas.Run]] = collections.defaultdict(list) + results = [] + all_runs = {} + for run in runs: + if run.parent_run_id is not None: + treemap[run.parent_run_id].append(run) + else: + results.append(run) + all_runs[run.id] = run + for run_id, child_runs in treemap.items(): + all_runs[run_id].child_runs = sorted(child_runs, key=lambda r: r.dotted_order) + return results + + +def _load_tqdm() -> Callable[[Iterable], Iterable]: + try: + from tqdm.auto import tqdm + except ImportError: + return lambda x: x + return tqdm + + +class _ExperimentManager: + """Manage the execution of experiments. + + Supports lazily running predictions and evaluations in parallel to facilitate + result streaming and early debugging. + + Args: + data (DATA_T): The data used for the experiment. Can be a dataset name or ID OR + a generator of examples. + runs (Optional[Iterable[schemas.Run]]): The runs associated with the experiment + predictions. + experiment (Optional[schemas.TracerSession]): The tracer session + associated with the experiment. + experiment_prefix (Optional[str]): The prefix for the experiment name. + metadata (Optional[dict]): Additional metadata for the experiment. + client (Optional[langsmith.Client]): The Langsmith client used for + the experiment. 
+ evaluation_results (Optional[Iterable[EvaluationResults]]): The evaluation + results for the experiment. + summary_results (Optional[Iterable[EvaluationResults]]): The aggregate results + for the experiment. + """ + + def __init__( + self, + data: DATA_T, + /, + metadata: Optional[dict] = None, + experiment_prefix: Optional[str] = None, + runs: Optional[Iterable[schemas.Run]] = None, + experiment: Optional[schemas.TracerSession] = None, + client: Optional[langsmith.Client] = None, + evaluation_results: Optional[Iterable[EvaluationResults]] = None, + summary_results: Optional[Iterable[EvaluationResults]] = None, + ): + self.client = client or langsmith.Client() + # If we've already created or started the experiment, + # use that name. + experiment_name, experiment, runs = _resolve_experiment_name( + experiment, + experiment_prefix, + runs, + self.client, + ) + self.experiment_name = experiment_name + self._experiment = experiment + self._runs = runs + metadata = metadata or {} + if not metadata.get("revision_id"): + metadata = { + "revision_id": ls_env.get_langchain_env_var_metadata().get( + "revision_id" + ), + **metadata, + } + self._metadata = metadata or {} + self._data = data + self._examples: Optional[Iterable[schemas.Example]] = None + self._evaluation_results = evaluation_results + self._summary_results = summary_results + + @property + def examples(self) -> Iterable[schemas.Example]: + if self._examples is None: + self._examples = _resolve_data(self._data, client=self.client) + self._examples, examples_iter = itertools.tee(self._examples) + return examples_iter + + @property + def evaluation_results(self) -> Iterable[EvaluationResults]: + if self._evaluation_results is None: + return [{"results": []} for _ in self.examples] + return self._evaluation_results + + @property + def runs(self) -> Iterable[schemas.Run]: + if self._runs is None: + raise ValueError( + "Runs not provided in this experiment." " Please predict first."
def start(self) -> _ExperimentManager:
    """Make sure the experiment project exists and return a manager bound to it.

    One example is peeked to learn the dataset ID. When no experiment is
    attached yet, a project named ``self.experiment_name`` is created with the
    manager's metadata (plus git info when available). A link to the
    comparison view is printed when the project exposes a URL.

    Returns:
        A new manager over the same examples, carrying the resolved project,
        metadata, client, runs and evaluation results.

    Raises:
        ValueError: If the data yields no examples, or if a project with the
            chosen experiment name already exists.
    """
    # Peek only. ``self.examples`` hands out an independent iterator on every
    # access, so the manager returned below still sees this example.
    try:
        first_example = next(iter(self.examples))
    except StopIteration:
        raise ValueError(
            "No examples found in the provided data; cannot start the experiment."
        ) from None
    if self._experiment is None:
        try:
            project_metadata = self._metadata or {}
            git_info = ls_env.get_git_info()
            if git_info:
                project_metadata = {
                    **project_metadata,
                    "git": git_info,
                }
            project = self.client.create_project(
                self.experiment_name,
                reference_dataset_id=first_example.dataset_id,
                metadata=project_metadata,
            )
        except (HTTPError, ValueError, ls_utils.LangSmithError) as e:
            if "already exists " not in str(e):
                raise
            raise ValueError(
                # TODO: Better error
                f"Experiment {self.experiment_name} already exists."
                " Please use a different name."
            ) from e
    else:
        project = self._experiment
    if project.url:
        # TODO: Make this a public API
        project_url = project.url.split("?")[0]
        dataset_id = first_example.dataset_id
        base_url = project_url.split("/projects/p/")[0]
        comparison_url = (
            f"{base_url}/datasets/{dataset_id}/compare?"
            f"selectedSessions={project.id}"
        )
        print(  # noqa: T201
            f"View the evaluation results for experiment: '{self.experiment_name}'"
            f" at:\n{comparison_url}\n\n"
        )
    else:
        # print() does not interpolate logging-style "%s" arguments.
        print(  # noqa: T201
            f"Starting evaluation of experiment: {self.experiment_name}"
        )
    return _ExperimentManager(
        # A fresh iterator from the start: re-prepending ``first_example``
        # would make the first example run twice.
        self.examples,
        experiment=project,
        metadata=self._metadata,
        client=self.client,
        runs=self._runs,
        evaluation_results=self._evaluation_results,
    )
+ r1, r2, r3 = itertools.tee(experiment_results, 3) + return _ExperimentManager( + (result["example"] for result in r1), + experiment=self._experiment, + metadata=self._metadata, + client=self.client, + runs=(result["run"] for result in r2), + evaluation_results=(result["evaluation_results"] for result in r3), + summary_results=self._summary_results, + ) + + def with_summary_evaluators( + self, + summary_evaluators: Sequence[SUMMARY_EVALUATOR_T], + ) -> _ExperimentManager: + """Lazily apply the provided summary evaluators to the experiment.""" + wrapped_evaluators = _wrap_summary_evaluators(summary_evaluators) + context = copy_context() + aggregate_feedback_gen = context.run( + self._apply_summary_evaluators, wrapped_evaluators + ) + return _ExperimentManager( + self.examples, + experiment=self._experiment, + metadata=self._metadata, + client=self.client, + runs=self.runs, + evaluation_results=self._evaluation_results, + summary_results=aggregate_feedback_gen, + ) + + def get_results(self) -> Iterable[ExperimentResultRow]: + """Return the traces, evaluation results, and associated examples.""" + for run, example, evaluation_results in zip( + self.runs, self.examples, self.evaluation_results + ): + yield ExperimentResultRow( + run=run, + example=example, + evaluation_results=evaluation_results, + ) + + def get_summary_scores(self) -> Dict[str, List[dict]]: + """If summary_evaluators were applied, consume and return the results.""" + if self._summary_results is None: + return {"results": []} + # Consume the generator + return { + "results": [ + res for results in self._summary_results for res in results["results"] + ] + } + + # Private methods. 
+ + def _get_experiment(self) -> schemas.TracerSession: + if self._experiment is None: + raise ValueError("Experiment not started yet.") + return self._experiment + + def _end(self) -> None: + experiment = self._experiment + if experiment is None: + raise ValueError("Experiment not started yet.") + examples = list(self.examples) + modified_at = [ex.modified_at for ex in examples if ex.modified_at] + # Should always be defined in practice when fetched, + # but the typing permits None + max_modified_at = max(modified_at) if modified_at else None + + self.client.update_project( + experiment.id, + end_time=datetime.datetime.now(datetime.timezone.utc), + metadata={ + "dataset_version": ( + max_modified_at.isoformat() if max_modified_at else None + ) + }, + ) + + def _predict( + self, target: TARGET_T, /, max_concurrency: Optional[int] = None + ) -> Generator[_ForwardResults, None, None]: + """Run the target function on the examples.""" + fn = _ensure_traceable(target) + if max_concurrency == 0: + for example in self.examples: + yield _forward( + fn, example, self.experiment_name, self._metadata, self.client + ) + + else: + with cf.ThreadPoolExecutor(max_concurrency) as executor: + futures = [ + executor.submit( + _forward, + fn, + example, + self.experiment_name, + self._metadata, + self.client, + ) + for example in self.examples + ] + for future in cf.as_completed(futures): + yield future.result() + # Close out the project. 
def _run_evaluators(
    self,
    evaluators: Sequence[RunEvaluator],
    current_results: ExperimentResultRow,
) -> ExperimentResultRow:
    """Apply every evaluator to one run/example pair and collect the feedback.

    Evaluators execute inside a tracing context whose project is
    ``"evaluators"`` and whose metadata is the current tracing metadata plus
    the experiment name. A failing evaluator is logged and skipped; it never
    aborts the remaining ones.

    Args:
        evaluators: Evaluators to run, in order.
        current_results: Row holding the run, its example and the results
            gathered so far. Its ``evaluation_results["results"]`` list is
            extended in place.

    Returns:
        A row with the same run and example and the extended results.
    """
    outer_context = rh.get_tracing_context()
    merged_metadata = dict(outer_context["metadata"] or {})
    merged_metadata["experiment"] = self.experiment_name
    context_kwargs = {
        **outer_context,
        "project_name": "evaluators",
        "metadata": merged_metadata,
    }
    with rh.tracing_context(**context_kwargs):
        run = current_results["run"]
        example = current_results["example"]
        eval_results = current_results["evaluation_results"]
        for evaluator in evaluators:
            try:
                response = evaluator.evaluate_run(
                    run=run,
                    example=example,
                )
                sink = eval_results["results"]
                # TODO: This is a hack
                sink.extend(
                    self.client._log_evaluation_feedback(
                        response,
                        run=run,
                    )
                )
            except Exception as e:
                logger.error(
                    f"Error running evaluator {repr(evaluator)} on"
                    f" run {run.id}: {repr(e)}",
                    exc_info=True,
                )
        return ExperimentResultRow(
            run=run,
            example=example,
            evaluation_results=eval_results,
        )
from a previous prediction step) + """ + if max_concurrency == 0: + for current_results in self.get_results(): + yield self._run_evaluators(evaluators, current_results) + else: + with cf.ThreadPoolExecutor(max_workers=max_concurrency) as executor: + futures = [] + for current_results in self.get_results(): + futures.append( + executor.submit( + self._run_evaluators, + evaluators, + current_results, + ) + ) + for future in cf.as_completed(futures): + result = future.result() + yield result + + def _apply_summary_evaluators( + self, summary_evaluators: Sequence[SUMMARY_EVALUATOR_T] + ) -> Generator[EvaluationResults, None, None]: + runs, examples = [], [] + for run, example in zip(self.runs, self.examples): + runs.append(run) + examples.append(example) + aggregate_feedback = [] + with cf.ThreadPoolExecutor() as executor: + project_id = self._get_experiment().id + for evaluator in summary_evaluators: + try: + summary_eval_result = evaluator(runs, examples) + # TODO: Expose public API for this. 
def _wrap_summary_evaluators(
    evaluators: Sequence[SUMMARY_EVALUATOR_T],
) -> List[SUMMARY_EVALUATOR_T]:
    """Wrap each summary evaluator so that its invocation is traced.

    The traced call only receives two short strings describing the batch
    sizes as inputs; the real runs and examples reach the evaluator through
    the closure. The trace is named after the evaluator's ``__name__``
    (``"BatchEvaluator"`` when it has none).

    Args:
        evaluators: Summary evaluators taking ``(runs, examples)``.

    Returns:
        One wrapper per evaluator, in the same order.
    """

    def _make_traced(summary_fn: SUMMARY_EVALUATOR_T) -> SUMMARY_EVALUATOR_T:
        trace_name = getattr(summary_fn, "__name__", "BatchEvaluator")

        @functools.wraps(summary_fn)
        def _invoke(
            runs: Sequence[schemas.Run], examples: Sequence[schemas.Example]
        ) -> EvaluationResults:
            # Name and parameter names of the traced function are left as they
            # were: the tracing decorator may introspect them.
            @rh.traceable(name=trace_name)
            def _wrapper_super_inner(runs_: str, examples_: str) -> EvaluationResults:
                return summary_fn(runs, examples)

            runs_label = f"Runs[] (Length={len(runs)})"
            examples_label = f"Examples[] (Length={len(examples)})"
            return _wrapper_super_inner(runs_label, examples_label)

        return _invoke

    return [_make_traced(candidate) for candidate in evaluators]
+ run: Optional[schemas.RunBase] = None + + def _get_run(r: run_trees.RunTree) -> None: + nonlocal run + run = r + + try: + fn( + example.inputs, + langsmith_extra=rh.LangSmithExtra( + reference_example_id=example.id, + on_end=_get_run, + project_name=experiment_name, + metadata=metadata, + client=client, + ), + ) + except Exception as e: + logger.error(f"Error running target function: {e}") + return _ForwardResults( + run=cast(schemas.Run, run), + example=example, + ) + + +def _resolve_data( + data: DATA_T, *, client: langsmith.Client +) -> Iterable[schemas.Example]: + """Return the examples for the given dataset.""" + if isinstance(data, str): + return client.list_examples(dataset_name=data) + elif isinstance(data, uuid.UUID): + return client.list_examples(dataset_id=data) + return data + + +def _ensure_traceable(target: TARGET_T) -> rh.SupportsLangsmithExtra: + """Ensure the target function is traceable.""" + if not callable(target): + raise ValueError("Target must be a callable function.") + if rh.is_traceable_function(target): + fn = cast(rh.SupportsLangsmithExtra, target) + else: + fn = rh.traceable(name="Target")(target) + return fn + + +def _resolve_experiment_name( + experiment: Optional[schemas.TracerSession], + experiment_prefix: Optional[str], + runs: Optional[Iterable[schemas.Run]], + client: langsmith.Client, +) -> Tuple[str, Optional[schemas.TracerSession], Optional[Iterable[schemas.Run]]]: + if experiment is not None: + if not experiment.name: + raise ValueError("Experiment name must be defined if provided.") + return experiment.name, experiment, runs + # If we have runs, that means the experiment was already started. 
+ if runs is not None: + runs, runs_iter = itertools.tee(runs) + first_run = next(runs_iter) + experiment = client.read_project(project_id=first_run.session_id) + if not experiment.name: + raise ValueError("Experiment name not found for provided runs.") + return experiment.name, experiment, runs + # Otherwise, we will generate a new experiment name. + if isinstance(experiment_prefix, str): + return experiment_prefix + ":" + uuid.uuid4().hex[:7], None, None + return _get_random_name(), None, None + + +def _get_random_name() -> str: + from langsmith.evaluation._name_generation import random_name # noqa: F401 + + return random_name() diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 24f280ef8..65e123042 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -194,6 +194,10 @@ def __call__( """ # noqa: E501 return self.evaluate_run(run, example) + def __repr__(self) -> str: + """String representation of the DynamicRunEvaluator object.""" + return f"" + def run_evaluator( func: Callable[ diff --git a/python/langsmith/evaluation/integrations/__init__.py b/python/langsmith/evaluation/integrations/__init__.py new file mode 100644 index 000000000..ef8a30345 --- /dev/null +++ b/python/langsmith/evaluation/integrations/__init__.py @@ -0,0 +1,8 @@ +"""This module provides integration wrappers for popular open source eval frameworks. + +to be used with LangSmith. 
+""" + +from langsmith.evaluation.integrations._langchain import LangChainStringEvaluator + +__all__ = ["LangChainStringEvaluator"] diff --git a/python/langsmith/evaluation/integrations/_langchain.py b/python/langsmith/evaluation/integrations/_langchain.py new file mode 100644 index 000000000..3e0f79016 --- /dev/null +++ b/python/langsmith/evaluation/integrations/_langchain.py @@ -0,0 +1,243 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Callable, Optional, TypedDict, Union + +from langsmith.evaluation.evaluator import run_evaluator +from langsmith.run_helpers import traceable +from langsmith.schemas import Example, Run + +if TYPE_CHECKING: + from langchain.evaluation.schema import StringEvaluator + + from langsmith.evaluation.evaluator import RunEvaluator + + +class SingleEvaluatorInput(TypedDict): + """The input to a `StringEvaluator`.""" + + prediction: str + """The prediction string.""" + reference: Optional[Any] + """The reference string.""" + input: Optional[str] + """The input string.""" + + +class LangChainStringEvaluator: + r"""A class for wrapping a LangChain StringEvaluator. + + Attributes: + evaluator (StringEvaluator): The underlying StringEvaluator OR the name + of the evaluator to load. + + Methods: + as_run_evaluator() -> RunEvaluator: + Convert the LangChainStringEvaluator to a RunEvaluator. + + Examples: + Creating a simple LangChainStringEvaluator: + + >>> evaluator = LangChainStringEvaluator("exact_match") + + Converting a LangChainStringEvaluator to a RunEvaluator: + + >>> from langsmith.evaluation import LangChainStringEvaluator + >>> evaluator = LangChainStringEvaluator( + ... "criteria", + ... config={ + ... "criteria": { + ... "usefulness": "The prediction is useful if" + ... " it is correct and/or asks a useful followup question." + ... }, + ... } + ... 
) + >>> run_evaluator = evaluator.as_run_evaluator() + >>> run_evaluator # doctest: +ELLIPSIS + + + Using the `evaluate` API with different evaluators: + >>> def prepare_data(run: Run, example: Example): + ... # Convert the evaluation data into the format expected by the evaluator + ... # Only required for datasets with multiple inputs/output keys + ... return { + ... "prediction": run.outputs["prediction"], + ... "reference": example.outputs["answer"], + ... "input": str(example.inputs), + ... } + ... + >>> import re + >>> from langchain_anthropic import ChatAnthropic + >>> import langsmith + >>> from langsmith.evaluation import LangChainStringEvaluator, evaluate + >>> criteria_evaluator = LangChainStringEvaluator( + ... "criteria", config={ + ... "criteria": { + ... "usefulness": "The prediction is useful if it is correct" + ... " and/or asks a useful followup question." + ... }, + ... "llm": ChatAnthropic(model="claude-3-opus-20240229") + ... }, + ... prepare_data=prepare_data + ... ) + >>> embedding_evaluator = LangChainStringEvaluator("embedding_distance") + >>> exact_match_evaluator = LangChainStringEvaluator("exact_match") + >>> regex_match_evaluator = LangChainStringEvaluator( + ... "regex_match", config={ + ... "flags": re.IGNORECASE + ... }, + ... prepare_data=prepare_data + ... ) + >>> scoring_evaluator = LangChainStringEvaluator( + ... "labeled_score_string", config={ + ... "criteria": { + ... "accuracy": "Score 1: Completely inaccurate\nScore 5: Somewhat accurate\nScore 10: Completely accurate" + ... }, + ... "normalize_by": 10 + ... }, + ... prepare_data=prepare_data + ... ) + >>> string_distance_evaluator = LangChainStringEvaluator( + ... "string_distance", config={ + ... "distance_metric": "levenshtein" + ... }, + ... prepare_data=prepare_data + ... ) + >>> from langsmith import Client + >>> client = Client() + >>> results = evaluate( + ... lambda inputs: {"prediction": "foo"}, + ... 
def __init__(
    self,
    evaluator: Union[StringEvaluator, str],
    *,
    config: Optional[dict] = None,
    prepare_data: Optional[
        Callable[[Run, Optional[Example]], SingleEvaluatorInput]
    ] = None,
):
    """Initialize a LangChainStringEvaluator.

    See: https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.StringEvaluator.html#langchain-evaluation-schema-stringevaluator

    Args:
        evaluator (Union[StringEvaluator, str]): A ready StringEvaluator, or
            the name of one to load with ``load_evaluator``.
        config (Optional[dict]): Keyword arguments forwarded to
            ``load_evaluator``. Only used when ``evaluator`` is a name.
        prepare_data (Optional[Callable]): Maps a run and its example to the
            evaluator inputs; when omitted, the wrapper derives them from the
            run and example itself.

    Raises:
        NotImplementedError: If ``evaluator`` is neither a StringEvaluator
            nor a string.
    """
    # Imported lazily so that langchain is only needed when the wrapper is used.
    from langchain.evaluation.schema import StringEvaluator  # noqa: F811

    if isinstance(evaluator, StringEvaluator):
        resolved = evaluator
    elif isinstance(evaluator, str):
        from langchain.evaluation import load_evaluator  # noqa: F811

        loader_options = config or {}
        resolved = load_evaluator(evaluator, **loader_options)  # type: ignore[assignment, arg-type]
    else:
        raise NotImplementedError(f"Unsupported evaluator type: {type(evaluator)}")

    self.evaluator = resolved
    self._prepare_data = prepare_data
+ """ + input_str = ( + "\n \"input\": example.inputs['input']," + if self.evaluator.requires_input + else "" + ) + reference_str = ( + "\n \"reference\": example.outputs['expected']" + if self.evaluator.requires_reference + else "" + ) + customization_error_str = f""" +def prepare_data(run, example): + return {{ + "prediction": run.outputs['my_output'],{reference_str}{input_str} + }} +evaluator = LangChainStringEvaluator(..., prepare_data=prepare_data) +""" + + @traceable + def prepare_evaluator_inputs( + run: Run, example: Optional[Example] = None + ) -> SingleEvaluatorInput: + if run.outputs and len(run.outputs) > 1: + raise ValueError( + f"Evaluator {self.evaluator} only supports a single prediction " + "key. Please ensure that the run has a single output." + " Or initialize with a prepare_data:\n" + f"{customization_error_str}" + ) + if ( + self.evaluator.requires_reference + and example + and example.outputs + and len(example.outputs) > 1 + ): + raise ValueError( + f"Evaluator {self.evaluator} nly supports a single reference key. " + "Please ensure that the example has a single output." + " Or create a custom evaluator yourself:\n" + f"{customization_error_str}" + ) + if ( + self.evaluator.requires_input + and example + and example.inputs + and len(example.inputs) > 1 + ): + raise ValueError( + f"Evaluator {self.evaluator} only supports a single input key. " + "Please ensure that the example has a single input." 
@contextlib.contextmanager
def tracing_context(
    *,
    project_name: Optional[str] = None,
    tags: Optional[List[str]] = None,
    metadata: Optional[Dict[str, Any]] = None,
    parent_run: Optional[run_trees.RunTree] = None,
) -> Generator[None, None, None]:
    """Set the tracing context for a block of code.

    All four context values are replaced for the duration of the ``with``
    block and put back to whatever they were before entry when the block
    exits, whether normally or through an exception.

    Args:
        project_name: Project name to trace to inside the block.
        tags: Tags to apply inside the block.
        metadata: Metadata to apply inside the block.
        parent_run: Run tree to use as the parent inside the block.
    """
    # Snapshot everything that is about to be overwritten. Resetting to None
    # on exit would wipe the context of any enclosing trace or outer
    # ``tracing_context`` block.
    previous_project = _PROJECT_NAME.get()
    previous_tags = _TAGS.get()
    previous_metadata = _METADATA.get()
    previous_parent = get_run_tree_context()
    _PROJECT_NAME.set(project_name)
    _TAGS.set(tags)
    _METADATA.set(metadata)
    _PARENT_RUN_TREE.set(parent_run)
    try:
        yield
    finally:
        _PROJECT_NAME.set(previous_project)
        _TAGS.set(previous_tags)
        _METADATA.set(previous_metadata)
        _PARENT_RUN_TREE.set(previous_parent)
) @@ -194,7 +243,9 @@ def _setup_run( outer_project=outer_project, outer_metadata=None, outer_tags=None, + on_end=langsmith_extra.get("on_end"), ) + id_ = id_ or str(uuid.uuid4()) signature = inspect.signature(func) name_ = name or func.__name__ docstring = func.__doc__ @@ -223,7 +274,6 @@ def _setup_run( tags_ = (langsmith_extra.get("tags") or []) + (outer_tags or []) _TAGS.set(tags_) tags_ += tags or [] - id_ = langsmith_extra.get("run_id", uuid.uuid4()) client_ = langsmith_extra.get("client", client) if parent_run_ is not None: new_run = parent_run_.create_child( @@ -250,7 +300,7 @@ def _setup_run( }, inputs=inputs, run_type=run_type, - reference_example_id=langsmith_extra.get("reference_example_id"), + reference_example_id=reference_example_id, project_name=selected_project, extra=extra_inner, tags=tags_, @@ -266,6 +316,7 @@ def _setup_run( outer_project=outer_project, outer_metadata=outer_metadata, outer_tags=outer_tags, + on_end=langsmith_extra.get("on_end"), ) _PROJECT_NAME.set(response_container["project_name"]) _PARENT_RUN_TREE.set(response_container["new_run"]) @@ -291,7 +342,7 @@ class SupportsLangsmithExtra(Protocol, Generic[R]): Args: *args: Variable length arguments. - langsmith_extra (Optional[Dict[str, Any]]): Optional dictionary of + langsmith_extra (Optional[LangSmithExtra): Optional dictionary of additional parameters for Langsmith. **kwargs: Keyword arguments. @@ -302,7 +353,7 @@ class SupportsLangsmithExtra(Protocol, Generic[R]): def __call__( self, *args: Any, - langsmith_extra: Optional[Dict[str, Any]] = None, + langsmith_extra: Optional[LangSmithExtra] = None, **kwargs: Any, ) -> R: """Call the instance when it is called as a function. 
diff --git a/python/poetry.lock b/python/poetry.lock index 89c1800ff..2890bf05b 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -1041,6 +1041,17 @@ files = [ [package.dependencies] urllib3 = ">=2" +[[package]] +name = "types-tqdm" +version = "4.66.0.20240106" +description = "Typing stubs for tqdm" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-tqdm-4.66.0.20240106.tar.gz", hash = "sha256:7acf4aade5bad3ded76eb829783f9961b1c2187948eaa6dd1ae8644dff95a938"}, + {file = "types_tqdm-4.66.0.20240106-py3-none-any.whl", hash = "sha256:7459b0f441b969735685645a5d8480f7912b10d05ab45f99a2db8a8e45cb550b"}, +] + [[package]] name = "typing-extensions" version = "4.8.0" @@ -1126,4 +1137,4 @@ watchmedo = ["PyYAML (>=3.10)"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "79450ee1ccc1bb4c2c4b1a49ce97f336387f45a432d7c269737be112abfae8d7" +content-hash = "d410c02f3874131fe4b49ae7db5576b257263d38ed2972f5ce3c9d8ea6b010ef" diff --git a/python/pyproject.toml b/python/pyproject.toml index dbb10ad5d..5f7e9a53a 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langsmith" -version = "0.1.31" +version = "0.1.33" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." 
authors = ["LangChain "] license = "MIT" @@ -49,6 +49,7 @@ pytest-watcher = "^0.3.4" pytest-xdist = "^3.5.0" pytest-cov = "^4.1.0" dataclasses-json = "^0.6.4" +types-tqdm = "^4.66.0.20240106" [tool.poetry.group.lint.dependencies] openai = "^1.10" diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index bccfb2f55..3ee7c81a0 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -42,36 +42,6 @@ def langchain_client(monkeypatch: pytest.MonkeyPatch) -> Client: return Client() -def test_projects(langchain_client: Client, monkeypatch: pytest.MonkeyPatch) -> None: - """Test projects.""" - new_project = "__Test Project" - if langchain_client.has_project(new_project): - langchain_client.delete_project(project_name=new_project) - - monkeypatch.setenv("LANGCHAIN_ENDPOINT", "https://api.smith.langchain.com") - langchain_client.create_project( - project_name=new_project, - project_extra={"evaluator": "THE EVALUATOR"}, - ) - project = langchain_client.read_project(project_name=new_project) - assert project.name == new_project - runs = list(langchain_client.list_runs(project_name=new_project)) - project_id_runs = list(langchain_client.list_runs(project_id=project.id)) - assert len(runs) == len(project_id_runs) == 0 - langchain_client.delete_project(project_name=new_project) - - with pytest.raises(LangSmithError): - langchain_client.read_project(project_name=new_project) - assert new_project not in set( - [ - sess.name - for sess in langchain_client.list_projects(name_contains=new_project) - ] - ) - with pytest.raises(LangSmithError): - langchain_client.delete_project(project_name=new_project) - - def test_datasets(langchain_client: Client) -> None: """Test datasets.""" csv_content = "col1,col2\nval1,val2" @@ -182,22 +152,6 @@ def test_error_surfaced_invalid_uri(monkeypatch: pytest.MonkeyPatch, uri: str) - client.create_run("My Run", inputs={"text": "hello world"}, 
run_type="llm") -@freeze_time("2023-01-01") -def test_create_project( - monkeypatch: pytest.MonkeyPatch, langchain_client: Client -) -> None: - """Test the project creation""" - monkeypatch.setenv("LANGCHAIN_ENDPOINT", "https://api.smith.langchain.com") - project_name = "__test_create_project" + uuid4().hex[:4] - if langchain_client.has_project(project_name): - langchain_client.delete_project(project_name=project_name) - try: - project = langchain_client.create_project(project_name=project_name) - assert project.name == project_name - finally: - langchain_client.delete_project(project_name=project_name) - - def test_create_dataset( monkeypatch: pytest.MonkeyPatch, langchain_client: Client ) -> None: