diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py
index 6a7f3ab3d..2ab94700c 100644
--- a/python/langsmith/evaluation/_runner.py
+++ b/python/langsmith/evaluation/_runner.py
@@ -647,16 +647,20 @@ def evaluate_comparative(
         ...         },
         ...     )
         ...     tool_args = completion.choices[0].message.tool_calls[0].function.arguments
-        ...     preference = json.loads(tool_args)["preferred_option"]
+        ...     loaded_args = json.loads(tool_args)
+        ...     preference = loaded_args["preferred_option"]
+        ...     comment = loaded_args["reasoning"]
         ...     if preference == "A":
         ...         return {
         ...             "key": "ranked_preference",
         ...             "scores": {runs[0].id: 1, runs[1].id: 0},
+        ...             "comment": comment,
         ...         }
         ...     else:
         ...         return {
         ...             "key": "ranked_preference",
         ...             "scores": {runs[0].id: 0, runs[1].id: 1},
+        ...             "comment": comment,
         ...         }
         >>> def score_length_difference(runs: list, example: schemas.Example):
         ...     # Just return whichever response is longer.
@@ -781,12 +785,18 @@ def evaluate_and_submit_feedback(
             result = comparator.compare_runs(runs_list, example)
             if client is None:
                 raise ValueError("Client is required to submit feedback.")
+            comments = (
+                {str(rid): result.comment for rid in result.scores}
+                if isinstance(result.comment, str)
+                else (result.comment or {})
+            )
             for run_id, score in result.scores.items():
                 executor.submit(
                     client.create_feedback,
                     run_id=run_id,
                     key=result.key,
                     score=score,
+                    comment=comments.get(str(run_id)),
                     comparative_experiment_id=comparative_experiment.id,
                     source_run_id=result.source_run_id,
                     feedback_group_id=feedback_group_id,
diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py
index feb0e95e4..668f99e9d 100644
--- a/python/langsmith/evaluation/evaluator.py
+++ b/python/langsmith/evaluation/evaluator.py
@@ -158,6 +158,9 @@ class ComparisonEvaluationResult(BaseModel):
     """The scores for each run in the comparison."""
     source_run_id: Optional[Union[uuid.UUID, str]] = None
     """The ID of the trace of the evaluator itself."""
+    comment: Optional[Union[str, Dict[Union[uuid.UUID, str], str]]] = None
+    """Comment for the scores. If a string, it's shared across all target runs.
+    If a dict, it maps run IDs to individual comments."""


 _COMPARISON_OUTPUT = Union[ComparisonEvaluationResult, dict]
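
For reference, a minimal sketch of how an evaluator might populate the new `comment` field. The evaluator name `prefer_shorter` and the run objects are hypothetical stand-ins for illustration, not part of this PR:

    # Hypothetical usage sketch for the new `comment` field (not part of this PR).
    from langsmith.evaluation.evaluator import ComparisonEvaluationResult


    def prefer_shorter(runs: list, example) -> ComparisonEvaluationResult:
        # Rank two runs by output length; the shorter response wins.
        a, b = runs[0], runs[1]
        a_len, b_len = len(str(a.outputs)), len(str(b.outputs))
        scores = {a.id: 1, b.id: 0} if a_len <= b_len else {a.id: 0, b.id: 1}
        return ComparisonEvaluationResult(
            key="prefer_shorter",
            scores=scores,
            # A plain string is fanned out to every run in `scores`; a dict
            # keyed by run ID would attach a distinct comment to each run instead,
            # e.g. comment={a.id: "too long", b.id: "concise"}.
            comment=f"Response lengths: A={a_len}, B={b_len}",
        )

This mirrors the fan-out logic added to `evaluate_and_submit_feedback` above: a string comment is copied to every run in `scores`, while a dict is looked up per run ID.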