From 9b3c5f4764ac2b5a75e93e5b90ab1398eb119430 Mon Sep 17 00:00:00 2001 From: William Fu-Hinthorn <13333726+hinthornw@users.noreply.github.com> Date: Fri, 6 Sep 2024 16:20:01 -0700 Subject: [PATCH 1/3] Faster Serialization 1. Stop serializing arbitrary python objects 2. Stop recursing upon hitting unserializable object This should help reduce overhead of tracing when you have large or unserializable content --- python/langsmith/_testing.py | 3 +- python/langsmith/client.py | 121 ++++++++++++++++--------- python/tests/unit_tests/test_client.py | 109 ++++++++-------------- 3 files changed, 116 insertions(+), 117 deletions(-) diff --git a/python/langsmith/_testing.py b/python/langsmith/_testing.py index 3d5ac9c3b..7a7be742c 100644 --- a/python/langsmith/_testing.py +++ b/python/langsmith/_testing.py @@ -372,8 +372,7 @@ def _end_tests( def _serde_example_values(values: VT) -> VT: if values is None: return values - # Don't try to magically serialize Python objects, just use their REPRs. - bts = ls_client._dumps_json(values, serialize_py=False) + bts = ls_client._dumps_json(values) return orjson.loads(bts) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index 1f0797bbe..0a678a4aa 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -3,16 +3,20 @@ from __future__ import annotations import atexit +import base64 import collections import concurrent.futures as cf import datetime +import decimal import functools import importlib import importlib.metadata import io +import ipaddress import json import logging import os +import pathlib import random import re import sys @@ -36,6 +40,7 @@ List, Literal, Mapping, + NamedTuple, Optional, Sequence, Tuple, @@ -57,11 +62,20 @@ from langsmith import utils as ls_utils from langsmith._internal._beta_decorator import warn_beta +try: + from zoneinfo import ZoneInfo # type: ignore[import-not-found] +except ImportError: + + class ZoneInfo: # type: ignore[no-redef] + """Introduced in python 3.9.""" + + if TYPE_CHECKING: import pandas as pd # type: ignore from langsmith.evaluation import evaluator as ls_evaluator + logger = logging.getLogger(__name__) _urllib3_logger = logging.getLogger("urllib3.connectionpool") @@ -151,74 +165,99 @@ def _default_retry_config() -> Retry: return ls_utils.LangSmithRetry(**retry_params) # type: ignore -_MAX_DEPTH = 2 +_MAX_DEPTH = 1 + + +class _Fragment(NamedTuple): + buf: bytes -def _simple_default(obj: Any) -> Any: - # Don't traverse into nested objects +def _simple_default(obj): try: + # Only need to handle types that orjson doesn't serialize by default + # https://github.com/ijl/orjson#serialize + if isinstance(obj, _Fragment): + return orjson.Fragment(obj.buf) if isinstance(obj, datetime.datetime): return obj.isoformat() if isinstance(obj, uuid.UUID): return str(obj) - return json.loads(json.dumps(obj)) + if hasattr(obj, "model_dump") and callable(obj.model_dump): + return obj.model_dump() + elif hasattr(obj, "dict") and callable(obj.dict): + return obj.dict() + elif hasattr(obj, "_asdict") and callable(obj._asdict): + return obj._asdict() + elif isinstance(obj, BaseException): + return {"error": type(obj).__name__, "message": str(obj)} + elif isinstance(obj, (set, frozenset, collections.deque)): + return list(obj) + elif isinstance(obj, (datetime.timezone, ZoneInfo)): + return obj.tzname(None) + elif isinstance(obj, datetime.timedelta): + return obj.total_seconds() + elif isinstance(obj, decimal.Decimal): + if obj.as_tuple().exponent >= 0: + return int(obj) + else: + return float(obj) + elif isinstance( + obj, + ( + ipaddress.IPv4Address, + ipaddress.IPv4Interface, + ipaddress.IPv4Network, + ipaddress.IPv6Address, + ipaddress.IPv6Interface, + ipaddress.IPv6Network, + pathlib.Path, + ), + ): + return str(obj) + elif isinstance(obj, re.Pattern): + return obj.pattern + elif isinstance(obj, (bytes, bytearray)): + return base64.b64encode(obj).decode() + return repr(obj) except BaseException as e: logger.debug(f"Failed to serialize {type(obj)} to JSON: {e}") - return repr(obj) + return repr(obj) -def _serialize_json(obj: Any, depth: int = 0, serialize_py: bool = True) -> Any: +def _serialize_json(obj: Any, depth: int = 0) -> Any: try: if depth >= _MAX_DEPTH: try: - return orjson.loads(_dumps_json_single(obj)) + return _dumps_json_single(obj) except BaseException: return repr(obj) if isinstance(obj, bytes): return obj.decode("utf-8") if isinstance(obj, (set, tuple)): - return orjson.loads(_dumps_json_single(list(obj))) + if hasattr(obj, "_asdict") and callable(obj._asdict): + # NamedTuple + return obj._asdict() + return list(obj) serialization_methods = [ - ("model_dump_json", True), # Pydantic V2 - ("json", True), # Pydantic V1 - ("to_json", False), # dataclass_json ("model_dump", True), # Pydantic V2 with non-serializable fields - ("dict", False), # Pydantic V1 with non-serializable fields + ("dict", False), # Pydantic V1 with non-serializable field + ("to_dict", False), # dataclasses-json ] for attr, exclude_none in serialization_methods: if hasattr(obj, attr) and callable(getattr(obj, attr)): try: method = getattr(obj, attr) - json_str = ( + return ( method(exclude_none=exclude_none) if exclude_none else method() ) - if isinstance(json_str, str): - return json.loads(json_str) - return orjson.loads( - _dumps_json( - json_str, depth=depth + 1, serialize_py=serialize_py - ) - ) except Exception as e: - logger.debug(f"Failed to serialize {type(obj)} to JSON: {e}") + logger.error( + f"Failed to use {attr} to serialize {type(obj)} to" + f" JSON: {repr(e)}" + ) pass - if serialize_py: - all_attrs = {} - if hasattr(obj, "__slots__"): - all_attrs.update( - {slot: getattr(obj, slot, None) for slot in obj.__slots__} - ) - if hasattr(obj, "__dict__"): - all_attrs.update(vars(obj)) - if all_attrs: - filtered = { - k: v if v is not obj else repr(v) for k, v in all_attrs.items() - } - return orjson.loads( - _dumps_json(filtered, depth=depth + 1, serialize_py=serialize_py) - ) - return repr(obj) + return _simple_default(obj) except BaseException as e: logger.debug(f"Failed to serialize {type(obj)} to JSON: {e}") return repr(obj) @@ -236,7 +275,7 @@ def _dumps_json_single( try: return orjson.dumps( obj, - default=default, + default=default or _simple_default, option=orjson.OPT_SERIALIZE_NUMPY | orjson.OPT_SERIALIZE_DATACLASS | orjson.OPT_SERIALIZE_UUID @@ -259,7 +298,7 @@ def _dumps_json_single( return result -def _dumps_json(obj: Any, depth: int = 0, serialize_py: bool = True) -> bytes: +def _dumps_json(obj: Any, depth: int = 0) -> bytes: """Serialize an object to a JSON formatted string. Parameters @@ -274,9 +313,7 @@ def _dumps_json(obj: Any, depth: int = 0, serialize_py: bool = True) -> bytes: str The JSON formatted string. """ - return _dumps_json_single( - obj, functools.partial(_serialize_json, depth=depth, serialize_py=serialize_py) - ) + return _dumps_json_single(obj, functools.partial(_serialize_json, depth=depth)) def close_session(session: requests.Session) -> None: diff --git a/python/tests/unit_tests/test_client.py b/python/tests/unit_tests/test_client.py index 0e648ffc4..2e8b2043a 100644 --- a/python/tests/unit_tests/test_client.py +++ b/python/tests/unit_tests/test_client.py @@ -7,7 +7,6 @@ import json import math import sys -import threading import time import uuid import warnings @@ -15,11 +14,10 @@ from datetime import datetime, timezone from enum import Enum from io import BytesIO -from typing import Any, NamedTuple, Optional, Type, Union +from typing import Dict, NamedTuple, Optional, Type, Union from unittest import mock from unittest.mock import MagicMock, patch -import attr import dataclasses_json import orjson import pytest @@ -690,18 +688,20 @@ def __init__(self, x: int) -> None: self.a_dict = {"foo": "bar"} self.my_bytes = b"foo" + def __repr__(self) -> str: + return "I fell back" + + def __hash__(self) -> int: + return 1 + class ClassWithTee: def __init__(self) -> None: tee_a, tee_b = itertools.tee(range(10)) self.tee_a = tee_a self.tee_b = tee_b - class MyClassWithSlots: - __slots__ = ["x", "y"] - - def __init__(self, x: int) -> None: - self.x = x - self.y = "y" + def __repr__(self): + return "tee_a, tee_b" class MyPydantic(BaseModel): foo: str @@ -719,11 +719,11 @@ class MyEnum(str, Enum): FOO = "foo" BAR = "bar" - class ClassWithFakeJson: - def json(self): + class ClassWithFakeDict: + def dict(self) -> Dict: raise ValueError("This should not be called") - def to_json(self) -> dict: + def to_dict(self) -> Dict: return {"foo": "bar"} @dataclasses_json.dataclass_json @@ -731,40 +731,9 @@ def to_json(self) -> dict: class Person: name: str - @attr.dataclass - class AttrDict: - foo: str = attr.ib() - bar: int - uid = uuid.uuid4() current_time = datetime.now() - class NestedClass: - __slots__ = ["person", "lock"] - - def __init__(self) -> None: - self.person = Person(name="foo") - self.lock = [threading.Lock()] - - class CyclicClass: - def __init__(self) -> None: - self.cyclic = self - - def __repr__(self) -> str: - return "SoCyclic" - - class CyclicClass2: - def __init__(self) -> None: - self.cyclic: Any = None - self.other: Any = None - - def __repr__(self) -> str: - return "SoCyclic2" - - cycle_2 = CyclicClass2() - cycle_2.cyclic = CyclicClass2() - cycle_2.cyclic.other = cycle_2 - class MyNamedTuple(NamedTuple): foo: str bar: int @@ -774,59 +743,39 @@ class MyNamedTuple(NamedTuple): "time": current_time, "my_class": MyClass(1), "class_with_tee": ClassWithTee(), - "my_slotted_class": MyClassWithSlots(1), "my_dataclass": MyDataclass("foo", 1), "my_enum": MyEnum.FOO, "my_pydantic": MyPydantic(foo="foo", bar=1), - "person": Person(name="foo"), + "person": Person(name="foo_person"), "a_bool": True, "a_none": None, "a_str": "foo", "an_int": 1, "a_float": 1.1, - "nested_class": NestedClass(), - "attr_dict": AttrDict(foo="foo", bar=1), "named_tuple": MyNamedTuple(foo="foo", bar=1), - "cyclic": CyclicClass(), - "cyclic2": cycle_2, - "fake_json": ClassWithFakeJson(), + "fake_json": ClassWithFakeDict(), + "some_set": set("a"), + "set_with_class": set([MyClass(1)]), } res = orjson.loads(_dumps_json(to_serialize)) expected = { "uid": str(uid), "time": current_time.isoformat(), - "my_class": { - "x": 1, - "y": "y", - "a_list": [1, 2, 3], - "a_tuple": [1, 2, 3], - "a_set": [1, 2, 3], - "a_dict": {"foo": "bar"}, - "my_bytes": "foo", - }, - "class_with_tee": lambda val: all( - ["_tee object" in val[key] for key in ["tee_a", "tee_b"]] - ), - "my_slotted_class": {"x": 1, "y": "y"}, + "my_class": "I fell back", + "class_with_tee": "tee_a, tee_b", "my_dataclass": {"foo": "foo", "bar": 1}, "my_enum": "foo", "my_pydantic": {"foo": "foo", "bar": 1}, - "person": {"name": "foo"}, + "person": {"name": "foo_person"}, "a_bool": True, "a_none": None, "a_str": "foo", "an_int": 1, "a_float": 1.1, - "nested_class": ( - lambda val: val["person"] == {"name": "foo"} - and "_thread.lock object" in str(val.get("lock")) - ), - "attr_dict": {"foo": "foo", "bar": 1}, - "named_tuple": ["foo", 1], - "cyclic": {"cyclic": "SoCyclic"}, - # We don't really care about this case just want to not err - "cyclic2": lambda _: True, + "named_tuple": {"bar": 1, "foo": "foo"}, "fake_json": {"foo": "bar"}, + "some_set": ["a"], + "set_with_class": ["I fell back"], } assert set(expected) == set(res) for k, v in expected.items(): @@ -838,6 +787,20 @@ class MyNamedTuple(NamedTuple): except AssertionError: raise + @dataclasses.dataclass + class CyclicClass: + other: Optional["CyclicClass"] + + def __repr__(self) -> str: + return "my_cycles..." + + my_cyclic = CyclicClass(other=CyclicClass(other=None)) + my_cyclic.other.other = my_cyclic # type: ignore + + res = orjson.loads(_dumps_json({"cyclic": my_cyclic})) + assert res == {"cyclic": "my_cycles..."} + expected = {"foo": "foo", "bar": 1} + def test__dumps_json(): chars = "".join(chr(cp) for cp in range(0, sys.maxunicode + 1)) From eeaaeda690fb0ade695f1b4395a416d0da140ff0 Mon Sep 17 00:00:00 2001 From: William Fu-Hinthorn <13333726+hinthornw@users.noreply.github.com> Date: Fri, 6 Sep 2024 16:58:07 -0700 Subject: [PATCH 2/3] del --- python/langsmith/client.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index 0a678a4aa..736d19db8 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -165,9 +165,6 @@ def _default_retry_config() -> Retry: return ls_utils.LangSmithRetry(**retry_params) # type: ignore -_MAX_DEPTH = 1 - - class _Fragment(NamedTuple): buf: bytes @@ -224,15 +221,8 @@ def _simple_default(obj): return repr(obj) -def _serialize_json(obj: Any, depth: int = 0) -> Any: +def _serialize_json(obj: Any) -> Any: try: - if depth >= _MAX_DEPTH: - try: - return _dumps_json_single(obj) - except BaseException: - return repr(obj) - if isinstance(obj, bytes): - return obj.decode("utf-8") if isinstance(obj, (set, tuple)): if hasattr(obj, "_asdict") and callable(obj._asdict): # NamedTuple @@ -313,7 +303,7 @@ def _dumps_json(obj: Any, depth: int = 0) -> bytes: str The JSON formatted string. """ - return _dumps_json_single(obj, functools.partial(_serialize_json, depth=depth)) + return _dumps_json_single(obj, _serialize_json) def close_session(session: requests.Session) -> None: From 9331ccd9f995fe6a5c29cea0123f0b24d98650ff Mon Sep 17 00:00:00 2001 From: William Fu-Hinthorn <13333726+hinthornw@users.noreply.github.com> Date: Thu, 12 Sep 2024 17:12:03 -0700 Subject: [PATCH 3/3] feedback --- python/langsmith/client.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index 1d0137cc1..28a97055e 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -50,7 +50,6 @@ List, Literal, Mapping, - NamedTuple, Optional, Sequence, Tuple, @@ -175,16 +174,10 @@ def _default_retry_config() -> Retry: return ls_utils.LangSmithRetry(**retry_params) # type: ignore -class _Fragment(NamedTuple): - buf: bytes - - def _simple_default(obj): try: # Only need to handle types that orjson doesn't serialize by default # https://github.com/ijl/orjson#serialize - if isinstance(obj, _Fragment): - return orjson.Fragment(obj.buf) if isinstance(obj, datetime.datetime): return obj.isoformat() if isinstance(obj, uuid.UUID):