From c8f1ad8b7169805b9aaf9c8e4b57f272b4590aba Mon Sep 17 00:00:00 2001 From: Greg Pauloski <18683347+gpauloski@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:53:43 -0500 Subject: [PATCH] Improve ProxystoreTransfer class - Reuse stateful Store object between calls to .proxy() - Do not reinitialize Store if already created in that process - Improve typing on .proxy() - Customize pickle behavior to ignore stateful attributes - Add evict setting for proxies - Add note on supporting customized serializers in future - Bump ProxyStore to 0.7.1 and later (should be okay now since flight is Python 3.11 and later and globus compute supports pydantic 2 now) --- flight/engine/data/proxy.py | 50 ++++++++++++++++++++++++++----------- pyproject.toml | 2 +- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/flight/engine/data/proxy.py b/flight/engine/data/proxy.py index 03e47ba..f19787a 100644 --- a/flight/engine/data/proxy.py +++ b/flight/engine/data/proxy.py @@ -5,7 +5,7 @@ import cloudpickle from proxystore.connectors.endpoint import EndpointConnector -from proxystore.store import Store +from proxystore.store import Store, get_or_create_store, get_store from ...federation.topologies import Topology from .base import AbstractTransfer @@ -13,9 +13,16 @@ if t.TYPE_CHECKING: from proxystore.proxy import Proxy +T = t.TypeVar("T") + class ProxystoreTransfer(AbstractTransfer): - def __init__(self, topo: Topology, name: str = "default") -> None: + def __init__( + self, + topo: Topology, + evict: bool = False, + name: str = "default", + ) -> None: if not topo.proxystore_ready: raise ValueError( "Flock is not ready to use ProxyStore (i.e., " @@ -30,15 +37,30 @@ def __init__(self, topo: Topology, name: str = "default") -> None: else: endpoints.append(node.proxystore_id) - self.name = name - self.connector = EndpointConnector(endpoints=endpoints) - store = Store( - name=name, - connector=self.connector, - serializer=cloudpickle.dumps, - deserializer=cloudpickle.loads, - ) 
- self.config = store.config() - - def __call__(self, data: t.Any) -> Proxy[t.Any]: - return Store.from_config(self.config).proxy(data) + store = get_store(name) + if store is None: + store = Store( + name=name, + connector=EndpointConnector(endpoints=endpoints), + # In the future, these could be customized (de)serializers + # that are optimized for Flight/PyTorch models. + serializer=cloudpickle.dumps, + deserializer=cloudpickle.loads, + register=True, + ) + self.evict = evict + self.store = store + + def __call__(self, data: T) -> Proxy[T]: + # evict=True is only safe when it is guaranteed that the proxy will + # only be used by a single consumer. + return self.store.proxy(data, evict=self.evict) + + def __getstate__(self) -> dict[str, t.Any]: + # Customize pickle behavior so that stateful objects are not pickled. + return {"config": self.store.config(), "evict": self.evict} + + def __setstate__(self, state: dict[str, t.Any]) -> None: + # Initialize an object from its pickled state. + self.evict = state["evict"] + self.store = get_or_create_store(state["config"], register=True) diff --git a/pyproject.toml b/pyproject.toml index a6430ba..e598fed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ dependencies = [ "numpy", "pandas", "parsl", - "proxystore", + "proxystore>=0.7.1", "scipy", "scikit-learn", "tqdm",