From 5b4413514bf31f5d1cb0a0d67e419767aff72d60 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Tue, 14 May 2024 11:54:06 -0400 Subject: [PATCH 01/41] reuse policy --- hello/hello_update.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hello/hello_update.py b/hello/hello_update.py index 111d95b1..bf439b15 100644 --- a/hello/hello_update.py +++ b/hello/hello_update.py @@ -1,6 +1,6 @@ import asyncio -from temporalio import workflow +from temporalio import common, workflow from temporalio.client import Client from temporalio.worker import Worker @@ -37,6 +37,7 @@ async def main(): GreetingWorkflow.run, id="hello-update-workflow-id", task_queue="update-workflow-task-queue", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, ) # Perform the update for GreetingWorkflow From abec6c0acdcbbc5fd0004e82ed48a392171a52d5 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Wed, 22 May 2024 04:33:32 -0400 Subject: [PATCH 02/41] Signal and update design patterns --- ...aces_between_handler_and_main_coroutine.py | 129 ++++++++++++++++++ .../order_handling_of_n_messages.py | 78 +++++++++++ .../order_handling_of_two_messages.py | 57 ++++++++ ...rder_handling_relative_to_main_workflow.py | 83 +++++++++++ 4 files changed, 347 insertions(+) create mode 100644 signals_and_updates/avoid_races_between_handler_and_main_coroutine.py create mode 100644 signals_and_updates/order_handling_of_n_messages.py create mode 100644 signals_and_updates/order_handling_of_two_messages.py create mode 100644 signals_and_updates/order_handling_relative_to_main_workflow.py diff --git a/signals_and_updates/avoid_races_between_handler_and_main_coroutine.py b/signals_and_updates/avoid_races_between_handler_and_main_coroutine.py new file mode 100644 index 00000000..86d5f0b3 --- /dev/null +++ b/signals_and_updates/avoid_races_between_handler_and_main_coroutine.py @@ -0,0 +1,129 @@ +import asyncio +import logging +from datetime import timedelta + +from temporalio import activity, common, workflow +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker + + +class WorkflowBase: + def __init__(self) -> None: + self.letters = [] + + async def get_letters(self, text: str): + for i in range(len(text)): + letter = await workflow.execute_activity( + get_letter, + args=[text, i], + start_to_close_timeout=timedelta(seconds=10), + ) + self.letters.append(letter) + + +@workflow.defn +class AccumulateLettersIncorrect(WorkflowBase): + """ + This workflow implementation is incorrect: the handler execution interleaves with the main + workflow coroutine. + """ + + def __init__(self) -> None: + super().__init__() + self.handler_started = False + + @workflow.run + async def run(self) -> str: + await workflow.wait_condition(lambda: self.handler_started) + await self.get_letters( + "world!" 
+ ) # Bug: handler and main wf are now interleaving + + await workflow.wait_condition(lambda: len(self.letters) == len("Hello world!")) + return "".join(self.letters) + + @workflow.update + async def put_letters(self, text: str): + self.handler_started = True + await self.get_letters(text) + + +@workflow.defn +class AccumulateLettersCorrect1(WorkflowBase): + def __init__(self) -> None: + super().__init__() + self.handler_text = asyncio.Future[str]() + + @workflow.run + async def run(self) -> str: + handler_text = await self.handler_text + await self.get_letters(handler_text) + await self.get_letters("world!") + await workflow.wait_condition(lambda: len(self.letters) == len("Hello world!")) + return "".join(self.letters) + + # Note: sync handler + @workflow.update + def put_letters(self, text: str): + self.handler_text.set_result(text) + + +@workflow.defn +class AccumulateLettersCorrect2(WorkflowBase): + def __init__(self) -> None: + super().__init__() + self.handler_complete = False + + @workflow.run + async def run(self) -> str: + await workflow.wait_condition(lambda: self.handler_complete) + await self.get_letters("world!") + await workflow.wait_condition(lambda: len(self.letters) == len("Hello world!")) + return "".join(self.letters) + + @workflow.update + async def put_letters(self, text: str): + await self.get_letters(text) + self.handler_complete = True + + +@activity.defn +async def get_letter(text: str, i: int) -> str: + return text[i] + + +async def app(wf: WorkflowHandle): + await wf.execute_update(AccumulateLettersCorrect1.put_letters, args=["Hello "]) + print(await wf.result()) + + +async def main(): + client = await Client.connect("localhost:7233") + + async with Worker( + client, + task_queue="tq", + workflows=[ + AccumulateLettersIncorrect, + AccumulateLettersCorrect1, + AccumulateLettersCorrect2, + ], + activities=[get_letter], + ): + for wf in [ + AccumulateLettersIncorrect, + AccumulateLettersCorrect1, + AccumulateLettersCorrect2, + ]: + handle = await client.start_workflow( + wf.run, + id="wid", + task_queue="tq", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + ) + await app(handle) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + asyncio.run(main()) diff --git a/signals_and_updates/order_handling_of_n_messages.py b/signals_and_updates/order_handling_of_n_messages.py new file mode 100644 index 00000000..a757d42a --- /dev/null +++ b/signals_and_updates/order_handling_of_n_messages.py @@ -0,0 +1,78 @@ +import asyncio +import logging +import random + +from temporalio import common, workflow +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker + +Payload = str + + +class Queue: + def __init__(self) -> None: + self._head = 0 + self._futures: dict[int, asyncio.Future[Payload]] = {} + self.lock = asyncio.Lock() + + def add(self, item: Payload, position: int): + fut = self._futures.setdefault(position, asyncio.Future()) + if not fut.done(): + fut.set_result(item) + else: + workflow.logger.warn(f"duplicate message for position {position}") + + async def next(self) -> Payload: + async with self.lock: + payload = await self._futures.setdefault(self._head, asyncio.Future()) + self._head += 1 + return payload + + +@workflow.defn +class MessageProcessor: + def __init__(self) -> None: + self.queue = Queue() + + @workflow.run + async def run(self): + while True: + payload = await self.queue.next() + workflow.logger.info(payload) + if workflow.info().is_continue_as_new_suggested(): + 
workflow.continue_as_new() + + @workflow.update + def process_message(self, sequence_number: int, payload: Payload): # sync handler + self.queue.add(payload, sequence_number) + + +async def app(wf: WorkflowHandle): + sequence_numbers = list(range(10)) + random.shuffle(sequence_numbers) + for i in sequence_numbers: + await wf.execute_update( + MessageProcessor.process_message, args=[i, f"payload {i}"] + ) + + +async def main(): + client = await Client.connect("localhost:7233") + + async with Worker( + client, + task_queue="tq", + workflows=[MessageProcessor], + ): + wf = await client.start_workflow( + MessageProcessor.run, + id="wid", + task_queue="tq", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + ) + await app(wf) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + asyncio.run(main()) diff --git a/signals_and_updates/order_handling_of_two_messages.py b/signals_and_updates/order_handling_of_two_messages.py new file mode 100644 index 00000000..c4e2c64c --- /dev/null +++ b/signals_and_updates/order_handling_of_two_messages.py @@ -0,0 +1,57 @@ +import asyncio +import logging + +from temporalio import common, workflow +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker + + +@workflow.defn +class WashAndDryCycle: + + def __init__(self) -> None: + self.wash_complete = False + self.dry_complete = False + + @workflow.run + async def run(self): + await workflow.wait_condition(lambda: self.dry_complete) + + @workflow.update + async def wash(self): + self.wash_complete = True + workflow.logger.info("washing") + + @workflow.update + async def dry(self): + await workflow.wait_condition(lambda: self.wash_complete) + self.dry_complete = True + workflow.logger.info("drying") + + +async def app(wf: WorkflowHandle): + await asyncio.gather( + wf.execute_update(WashAndDryCycle.dry), wf.execute_update(WashAndDryCycle.wash) + ) + + +async def main(): + client = await Client.connect("localhost:7233") + + async with Worker( + client, + task_queue="tq", + workflows=[WashAndDryCycle], + ): + handle = await client.start_workflow( + WashAndDryCycle.run, + id="wid", + task_queue="tq", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + ) + await app(handle) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + asyncio.run(main()) diff --git a/signals_and_updates/order_handling_relative_to_main_workflow.py b/signals_and_updates/order_handling_relative_to_main_workflow.py new file mode 100644 index 00000000..44baa300 --- /dev/null +++ b/signals_and_updates/order_handling_relative_to_main_workflow.py @@ -0,0 +1,83 @@ +import asyncio +import logging +from datetime import timedelta + +from temporalio import activity, common, workflow +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker + + +@workflow.defn +class WashAndDryCycle: + + def __init__(self) -> None: + self.has_detergent = False + self.wash_complete = False + self.non_dryables_removed = False + self.dry_complete = False + + @workflow.run + async def run(self): + await workflow.execute_activity( + add_detergent, start_to_close_timeout=timedelta(seconds=10) + ) + self.has_detergent = True + await workflow.wait_condition(lambda: self.wash_complete) + await workflow.execute_activity( + remove_non_dryables, start_to_close_timeout=timedelta(seconds=10) + ) + self.non_dryables_removed = True + await workflow.wait_condition(lambda: self.dry_complete) + + @workflow.update + async def wash(self): + await 
workflow.wait_condition(lambda: self.has_detergent) + self.wash_complete = True + workflow.logger.info("washing") + + @workflow.update + async def dry(self): + await workflow.wait_condition( + lambda: self.wash_complete and self.non_dryables_removed + ) + self.dry_complete = True + workflow.logger.info("drying") + + +@activity.defn +async def add_detergent(): + print("adding detergent") + + +@activity.defn +async def remove_non_dryables(): + print("removing non-dryables") + + +async def app(wf: WorkflowHandle): + await asyncio.gather( + wf.execute_update(WashAndDryCycle.dry), wf.execute_update(WashAndDryCycle.wash) + ) + + +async def main(): + client = await Client.connect("localhost:7233") + + async with Worker( + client, + task_queue="tq", + workflows=[WashAndDryCycle], + activities=[add_detergent, remove_non_dryables], + ): + handle = await client.start_workflow( + WashAndDryCycle.run, + id="wid", + task_queue="tq", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + ) + await app(handle) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + asyncio.run(main()) From 6d2df9010cfc64fd1707712ec4415cebdc7dbda4 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Wed, 22 May 2024 10:27:28 -0400 Subject: [PATCH 03/41] Carry state over CAN --- .../order_handling_of_n_messages.py | 62 +++++++++++++++---- 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/signals_and_updates/order_handling_of_n_messages.py b/signals_and_updates/order_handling_of_n_messages.py index a757d42a..3169b74e 100644 --- a/signals_and_updates/order_handling_of_n_messages.py +++ b/signals_and_updates/order_handling_of_n_messages.py @@ -1,18 +1,20 @@ import asyncio import logging import random +from typing import Optional from temporalio import common, workflow from temporalio.client import Client, WorkflowHandle from temporalio.worker import Worker Payload = str +SerializedQueueState = tuple[int, list[tuple[int, Payload]]] -class Queue: - def __init__(self) -> None: - self._head = 0 - self._futures: dict[int, asyncio.Future[Payload]] = {} +class OrderedQueue: + def __init__(self): + self._futures = {} + self.head = 0 self.lock = asyncio.Lock() def add(self, item: Payload, position: int): @@ -24,33 +26,69 @@ def add(self, item: Payload, position: int): async def next(self) -> Payload: async with self.lock: - payload = await self._futures.setdefault(self._head, asyncio.Future()) - self._head += 1 + payload = await self._futures.setdefault(self.head, asyncio.Future()) + # Note: user must delete the payload to avoid unbounded memory usage + del self._futures[self.head] + self.head += 1 return payload + def serialize(self) -> SerializedQueueState: + payloads = [(i, fut.result()) for i, fut in self._futures.items() if fut.done()] + return (self.head, payloads) + + # This is bad, but AFAICS it's the best we can do currently until we have a workflow init + # functionality in all SDKs (https://github.com/temporalio/features/issues/400). Currently the + # problem is: if a signal/update handler is sync, then it cannot wait for anything in the main + # wf coroutine. After CAN, a message may arrive in the first WFT, but the sync handler cannot + # wait for wf state to be initialized. So we are forced to update an *existing* queue with the + # carried-over state. 
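+    # For illustration (hypothetical values): serialized_state == (3, [(5, "payload 5"), (7, "payload 7")])
+    # means the head has advanced past positions 0-2, and the out-of-order messages for positions
+    # 5 and 7 are buffered while positions 3 and 4 are still awaited.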
+ def update_from_serialized_state(self, serialized_state: SerializedQueueState): + head, payloads = serialized_state + self.head = head + for i, p in payloads: + if i in self._futures: + workflow.logger.error(f"duplicate message {i} encountered when deserializing state carried over CAN") + else: + self._futures[i] = resolved_future(p) + + +def resolved_future[X](x: X) -> asyncio.Future[X]: + fut = asyncio.Future[X]() + fut.set_result(x) + return fut + + @workflow.defn class MessageProcessor: def __init__(self) -> None: - self.queue = Queue() + self.queue = OrderedQueue() @workflow.run - async def run(self): + async def run(self, serialized_queue_state: Optional[SerializedQueueState] = None): + # Initialize workflow state after CAN. Note that handler is sync, so it cannot wait for + # workflow initialization. + if serialized_queue_state: + self.queue.update_from_serialized_state(serialized_queue_state) while True: + workflow.logger.info(f"waiting for msg {self.queue.head + 1}") payload = await self.queue.next() workflow.logger.info(payload) if workflow.info().is_continue_as_new_suggested(): - workflow.continue_as_new() + workflow.logger.info("CAN") + workflow.continue_as_new(args=[self.queue.serialize()]) + # Note: sync handler @workflow.update - def process_message(self, sequence_number: int, payload: Payload): # sync handler + def process_message(self, sequence_number: int, payload: Payload): self.queue.add(payload, sequence_number) async def app(wf: WorkflowHandle): - sequence_numbers = list(range(10)) + sequence_numbers = list(range(100)) random.shuffle(sequence_numbers) for i in sequence_numbers: + print(f"sending update {i}") await wf.execute_update( MessageProcessor.process_message, args=[i, f"payload {i}"] ) @@ -70,7 +108,7 @@ async def main(): task_queue="tq", id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, ) - await app(wf) + await asyncio.gather(app(wf), wf.result()) if __name__ == "__main__": From 0f912102d5ee9c9d17e7974b32f6dff579ca4b92 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Wed, 22 May 2024 11:07:37 -0400 Subject: [PATCH 04/41] Serialized, non-ordered handling of n messages --- .../serialized_handling_of_n_messages.py | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 signals_and_updates/serialized_handling_of_n_messages.py diff --git a/signals_and_updates/serialized_handling_of_n_messages.py b/signals_and_updates/serialized_handling_of_n_messages.py new file mode 100644 index 00000000..7c6347c4 --- /dev/null +++ b/signals_and_updates/serialized_handling_of_n_messages.py @@ -0,0 +1,122 @@ +import asyncio +import logging +from datetime import timedelta +from typing import Optional + +from temporalio import activity, common, workflow +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker + +Arg = str +Result = str + +# Problem: +# ------- +# - Your workflow receives an unbounded number of updates. +# - Each update must be processed by calling two activities. +# - The next update may not start processing until the previous has completed. + +# Solution: +# -------- +# Enqueue updates, and process items from the queue in a single coroutine (the main workflow +# coroutine). + +# Discussion: +# ---------- +# The queue is used because Temporal's async update & signal handlers will interleave if they +# contain multiple yield points. An alternative would be to use standard async handler functions, +# with handling being done with an asyncio.Lock held. 
The queue approach would be necessary if we +# need to process in an order other than arrival. + + +class Queue(asyncio.Queue[tuple[Arg, asyncio.Future[Result]]]): + def __init__(self, serialized_queue_state: list[Arg]) -> None: + super().__init__() + for arg in serialized_queue_state: + self.put_nowait((arg, asyncio.Future())) + + def serialize(self) -> list[Arg]: + args = [] + while True: + try: + args.append(self.get_nowait()) + except asyncio.QueueEmpty: + return args + + +@workflow.defn +class MessageProcessor: + + @workflow.run + async def run(self, serialized_queue_state: Optional[list[Arg]] = None): + self.queue = Queue(serialized_queue_state or []) + while True: + arg, fut = await self.queue.get() + fut.set_result(await self.process_task(arg)) + if workflow.info().is_continue_as_new_suggested(): + # Footgun: If we don't let the event loop tick, then CAN will end the workflow + # before the update handler is notified that the result future has completed. + # See https://github.com/temporalio/features/issues/481 + await asyncio.sleep(0) # Let update handler complete + print("CAN") + return workflow.continue_as_new(args=[self.queue.serialize()]) + + # Note: handler must be async if we are both enqueuing, and returning an update result + # => We could add SDK APIs to manually complete updates. + @workflow.update + async def add_task(self, arg: Arg) -> Result: + # Footgun: handler must wait for workflow initialization + # See https://github.com/temporalio/features/issues/400 + await workflow.wait_condition(lambda: hasattr(self, "queue")) + fut = asyncio.Future[Result]() + self.queue.put_nowait((arg, fut)) # Note: update validation gates enqueue + return await fut + + async def process_task(self, arg): + t1, t2 = [ + await workflow.execute_activity( + get_current_time, start_to_close_timeout=timedelta(seconds=10) + ) + for _ in range(2) + ] + return f"{arg}-result-{t1}-{t2}" + + +time = 0 + + +@activity.defn +async def get_current_time() -> int: + global time + time += 1 + return time + + +async def app(wf: WorkflowHandle): + for i in range(20): + print(f"app(): sending update {i}") + result = await wf.execute_update(MessageProcessor.add_task, f"arg {i}") + print(f"app(): {result}") + + +async def main(): + client = await Client.connect("localhost:7233") + + async with Worker( + client, + task_queue="tq", + workflows=[MessageProcessor], + activities=[get_current_time], + ): + wf = await client.start_workflow( + MessageProcessor.run, + id="wid", + task_queue="tq", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + ) + await asyncio.gather(app(wf), wf.result()) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + asyncio.run(main()) From 8060e509976b2013814a8a03ec6aa7039b37361e Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Wed, 22 May 2024 16:24:46 -0700 Subject: [PATCH 05/41] Add return values to WashAndDryCycle sample --- .../order_handling_of_two_messages.py | 42 +++++++++++++------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/signals_and_updates/order_handling_of_two_messages.py b/signals_and_updates/order_handling_of_two_messages.py index c4e2c64c..6731e6c4 100644 --- a/signals_and_updates/order_handling_of_two_messages.py +++ b/signals_and_updates/order_handling_of_two_messages.py @@ -1,4 +1,5 @@ import asyncio +from dataclasses import dataclass import logging from temporalio import common, workflow @@ -6,32 +7,47 @@ from temporalio.worker import Worker +# Shows how to make a pair of update or signal handlers run in a 
certain order even +# if they are received out of order. @workflow.defn class WashAndDryCycle: + @dataclass + class WashResults: + num_items: int + + @dataclass + class DryResults: + num_items: int + moisture_level: int + def __init__(self) -> None: - self.wash_complete = False - self.dry_complete = False + self._wash_results: WashAndDryCycle.WashResults = None + self._dry_results: WashAndDryCycle.DryResults = None @workflow.run async def run(self): - await workflow.wait_condition(lambda: self.dry_complete) + await workflow.wait_condition(lambda: self._dry_results is not None) + workflow.logger.info( + f"Finished washing and drying {self._dry_results.num_items} items, moisture level: {self._dry_results.moisture_level}" + ) @workflow.update - async def wash(self): - self.wash_complete = True - workflow.logger.info("washing") + async def wash(self, num_items) -> WashResults: + self._wash_results = WashAndDryCycle.WashResults(num_items=num_items) + return self._wash_results @workflow.update - async def dry(self): - await workflow.wait_condition(lambda: self.wash_complete) - self.dry_complete = True - workflow.logger.info("drying") - - + async def dry(self) -> DryResults: + await workflow.wait_condition(lambda: self._wash_results is not None) + + self._dry_results = WashAndDryCycle.DryResults(num_items=self._wash_results.num_items, moisture_level=3) + return self._dry_results + async def app(wf: WorkflowHandle): + # In normal operation, wash comes before dry, but here we simulate out-of-order receipt of messages await asyncio.gather( - wf.execute_update(WashAndDryCycle.dry), wf.execute_update(WashAndDryCycle.wash) + wf.execute_update(WashAndDryCycle.dry), wf.execute_update(WashAndDryCycle.wash, 10) ) From 06e56c20acae7d422523e7cb640f2ef5e35cd96b Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 23 May 2024 04:57:58 -0400 Subject: [PATCH 06/41] rename directory --- .../avoid_races_between_handler_and_main_coroutine.py | 0 {signals_and_updates => update}/order_handling_of_n_messages.py | 0 {signals_and_updates => update}/order_handling_of_two_messages.py | 0 .../order_handling_relative_to_main_workflow.py | 0 .../serialized_handling_of_n_messages.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename {signals_and_updates => update}/avoid_races_between_handler_and_main_coroutine.py (100%) rename {signals_and_updates => update}/order_handling_of_n_messages.py (100%) rename {signals_and_updates => update}/order_handling_of_two_messages.py (100%) rename {signals_and_updates => update}/order_handling_relative_to_main_workflow.py (100%) rename {signals_and_updates => update}/serialized_handling_of_n_messages.py (100%) diff --git a/signals_and_updates/avoid_races_between_handler_and_main_coroutine.py b/update/avoid_races_between_handler_and_main_coroutine.py similarity index 100% rename from signals_and_updates/avoid_races_between_handler_and_main_coroutine.py rename to update/avoid_races_between_handler_and_main_coroutine.py diff --git a/signals_and_updates/order_handling_of_n_messages.py b/update/order_handling_of_n_messages.py similarity index 100% rename from signals_and_updates/order_handling_of_n_messages.py rename to update/order_handling_of_n_messages.py diff --git a/signals_and_updates/order_handling_of_two_messages.py b/update/order_handling_of_two_messages.py similarity index 100% rename from signals_and_updates/order_handling_of_two_messages.py rename to update/order_handling_of_two_messages.py diff --git a/signals_and_updates/order_handling_relative_to_main_workflow.py 
b/update/order_handling_relative_to_main_workflow.py similarity index 100% rename from signals_and_updates/order_handling_relative_to_main_workflow.py rename to update/order_handling_relative_to_main_workflow.py diff --git a/signals_and_updates/serialized_handling_of_n_messages.py b/update/serialized_handling_of_n_messages.py similarity index 100% rename from signals_and_updates/serialized_handling_of_n_messages.py rename to update/serialized_handling_of_n_messages.py From 53fd1af6eb4c4e4cd9e9b96569dfcb44fdf44ff5 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 23 May 2024 04:58:25 -0400 Subject: [PATCH 07/41] Whitespace --- update/order_handling_of_two_messages.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/update/order_handling_of_two_messages.py b/update/order_handling_of_two_messages.py index 6731e6c4..7c063db6 100644 --- a/update/order_handling_of_two_messages.py +++ b/update/order_handling_of_two_messages.py @@ -1,22 +1,22 @@ import asyncio -from dataclasses import dataclass import logging +from dataclasses import dataclass from temporalio import common, workflow from temporalio.client import Client, WorkflowHandle from temporalio.worker import Worker -# Shows how to make a pair of update or signal handlers run in a certain order even +# Shows how to make a pair of update or signal handlers run in a certain order even # if they are received out of order. @workflow.defn class WashAndDryCycle: @dataclass class WashResults: - num_items: int + num_items: int - @dataclass + @dataclass class DryResults: num_items: int moisture_level: int @@ -30,7 +30,7 @@ async def run(self): await workflow.wait_condition(lambda: self._dry_results is not None) workflow.logger.info( f"Finished washing and drying {self._dry_results.num_items} items, moisture level: {self._dry_results.moisture_level}" - ) + ) @workflow.update async def wash(self, num_items) -> WashResults: @@ -40,14 +40,18 @@ async def wash(self, num_items) -> WashResults: @workflow.update async def dry(self) -> DryResults: await workflow.wait_condition(lambda: self._wash_results is not None) - - self._dry_results = WashAndDryCycle.DryResults(num_items=self._wash_results.num_items, moisture_level=3) + + self._dry_results = WashAndDryCycle.DryResults( + num_items=self._wash_results.num_items, moisture_level=3 + ) return self._dry_results - + + async def app(wf: WorkflowHandle): # In normal operation, wash comes before dry, but here we simulate out-of-order receipt of messages await asyncio.gather( - wf.execute_update(WashAndDryCycle.dry), wf.execute_update(WashAndDryCycle.wash, 10) + wf.execute_update(WashAndDryCycle.dry), + wf.execute_update(WashAndDryCycle.wash, 10), ) From 67f70eead5498d28876e1beb4b6e4b5df05c4eb9 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 23 May 2024 05:00:12 -0400 Subject: [PATCH 08/41] Appease type checker --- update/order_handling_of_two_messages.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/update/order_handling_of_two_messages.py b/update/order_handling_of_two_messages.py index 7c063db6..0aa0234a 100644 --- a/update/order_handling_of_two_messages.py +++ b/update/order_handling_of_two_messages.py @@ -1,6 +1,7 @@ import asyncio import logging from dataclasses import dataclass +from typing import Optional from temporalio import common, workflow from temporalio.client import Client, WorkflowHandle @@ -22,12 +23,13 @@ class DryResults: moisture_level: int def __init__(self) -> None: - self._wash_results: WashAndDryCycle.WashResults 
= None - self._dry_results: WashAndDryCycle.DryResults = None + self._wash_results: Optional[WashAndDryCycle.WashResults] = None + self._dry_results: Optional[WashAndDryCycle.DryResults] = None @workflow.run async def run(self): await workflow.wait_condition(lambda: self._dry_results is not None) + assert self._dry_results workflow.logger.info( f"Finished washing and drying {self._dry_results.num_items} items, moisture level: {self._dry_results.moisture_level}" ) @@ -40,7 +42,7 @@ async def wash(self, num_items) -> WashResults: @workflow.update async def dry(self) -> DryResults: await workflow.wait_condition(lambda: self._wash_results is not None) - + assert self._wash_results self._dry_results = WashAndDryCycle.DryResults( num_items=self._wash_results.num_items, moisture_level=3 ) From 379024787a857c5ccb8a85ecd7ab826e8ce3e874 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 23 May 2024 05:11:56 -0400 Subject: [PATCH 09/41] Silence warnings in 3rd party code --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 44b97956..fbc3c038 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,10 @@ asyncio_mode = "auto" log_cli = true log_cli_level = "INFO" log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)" +filterwarnings = [ + "ignore::DeprecationWarning:google\\..*", + "ignore::DeprecationWarning:importlib\\..*" +] [tool.isort] profile = "black" From cdd24e8fe69d93b57f2d3a6de5cecdb8f834fe2c Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 23 May 2024 05:36:50 -0400 Subject: [PATCH 10/41] Tests --- .../serialized_handling_of_n_messages.py | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 tests/update/serialized_handling_of_n_messages.py diff --git a/tests/update/serialized_handling_of_n_messages.py b/tests/update/serialized_handling_of_n_messages.py new file mode 100644 index 00000000..86af008d --- /dev/null +++ b/tests/update/serialized_handling_of_n_messages.py @@ -0,0 +1,95 @@ +import asyncio +import logging +import uuid +from dataclasses import dataclass +from unittest.mock import patch + +import temporalio.api.common.v1 +import temporalio.api.enums.v1 +import temporalio.api.update.v1 +import temporalio.api.workflowservice.v1 +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker +from temporalio.workflow import UpdateMethodMultiParam + +from update.serialized_handling_of_n_messages import ( + MessageProcessor, + Result, + get_current_time, +) + + +async def test_continue_as_new_doesnt_lose_updates(client: Client): + with patch( + "temporalio.workflow.Info.is_continue_as_new_suggested", return_value=True + ): + tq = str(uuid.uuid4()) + wf = await client.start_workflow( + MessageProcessor.run, id=str(uuid.uuid4()), task_queue=tq + ) + update_requests = [ + UpdateRequest(wf, MessageProcessor.add_task, i) for i in range(10) + ] + for req in update_requests: + await req.wait_until_admitted() + + async with Worker( + client, + task_queue=tq, + workflows=[MessageProcessor], + activities=[get_current_time], + ): + for req in update_requests: + update_result = await req.task + assert update_result.startswith(req.expected_result_prefix()) + + +@dataclass +class UpdateRequest: + wf_handle: WorkflowHandle + update: UpdateMethodMultiParam + sequence_number: int + + def __post_init__(self): + self.task = asyncio.Task[Result]( + self.wf_handle.execute_update(self.update, args=[self.arg], id=self.id) + ) + + async def 
wait_until_admitted(self): + while True: + try: + return await self._poll_update_non_blocking() + except Exception as err: + logging.warning(err) + + async def _poll_update_non_blocking(self): + req = temporalio.api.workflowservice.v1.PollWorkflowExecutionUpdateRequest( + namespace=self.wf_handle._client.namespace, + update_ref=temporalio.api.update.v1.UpdateRef( + workflow_execution=temporalio.api.common.v1.WorkflowExecution( + workflow_id=self.wf_handle.id, + run_id="", + ), + update_id=self.id, + ), + identity=self.wf_handle._client.identity, + ) + res = await self.wf_handle._client.workflow_service.poll_workflow_execution_update( + req + ) + # TODO: @cretz how do we work with these raw proto objects? + assert "stage: UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ADMITTED" in str(res) + + @property + def arg(self) -> str: + return str(self.sequence_number) + + @property + def id(self) -> str: + return str(self.sequence_number) + + def expected_result_prefix(self) -> str: + # TODO: Currently the server does not send updates to the worker in order of admission When + # this is fixed (https://github.com/temporalio/temporal/pull/5831), we can make a stronger + # assertion about the activity numbers used to construct each result. + return f"{self.arg}-result" From 511ce73469edfa57163c2d9a3802f7086e9db2da Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 23 May 2024 08:38:35 -0400 Subject: [PATCH 11/41] Use non-blocking queue and drain it before CAN --- update/serialized_handling_of_n_messages.py | 31 ++++++--------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/update/serialized_handling_of_n_messages.py b/update/serialized_handling_of_n_messages.py index 7c6347c4..1ef00a53 100644 --- a/update/serialized_handling_of_n_messages.py +++ b/update/serialized_handling_of_n_messages.py @@ -1,7 +1,7 @@ import asyncio import logging +from collections import deque from datetime import timedelta -from typing import Optional from temporalio import activity, common, workflow from temporalio.client import Client, WorkflowHandle @@ -29,37 +29,24 @@ # need to process in an order other than arrival. -class Queue(asyncio.Queue[tuple[Arg, asyncio.Future[Result]]]): - def __init__(self, serialized_queue_state: list[Arg]) -> None: - super().__init__() - for arg in serialized_queue_state: - self.put_nowait((arg, asyncio.Future())) - - def serialize(self) -> list[Arg]: - args = [] - while True: - try: - args.append(self.get_nowait()) - except asyncio.QueueEmpty: - return args - - @workflow.defn class MessageProcessor: @workflow.run - async def run(self, serialized_queue_state: Optional[list[Arg]] = None): - self.queue = Queue(serialized_queue_state or []) + async def run(self): + self.queue = deque[tuple[Arg, asyncio.Future[Result]]]() while True: - arg, fut = await self.queue.get() - fut.set_result(await self.process_task(arg)) + await workflow.wait_condition(lambda: len(self.queue) > 0) + while self.queue: + arg, fut = self.queue.popleft() + fut.set_result(await self.process_task(arg)) if workflow.info().is_continue_as_new_suggested(): # Footgun: If we don't let the event loop tick, then CAN will end the workflow # before the update handler is notified that the result future has completed. 
# See https://github.com/temporalio/features/issues/481 await asyncio.sleep(0) # Let update handler complete print("CAN") - return workflow.continue_as_new(args=[self.queue.serialize()]) + return workflow.continue_as_new() # Note: handler must be async if we are both enqueuing, and returning an update result # => We could add SDK APIs to manually complete updates. @@ -69,7 +56,7 @@ async def add_task(self, arg: Arg) -> Result: # See https://github.com/temporalio/features/issues/400 await workflow.wait_condition(lambda: hasattr(self, "queue")) fut = asyncio.Future[Result]() - self.queue.put_nowait((arg, fut)) # Note: update validation gates enqueue + self.queue.append((arg, fut)) # Note: update validation gates enqueue return await fut async def process_task(self, arg): From 59d0c694bac08af024c1cfad790e762983487871 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 23 May 2024 08:49:08 -0400 Subject: [PATCH 12/41] Cleanup --- update/serialized_handling_of_n_messages.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/update/serialized_handling_of_n_messages.py b/update/serialized_handling_of_n_messages.py index 1ef00a53..cf087f13 100644 --- a/update/serialized_handling_of_n_messages.py +++ b/update/serialized_handling_of_n_messages.py @@ -1,5 +1,6 @@ import asyncio import logging +from asyncio import Future from collections import deque from datetime import timedelta @@ -34,7 +35,7 @@ class MessageProcessor: @workflow.run async def run(self): - self.queue = deque[tuple[Arg, asyncio.Future[Result]]]() + self.queue = deque[tuple[Arg, Future[Result]]]() while True: await workflow.wait_condition(lambda: len(self.queue) > 0) while self.queue: @@ -55,7 +56,7 @@ async def add_task(self, arg: Arg) -> Result: # Footgun: handler must wait for workflow initialization # See https://github.com/temporalio/features/issues/400 await workflow.wait_condition(lambda: hasattr(self, "queue")) - fut = asyncio.Future[Result]() + fut = Future[Result]() self.queue.append((arg, fut)) # Note: update validation gates enqueue return await fut From cccaf6e0fa887cb21b6906e9d610351f64aa1910 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 23 May 2024 08:53:37 -0400 Subject: [PATCH 13/41] Queue can be created before workflow start --- update/serialized_handling_of_n_messages.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/update/serialized_handling_of_n_messages.py b/update/serialized_handling_of_n_messages.py index cf087f13..7979771e 100644 --- a/update/serialized_handling_of_n_messages.py +++ b/update/serialized_handling_of_n_messages.py @@ -33,9 +33,11 @@ @workflow.defn class MessageProcessor: + def __init__(self): + self.queue = deque[tuple[Arg, Future[Result]]]() + @workflow.run async def run(self): - self.queue = deque[tuple[Arg, Future[Result]]]() while True: await workflow.wait_condition(lambda: len(self.queue) > 0) while self.queue: @@ -53,9 +55,9 @@ async def run(self): # => We could add SDK APIs to manually complete updates. 
@workflow.update async def add_task(self, arg: Arg) -> Result: - # Footgun: handler must wait for workflow initialization + # Footgun: handler may need to wait for workflow initialization after CAN # See https://github.com/temporalio/features/issues/400 - await workflow.wait_condition(lambda: hasattr(self, "queue")) + # await workflow.wait_condition(lambda: hasattr(self, "queue")) fut = Future[Result]() self.queue.append((arg, fut)) # Note: update validation gates enqueue return await fut From 4ba0d009c8e6ef26c95c9e34668a0fce68bd7b06 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 23 May 2024 09:02:58 -0400 Subject: [PATCH 14/41] Cleanup --- tests/update/serialized_handling_of_n_messages.py | 2 +- update/serialized_handling_of_n_messages.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/update/serialized_handling_of_n_messages.py b/tests/update/serialized_handling_of_n_messages.py index 86af008d..5c78af37 100644 --- a/tests/update/serialized_handling_of_n_messages.py +++ b/tests/update/serialized_handling_of_n_messages.py @@ -28,7 +28,7 @@ async def test_continue_as_new_doesnt_lose_updates(client: Client): MessageProcessor.run, id=str(uuid.uuid4()), task_queue=tq ) update_requests = [ - UpdateRequest(wf, MessageProcessor.add_task, i) for i in range(10) + UpdateRequest(wf, MessageProcessor.process_message, i) for i in range(10) ] for req in update_requests: await req.wait_until_admitted() diff --git a/update/serialized_handling_of_n_messages.py b/update/serialized_handling_of_n_messages.py index 7979771e..d174c4bf 100644 --- a/update/serialized_handling_of_n_messages.py +++ b/update/serialized_handling_of_n_messages.py @@ -42,7 +42,7 @@ async def run(self): await workflow.wait_condition(lambda: len(self.queue) > 0) while self.queue: arg, fut = self.queue.popleft() - fut.set_result(await self.process_task(arg)) + fut.set_result(await self.execute_processing_task(arg)) if workflow.info().is_continue_as_new_suggested(): # Footgun: If we don't let the event loop tick, then CAN will end the workflow # before the update handler is notified that the result future has completed. @@ -54,7 +54,7 @@ async def run(self): # Note: handler must be async if we are both enqueuing, and returning an update result # => We could add SDK APIs to manually complete updates. @workflow.update - async def add_task(self, arg: Arg) -> Result: + async def process_message(self, arg: Arg) -> Result: # Footgun: handler may need to wait for workflow initialization after CAN # See https://github.com/temporalio/features/issues/400 # await workflow.wait_condition(lambda: hasattr(self, "queue")) @@ -62,7 +62,9 @@ async def add_task(self, arg: Arg) -> Result: self.queue.append((arg, fut)) # Note: update validation gates enqueue return await fut - async def process_task(self, arg): + async def execute_processing_task(self, arg): + # The purpose of the two activities and the result string format is to permit checks that + # the activities of different tasks do not interleave. 
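+        # For example (hypothetical values): a result of "arg 3-result-7-8" shows that the two
+        # get_current_time activities for this task returned the consecutive values 7 and 8,
+        # i.e. no other task's activities ran in between them.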
t1, t2 = [ await workflow.execute_activity( get_current_time, start_to_close_timeout=timedelta(seconds=10) @@ -85,7 +87,7 @@ async def get_current_time() -> int: async def app(wf: WorkflowHandle): for i in range(20): print(f"app(): sending update {i}") - result = await wf.execute_update(MessageProcessor.add_task, f"arg {i}") + result = await wf.execute_update(MessageProcessor.process_message, f"arg {i}") print(f"app(): {result}") From 2988e447beec7115dfdf2a2d6405cb3d1d517e56 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 23 May 2024 09:12:49 -0400 Subject: [PATCH 15/41] cleanup --- update/serialized_handling_of_n_messages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/update/serialized_handling_of_n_messages.py b/update/serialized_handling_of_n_messages.py index d174c4bf..4c9a0d5c 100644 --- a/update/serialized_handling_of_n_messages.py +++ b/update/serialized_handling_of_n_messages.py @@ -62,7 +62,7 @@ async def process_message(self, arg: Arg) -> Result: self.queue.append((arg, fut)) # Note: update validation gates enqueue return await fut - async def execute_processing_task(self, arg): + async def execute_processing_task(self, arg: Arg) -> Result: # The purpose of the two activities and the result string format is to permit checks that # the activities of different tasks do not interleave. t1, t2 = [ From 62f58487bd4e9f89fb8479420c415a7b82c1e854 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 23 May 2024 11:23:15 -0400 Subject: [PATCH 16/41] Add explanatory comment --- update/avoid_races_between_handler_and_main_coroutine.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/update/avoid_races_between_handler_and_main_coroutine.py b/update/avoid_races_between_handler_and_main_coroutine.py index 86d5f0b3..5e4ee208 100644 --- a/update/avoid_races_between_handler_and_main_coroutine.py +++ b/update/avoid_races_between_handler_and_main_coroutine.py @@ -6,6 +6,13 @@ from temporalio.client import Client, WorkflowHandle from temporalio.worker import Worker +# Problem: You have the buggy workflow below. You need to fix it so that the workflow state is no +# longer garbled due to interleaving of the handler with the main workflow. +# +# Solution 1: Use a sync handler and process the message in the main workflow coroutine. +# +# Solution 2: Await a custom “handler complete” condition. 
+ class WorkflowBase: def __init__(self) -> None: From 6bba635fd89712377641dda472836f79dc01f5e7 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 23 May 2024 13:57:52 -0400 Subject: [PATCH 17/41] Clarify and abstract --- ...aces_between_handler_and_main_coroutine.py | 58 +++++++++++++------ 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/update/avoid_races_between_handler_and_main_coroutine.py b/update/avoid_races_between_handler_and_main_coroutine.py index 5e4ee208..486e3251 100644 --- a/update/avoid_races_between_handler_and_main_coroutine.py +++ b/update/avoid_races_between_handler_and_main_coroutine.py @@ -18,7 +18,7 @@ class WorkflowBase: def __init__(self) -> None: self.letters = [] - async def get_letters(self, text: str): + async def do_multiple_async_tasks_that_mutate_workflow_state(self, text: str): for i in range(len(text)): letter = await workflow.execute_activity( get_letter, @@ -38,60 +38,79 @@ class AccumulateLettersIncorrect(WorkflowBase): def __init__(self) -> None: super().__init__() self.handler_started = False + self.handler_finished = False @workflow.run async def run(self) -> str: await workflow.wait_condition(lambda: self.handler_started) - await self.get_letters( + await self.do_multiple_async_tasks_that_mutate_workflow_state( "world!" ) # Bug: handler and main wf are now interleaving - await workflow.wait_condition(lambda: len(self.letters) == len("Hello world!")) + await workflow.wait_condition(lambda: self.handler_finished) return "".join(self.letters) @workflow.update - async def put_letters(self, text: str): + async def update_that_does_multiple_async_tasks_that_mutate_workflow_state( + self, text: str + ): self.handler_started = True - await self.get_letters(text) + await self.do_multiple_async_tasks_that_mutate_workflow_state(text) + self.handler_finished = True @workflow.defn class AccumulateLettersCorrect1(WorkflowBase): + """ + Solution 1: sync handler enqueues work; splice work into the main wf coroutine so that it cannot + interleave with work of main wf coroutine. + """ + def __init__(self) -> None: super().__init__() self.handler_text = asyncio.Future[str]() + self.handler_finished = False @workflow.run async def run(self) -> str: - handler_text = await self.handler_text - await self.get_letters(handler_text) - await self.get_letters("world!") - await workflow.wait_condition(lambda: len(self.letters) == len("Hello world!")) + handler_input = await self.handler_text + await self.do_multiple_async_tasks_that_mutate_workflow_state(handler_input) + await self.do_multiple_async_tasks_that_mutate_workflow_state("world!") + await workflow.wait_condition(lambda: self.handler_finished) return "".join(self.letters) # Note: sync handler @workflow.update - def put_letters(self, text: str): + def update_that_does_multiple_async_tasks_that_mutate_workflow_state( + self, text: str + ): self.handler_text.set_result(text) + self.handler_finished = True @workflow.defn class AccumulateLettersCorrect2(WorkflowBase): + """ + Solution 2: async handler notifies when complete; main wf coroutine waits for this to avoid + interleaving its own work. 
+ """ + def __init__(self) -> None: super().__init__() - self.handler_complete = False + self.handler_finished = False @workflow.run async def run(self) -> str: - await workflow.wait_condition(lambda: self.handler_complete) - await self.get_letters("world!") - await workflow.wait_condition(lambda: len(self.letters) == len("Hello world!")) + await workflow.wait_condition(lambda: self.handler_finished) + await self.do_multiple_async_tasks_that_mutate_workflow_state("world!") return "".join(self.letters) @workflow.update - async def put_letters(self, text: str): - await self.get_letters(text) - self.handler_complete = True + async def update_that_does_multiple_async_tasks_that_mutate_workflow_state( + self, text: str + ): + await self.do_multiple_async_tasks_that_mutate_workflow_state(text) + self.handler_finished = True @activity.defn @@ -100,7 +119,10 @@ async def get_letter(text: str, i: int) -> str: async def app(wf: WorkflowHandle): - await wf.execute_update(AccumulateLettersCorrect1.put_letters, args=["Hello "]) + await wf.execute_update( + AccumulateLettersCorrect1.update_that_does_multiple_async_tasks_that_mutate_workflow_state, + args=["Hello "], + ) print(await wf.result()) From 59e53a51d4705bb4fa28a14bd2efdbd6dc3944bd Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 23 May 2024 17:20:04 -0400 Subject: [PATCH 18/41] Clarify and abstract II --- ...aces_between_handler_and_main_coroutine.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/update/avoid_races_between_handler_and_main_coroutine.py b/update/avoid_races_between_handler_and_main_coroutine.py index 486e3251..97e7f8f3 100644 --- a/update/avoid_races_between_handler_and_main_coroutine.py +++ b/update/avoid_races_between_handler_and_main_coroutine.py @@ -29,7 +29,7 @@ async def do_multiple_async_tasks_that_mutate_workflow_state(self, text: str): @workflow.defn -class AccumulateLettersIncorrect(WorkflowBase): +class AvoidHandlerAndWorkflowInterleavingIncorrect(WorkflowBase): """ This workflow implementation is incorrect: the handler execution interleaves with the main workflow coroutine. @@ -60,7 +60,7 @@ async def update_that_does_multiple_async_tasks_that_mutate_workflow_state( @workflow.defn -class AccumulateLettersCorrect1(WorkflowBase): +class AvoidHandlerAndWorkflowInterleavingCorrect1(WorkflowBase): """ Solution 1: sync handler enqueues work; splice work into the main wf coroutine so that it cannot interleave with work of main wf coroutine. @@ -89,7 +89,7 @@ def update_that_does_multiple_async_tasks_that_mutate_workflow_state( @workflow.defn -class AccumulateLettersCorrect2(WorkflowBase): +class AvoidHandlerAndWorkflowInterleavingCorrect2(WorkflowBase): """ Solution 2: async handler notifies when complete; main wf coroutine waits for this to avoid interleaving its own work. 
@@ -120,7 +120,7 @@ async def get_letter(text: str, i: int) -> str: async def app(wf: WorkflowHandle): await wf.execute_update( - AccumulateLettersCorrect1.update_that_does_multiple_async_tasks_that_mutate_workflow_state, + AvoidHandlerAndWorkflowInterleavingCorrect1.update_that_does_multiple_async_tasks_that_mutate_workflow_state, args=["Hello "], ) print(await wf.result()) @@ -133,16 +133,16 @@ async def main(): client, task_queue="tq", workflows=[ - AccumulateLettersIncorrect, - AccumulateLettersCorrect1, - AccumulateLettersCorrect2, + AvoidHandlerAndWorkflowInterleavingIncorrect, + AvoidHandlerAndWorkflowInterleavingCorrect1, + AvoidHandlerAndWorkflowInterleavingCorrect2, ], activities=[get_letter], ): for wf in [ - AccumulateLettersIncorrect, - AccumulateLettersCorrect1, - AccumulateLettersCorrect2, + AvoidHandlerAndWorkflowInterleavingIncorrect, + AvoidHandlerAndWorkflowInterleavingCorrect1, + AvoidHandlerAndWorkflowInterleavingCorrect2, ]: handle = await client.start_workflow( wf.run, From dcb8f38223a1d2cfee38ac6523ae9664c62d8052 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Thu, 23 May 2024 19:45:49 -0700 Subject: [PATCH 19/41] Atomic Message Handlers w/ Stateful Workflow sample --- ...message_handlers_with_stateful_workflow.py | 193 ++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 update/atomic_message_handlers_with_stateful_workflow.py diff --git a/update/atomic_message_handlers_with_stateful_workflow.py b/update/atomic_message_handlers_with_stateful_workflow.py new file mode 100644 index 00000000..8abdfd92 --- /dev/null +++ b/update/atomic_message_handlers_with_stateful_workflow.py @@ -0,0 +1,193 @@ +import asyncio +from datetime import timedelta +import logging +from typing import Dict, List, Optional + +from temporalio import activity, common, workflow +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker + +@activity.defn +async def allocate_nodes_to_job(nodes: List[int], task_name: str) -> List[int]: + print(f"Assigning nodes {nodes} to job {task_name}") + await asyncio.sleep(0.1) + +@activity.defn +async def deallocate_nodes_for_job(nodes: List[int], task_name: str) -> List[int]: + print(f"Deallocating nodes {nodes} from job {task_name}") + await asyncio.sleep(0.1) + +@activity.defn +async def find_bad_nodes(nodes: List[int]) -> List[int]: + await asyncio.sleep(0.1) + bad_nodes = [n for n in nodes if n % 5 == 0] + print(f"Found bad nodes: {bad_nodes}") + return bad_nodes + +# This samples shows off +# - Making signal and update handlers only operate when the workflow is within a certain state +# (here between cluster_started and cluster_shutdown) +# - Using a lock to protect shared state shared by the workflow and its signal and update handlers +# interleaving writes +# - Running start_workflow with an initializer signal that you want to run before anything else. 
+@workflow.defn +class ClusterManager: + def __init__(self) -> None: + self.cluster_started = False + self.cluster_shutdown = False + self.nodes_lock = asyncio.Lock() + + @workflow.signal + async def start_cluster(self): + self.cluster_started = True + self.nodes : Dict[Optional[str]] = dict([(k, None) for k in range(25)]) + workflow.logger.info("Cluster started") + + @workflow.signal + async def shutdown_cluster(self): + await workflow.wait_condition(lambda: self.cluster_started) + self.cluster_shutdown = True + workflow.logger.info("Cluster shut down") + + @workflow.update + async def allocate_n_nodes_to_job(self, task_name: str, num_nodes: int, ) -> List[int]: + await workflow.wait_condition(lambda: self.cluster_started) + assert not self.cluster_shutdown + + await self.nodes_lock.acquire() + try: + unassigned_nodes = [k for k, v in self.nodes.items() if v is None] + if len(unassigned_nodes) < num_nodes: + raise ValueError(f"Cannot allocate {num_nodes} nodes; have only {len(unassigned_nodes)} available") + assigned_nodes = unassigned_nodes[:num_nodes] + await self._allocate_nodes_to_job(assigned_nodes, task_name) + return assigned_nodes + finally: + self.nodes_lock.release() + + + async def _allocate_nodes_to_job(self, assigned_nodes: List[int], task_name: str) -> List[int]: + await workflow.execute_activity( + allocate_nodes_to_job, args=[assigned_nodes, task_name], start_to_close_timeout=timedelta(seconds=10) + ) + for node in assigned_nodes: + self.nodes[node] = task_name + + + @workflow.update + async def delete_job(self, task_name: str) -> str: + await workflow.wait_condition(lambda: self.cluster_started) + assert not self.cluster_shutdown + await self.nodes_lock.acquire() + try: + nodes_to_free = [k for k, v in self.nodes.items() if v == task_name] + await self._deallocate_nodes_for_job(nodes_to_free, task_name) + return "Done" + finally: + self.nodes_lock.release() + + async def _deallocate_nodes_for_job(self, nodes_to_free: List[int], task_name: str) -> List[int]: + await workflow.execute_activity( + deallocate_nodes_for_job, args=[nodes_to_free, task_name], start_to_close_timeout=timedelta(seconds=10) + ) + for node in nodes_to_free: + self.nodes[node] = None + + + @workflow.update + async def resize_job(self, task_name: str, new_size: int) -> List[int]: + await workflow.wait_condition(lambda: self.cluster_started) + assert not self.cluster_shutdown + await self.nodes_lock.acquire() + try: + allocated_nodes = [k for k, v in self.nodes.items() if v == task_name] + delta = new_size - len(allocated_nodes) + if delta == 0: + return allocated_nodes + elif delta > 0: + unassigned_nodes = [k for k, v in self.nodes.items() if v is None] + if len(unassigned_nodes) < delta: + raise ValueError(f"Cannot allocate {delta} nodes; have only {len(unassigned_nodes)} available") + nodes_to_assign = unassigned_nodes[:delta] + await self._allocate_nodes_to_job(nodes_to_assign, task_name) + return allocated_nodes + nodes_to_assign + else: + nodes_to_deallocate = allocated_nodes[delta:] + await self._deallocate_nodes_for_job(nodes_to_deallocate, task_name) + return list(filter(lambda x: x not in nodes_to_deallocate, allocated_nodes)) + finally: + self.nodes_lock.release() + + async def perform_health_checks(self): + await self.nodes_lock.acquire() + try: + assigned_nodes = [k for k, v in self.nodes.items() if v is not None] + bad_nodes = await workflow.execute_activity(find_bad_nodes, assigned_nodes, start_to_close_timeout=timedelta(seconds=10)) + for node in bad_nodes: + self.nodes[node] = "BAD!" 
+ finally: + self.nodes_lock.release() + + @workflow.run + async def run(self): + await workflow.wait_condition(lambda: self.cluster_started) + + while True: + try: + await workflow.wait_condition(lambda: self.cluster_shutdown, timeout=timedelta(seconds=1)) + except asyncio.TimeoutError: + pass + await self.perform_health_checks() + + # Now we can start allocating jobs to nodes + await workflow.wait_condition(lambda: self.cluster_shutdown) + + +async def do_cluster_lifecycle(wf: WorkflowHandle): + + allocation_updates = [] + for i in range(6): + allocation_updates.append(wf.execute_update(ClusterManager.allocate_n_nodes_to_job, args=[f"task-{i}", 2])) + await asyncio.gather(*allocation_updates) + + resize_updates = [] + for i in range(6): + resize_updates.append(wf.execute_update(ClusterManager.resize_job, args=[f"task-{i}", 4])) + await asyncio.gather(*resize_updates) + + deletion_updates = [] + for i in range(6): + deletion_updates.append(wf.execute_update(ClusterManager.delete_job, f"task-{i}")) + await asyncio.gather(*deletion_updates) + + await wf.signal(ClusterManager.shutdown_cluster) + print("Cluster shut down") + + + +async def main(): + client = await Client.connect("localhost:7233") + + async with Worker( + client, + task_queue="tq", + workflows=[ClusterManager], + activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], + ): + wf = await client.start_workflow( + ClusterManager.run, + id="wid", + task_queue="tq", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + start_signal='start_cluster', + + ) + await do_cluster_lifecycle(wf) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + asyncio.run(main()) + + + \ No newline at end of file From d5c1e51a432535eb57c7b17b27820f8791b0e8fa Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Fri, 24 May 2024 04:42:27 -0400 Subject: [PATCH 20/41] Rename variables task -> job for consistency with API --- ...message_handlers_with_stateful_workflow.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/update/atomic_message_handlers_with_stateful_workflow.py b/update/atomic_message_handlers_with_stateful_workflow.py index 8abdfd92..c26ddc0a 100644 --- a/update/atomic_message_handlers_with_stateful_workflow.py +++ b/update/atomic_message_handlers_with_stateful_workflow.py @@ -8,13 +8,13 @@ from temporalio.worker import Worker @activity.defn -async def allocate_nodes_to_job(nodes: List[int], task_name: str) -> List[int]: - print(f"Assigning nodes {nodes} to job {task_name}") +async def allocate_nodes_to_job(nodes: List[int], job_name: str) -> List[int]: + print(f"Assigning nodes {nodes} to job {job_name}") await asyncio.sleep(0.1) @activity.defn -async def deallocate_nodes_for_job(nodes: List[int], task_name: str) -> List[int]: - print(f"Deallocating nodes {nodes} from job {task_name}") +async def deallocate_nodes_for_job(nodes: List[int], job_name: str) -> List[int]: + print(f"Deallocating nodes {nodes} from job {job_name}") await asyncio.sleep(0.1) @activity.defn @@ -50,7 +50,7 @@ async def shutdown_cluster(self): workflow.logger.info("Cluster shut down") @workflow.update - async def allocate_n_nodes_to_job(self, task_name: str, num_nodes: int, ) -> List[int]: + async def allocate_n_nodes_to_job(self, job_name: str, num_nodes: int, ) -> List[int]: await workflow.wait_condition(lambda: self.cluster_started) assert not self.cluster_shutdown @@ -60,47 +60,47 @@ async def allocate_n_nodes_to_job(self, task_name: str, num_nodes: int, ) -> Lis if 
len(unassigned_nodes) < num_nodes: raise ValueError(f"Cannot allocate {num_nodes} nodes; have only {len(unassigned_nodes)} available") assigned_nodes = unassigned_nodes[:num_nodes] - await self._allocate_nodes_to_job(assigned_nodes, task_name) + await self._allocate_nodes_to_job(assigned_nodes, job_name) return assigned_nodes finally: self.nodes_lock.release() - - async def _allocate_nodes_to_job(self, assigned_nodes: List[int], task_name: str) -> List[int]: + + async def _allocate_nodes_to_job(self, assigned_nodes: List[int], job_name: str) -> List[int]: await workflow.execute_activity( - allocate_nodes_to_job, args=[assigned_nodes, task_name], start_to_close_timeout=timedelta(seconds=10) + allocate_nodes_to_job, args=[assigned_nodes, job_name], start_to_close_timeout=timedelta(seconds=10) ) for node in assigned_nodes: - self.nodes[node] = task_name + self.nodes[node] = job_name @workflow.update - async def delete_job(self, task_name: str) -> str: + async def delete_job(self, job_name: str) -> str: await workflow.wait_condition(lambda: self.cluster_started) assert not self.cluster_shutdown await self.nodes_lock.acquire() try: - nodes_to_free = [k for k, v in self.nodes.items() if v == task_name] - await self._deallocate_nodes_for_job(nodes_to_free, task_name) + nodes_to_free = [k for k, v in self.nodes.items() if v == job_name] + await self._deallocate_nodes_for_job(nodes_to_free, job_name) return "Done" finally: self.nodes_lock.release() - async def _deallocate_nodes_for_job(self, nodes_to_free: List[int], task_name: str) -> List[int]: + async def _deallocate_nodes_for_job(self, nodes_to_free: List[int], job_name: str) -> List[int]: await workflow.execute_activity( - deallocate_nodes_for_job, args=[nodes_to_free, task_name], start_to_close_timeout=timedelta(seconds=10) + deallocate_nodes_for_job, args=[nodes_to_free, job_name], start_to_close_timeout=timedelta(seconds=10) ) for node in nodes_to_free: self.nodes[node] = None @workflow.update - async def resize_job(self, task_name: str, new_size: int) -> List[int]: + async def resize_job(self, job_name: str, new_size: int) -> List[int]: await workflow.wait_condition(lambda: self.cluster_started) assert not self.cluster_shutdown await self.nodes_lock.acquire() try: - allocated_nodes = [k for k, v in self.nodes.items() if v == task_name] + allocated_nodes = [k for k, v in self.nodes.items() if v == job_name] delta = new_size - len(allocated_nodes) if delta == 0: return allocated_nodes @@ -109,11 +109,11 @@ async def resize_job(self, task_name: str, new_size: int) -> List[int]: if len(unassigned_nodes) < delta: raise ValueError(f"Cannot allocate {delta} nodes; have only {len(unassigned_nodes)} available") nodes_to_assign = unassigned_nodes[:delta] - await self._allocate_nodes_to_job(nodes_to_assign, task_name) + await self._allocate_nodes_to_job(nodes_to_assign, job_name) return allocated_nodes + nodes_to_assign else: nodes_to_deallocate = allocated_nodes[delta:] - await self._deallocate_nodes_for_job(nodes_to_deallocate, task_name) + await self._deallocate_nodes_for_job(nodes_to_deallocate, job_name) return list(filter(lambda x: x not in nodes_to_deallocate, allocated_nodes)) finally: self.nodes_lock.release() @@ -147,17 +147,17 @@ async def do_cluster_lifecycle(wf: WorkflowHandle): allocation_updates = [] for i in range(6): - allocation_updates.append(wf.execute_update(ClusterManager.allocate_n_nodes_to_job, args=[f"task-{i}", 2])) - await asyncio.gather(*allocation_updates) + 
allocation_updates.append(wf.execute_update(ClusterManager.allocate_n_nodes_to_job, args=[f"job-{i}", 2])) + await asyncio.gather(*allocation_updates) resize_updates = [] for i in range(6): - resize_updates.append(wf.execute_update(ClusterManager.resize_job, args=[f"task-{i}", 4])) + resize_updates.append(wf.execute_update(ClusterManager.resize_job, args=[f"job-{i}", 4])) await asyncio.gather(*resize_updates) deletion_updates = [] for i in range(6): - deletion_updates.append(wf.execute_update(ClusterManager.delete_job, f"task-{i}")) + deletion_updates.append(wf.execute_update(ClusterManager.delete_job, f"job-{i}")) await asyncio.gather(*deletion_updates) await wf.signal(ClusterManager.shutdown_cluster) From c0b1dc47c49b0bc2aa1add4d7bee831358f18e32 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Fri, 24 May 2024 04:49:41 -0400 Subject: [PATCH 21/41] Fix type annotations --- .../atomic_message_handlers_with_stateful_workflow.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/update/atomic_message_handlers_with_stateful_workflow.py b/update/atomic_message_handlers_with_stateful_workflow.py index c26ddc0a..fd6c1d8c 100644 --- a/update/atomic_message_handlers_with_stateful_workflow.py +++ b/update/atomic_message_handlers_with_stateful_workflow.py @@ -8,12 +8,12 @@ from temporalio.worker import Worker @activity.defn -async def allocate_nodes_to_job(nodes: List[int], job_name: str) -> List[int]: +async def allocate_nodes_to_job(nodes: List[int], job_name: str): print(f"Assigning nodes {nodes} to job {job_name}") await asyncio.sleep(0.1) @activity.defn -async def deallocate_nodes_for_job(nodes: List[int], job_name: str) -> List[int]: +async def deallocate_nodes_for_job(nodes: List[int], job_name: str): print(f"Deallocating nodes {nodes} from job {job_name}") await asyncio.sleep(0.1) @@ -40,7 +40,7 @@ def __init__(self) -> None: @workflow.signal async def start_cluster(self): self.cluster_started = True - self.nodes : Dict[Optional[str]] = dict([(k, None) for k in range(25)]) + self.nodes : Dict[int, Optional[str]] = dict([(k, None) for k in range(25)]) workflow.logger.info("Cluster started") @workflow.signal @@ -66,7 +66,7 @@ async def allocate_n_nodes_to_job(self, job_name: str, num_nodes: int, ) -> List self.nodes_lock.release() - async def _allocate_nodes_to_job(self, assigned_nodes: List[int], job_name: str) -> List[int]: + async def _allocate_nodes_to_job(self, assigned_nodes: List[int], job_name: str): await workflow.execute_activity( allocate_nodes_to_job, args=[assigned_nodes, job_name], start_to_close_timeout=timedelta(seconds=10) ) @@ -86,7 +86,7 @@ async def delete_job(self, job_name: str) -> str: finally: self.nodes_lock.release() - async def _deallocate_nodes_for_job(self, nodes_to_free: List[int], job_name: str) -> List[int]: + async def _deallocate_nodes_for_job(self, nodes_to_free: List[int], job_name: str): await workflow.execute_activity( deallocate_nodes_for_job, args=[nodes_to_free, job_name], start_to_close_timeout=timedelta(seconds=10) ) From 965f1f9698902f431f071a7c0fc4a429ddd958de Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Fri, 24 May 2024 05:01:25 -0400 Subject: [PATCH 22/41] Add some documentation of ClusterManager --- ...message_handlers_with_stateful_workflow.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/update/atomic_message_handlers_with_stateful_workflow.py b/update/atomic_message_handlers_with_stateful_workflow.py index fd6c1d8c..9fff00a9 100644 --- 
a/update/atomic_message_handlers_with_stateful_workflow.py +++ b/update/atomic_message_handlers_with_stateful_workflow.py @@ -32,6 +32,26 @@ async def find_bad_nodes(nodes: List[int]) -> List[int]: # - Running start_workflow with an initializer signal that you want to run before anything else. @workflow.defn class ClusterManager: + """ + A workflow to manage a cluster of compute nodes. + + The cluster is transitioned between operational and non-operational states by two signals: + `start_cluster` and `shutdown_cluster`. + + While it is active, the workflow maintains a mapping of nodes to assigned job, and exposes the + following API (implemented as updates): + + - allocate_n_nodes_to_job: attempt to find n free nodes, assign them to the job; return assigned node IDs + - delete_job: unassign any nodes assigned to job; return a success acknowledgement + - resize_job: assign or unassign nodes as needed; return assigned node IDs + + An API call made while the cluster is non-operational will block until the cluster is + operational. + + If an API call is made while another is in progress, it will block until all other thus-enqueued + requests are complete. + """ + def __init__(self) -> None: self.cluster_started = False self.cluster_shutdown = False @@ -51,6 +71,9 @@ async def shutdown_cluster(self): @workflow.update async def allocate_n_nodes_to_job(self, job_name: str, num_nodes: int, ) -> List[int]: + """ + Attempt to find n free nodes, assign them to the job, return assigned node IDs. + """ await workflow.wait_condition(lambda: self.cluster_started) assert not self.cluster_shutdown @@ -76,6 +99,9 @@ async def _allocate_nodes_to_job(self, assigned_nodes: List[int], job_name: str) @workflow.update async def delete_job(self, job_name: str) -> str: + """ + Unassign any nodes assigned to job; return a success acknowledgement. + """ await workflow.wait_condition(lambda: self.cluster_started) assert not self.cluster_shutdown await self.nodes_lock.acquire() @@ -96,6 +122,9 @@ async def _deallocate_nodes_for_job(self, nodes_to_free: List[int], job_name: st @workflow.update async def resize_job(self, job_name: str, new_size: int) -> List[int]: + """ + Assign or unassign nodes as needed; return assigned node IDs. + """ await workflow.wait_condition(lambda: self.cluster_started) assert not self.cluster_shutdown await self.nodes_lock.acquire() From ee47f1c9c0ba2a8d42d9fbf06a18571287c5cd42 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Fri, 24 May 2024 08:34:23 -0400 Subject: [PATCH 23/41] Add @cretz's example solution --- ...serialized_handling_of_n_messages_cretz.py | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 update/serialized_handling_of_n_messages_cretz.py diff --git a/update/serialized_handling_of_n_messages_cretz.py b/update/serialized_handling_of_n_messages_cretz.py new file mode 100644 index 00000000..64bf2bf9 --- /dev/null +++ b/update/serialized_handling_of_n_messages_cretz.py @@ -0,0 +1,48 @@ +from collections import deque +from datetime import timedelta +from typing import Optional + +from temporalio import workflow + +# !!! +# This version requires update to complete with result and won't CAN until after +# everything is done +# !!! 
+ + +class UpdateTask: + def __init__(self, arg: str) -> None: + self.arg = arg + self.result: Optional[str] = None + self.returned = False + + +@workflow.defn +class MessageProcessor: + def __init__(self) -> None: + self.queue: deque[UpdateTask] = deque() + + @workflow.run + async def run(self) -> None: + while not workflow.info().is_continue_as_new_suggested() or len(self.queue) > 0: + await workflow.wait_condition(lambda: len(self.queue) > 0) + await self.process_task(self.queue.popleft()) + workflow.continue_as_new() + + @workflow.update + async def do_task(self, arg: str) -> str: + # Add task and wait on result + task = UpdateTask(arg) + try: + self.queue.append(task) + await workflow.wait_condition(lambda: task.result is not None) + assert task.result + return task.result + finally: + task.returned = True + + async def process_task(self, task: UpdateTask) -> None: + task.result = await workflow.execute_activity( + "some_activity", task.arg, start_to_close_timeout=timedelta(seconds=10) + ) + await workflow.wait_condition(lambda: task.returned) From 2c82d05e3a41374d9f3ce06b5e2d8ce1faa1daa9 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Fri, 24 May 2024 09:17:19 -0400 Subject: [PATCH 24/41] New APIs --- ...dling_of_n_messages_cretz_with_new_apis.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 update/serialized_handling_of_n_messages_cretz_with_new_apis.py diff --git a/update/serialized_handling_of_n_messages_cretz_with_new_apis.py b/update/serialized_handling_of_n_messages_cretz_with_new_apis.py new file mode 100644 index 00000000..a60cf0c0 --- /dev/null +++ b/update/serialized_handling_of_n_messages_cretz_with_new_apis.py @@ -0,0 +1,41 @@ +from collections import deque +from datetime import timedelta +from typing import Optional + +from temporalio import workflow + +# !!! +# This version requires update to complete with result and won't CAN until after +# everything is done +# !!! + + +class UpdateTask: + def __init__(self, arg: str, update_id: str) -> None: + self.arg = arg + self.update_id: str + self.result: Optional[str] = None + + +@workflow.defn +class MessageProcessor: + def __init__(self) -> None: + self.queue: deque[UpdateTask] = deque() + + @workflow.run + async def run(self) -> None: + while not workflow.info().is_continue_as_new_suggested() or len(self.queue) > 0: + await workflow.wait_condition(lambda: len(self.queue) > 0) + await self.process_task(self.queue.popleft()) + workflow.continue_as_new() + + @workflow.update + async def do_task(self, arg: str) -> str: + task = UpdateTask(arg, update_id=workflow.current_update_id()) + self.queue.append(task) + await workflow.wait_condition(lambda: task.result is not None) + return task.result + + async def process_task(self, task: UpdateTask) -> None: + # execute_activity(...) 
+ await workflow.wait_condition(lambda: workflow.update_completed(task.update_id)) From 516648461a380e6a4fe4fac31ec8a9a1b194a8fb Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Wed, 29 May 2024 13:06:23 -0400 Subject: [PATCH 25/41] Delete non-current work --- hello/hello_update.py | 3 +- .../serialized_handling_of_n_messages.py | 95 ----------- ...aces_between_handler_and_main_coroutine.py | 158 ------------------ update/order_handling_of_n_messages.py | 116 ------------- update/order_handling_of_two_messages.py | 79 --------- ...rder_handling_relative_to_main_workflow.py | 83 --------- update/serialized_handling_of_n_messages.py | 114 ------------- ...serialized_handling_of_n_messages_cretz.py | 48 ------ ...dling_of_n_messages_cretz_with_new_apis.py | 41 ----- 9 files changed, 1 insertion(+), 736 deletions(-) delete mode 100644 tests/update/serialized_handling_of_n_messages.py delete mode 100644 update/avoid_races_between_handler_and_main_coroutine.py delete mode 100644 update/order_handling_of_n_messages.py delete mode 100644 update/order_handling_of_two_messages.py delete mode 100644 update/order_handling_relative_to_main_workflow.py delete mode 100644 update/serialized_handling_of_n_messages.py delete mode 100644 update/serialized_handling_of_n_messages_cretz.py delete mode 100644 update/serialized_handling_of_n_messages_cretz_with_new_apis.py diff --git a/hello/hello_update.py b/hello/hello_update.py index bf439b15..111d95b1 100644 --- a/hello/hello_update.py +++ b/hello/hello_update.py @@ -1,6 +1,6 @@ import asyncio -from temporalio import common, workflow +from temporalio import workflow from temporalio.client import Client from temporalio.worker import Worker @@ -37,7 +37,6 @@ async def main(): GreetingWorkflow.run, id="hello-update-workflow-id", task_queue="update-workflow-task-queue", - id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, ) # Perform the update for GreetingWorkflow diff --git a/tests/update/serialized_handling_of_n_messages.py b/tests/update/serialized_handling_of_n_messages.py deleted file mode 100644 index 5c78af37..00000000 --- a/tests/update/serialized_handling_of_n_messages.py +++ /dev/null @@ -1,95 +0,0 @@ -import asyncio -import logging -import uuid -from dataclasses import dataclass -from unittest.mock import patch - -import temporalio.api.common.v1 -import temporalio.api.enums.v1 -import temporalio.api.update.v1 -import temporalio.api.workflowservice.v1 -from temporalio.client import Client, WorkflowHandle -from temporalio.worker import Worker -from temporalio.workflow import UpdateMethodMultiParam - -from update.serialized_handling_of_n_messages import ( - MessageProcessor, - Result, - get_current_time, -) - - -async def test_continue_as_new_doesnt_lose_updates(client: Client): - with patch( - "temporalio.workflow.Info.is_continue_as_new_suggested", return_value=True - ): - tq = str(uuid.uuid4()) - wf = await client.start_workflow( - MessageProcessor.run, id=str(uuid.uuid4()), task_queue=tq - ) - update_requests = [ - UpdateRequest(wf, MessageProcessor.process_message, i) for i in range(10) - ] - for req in update_requests: - await req.wait_until_admitted() - - async with Worker( - client, - task_queue=tq, - workflows=[MessageProcessor], - activities=[get_current_time], - ): - for req in update_requests: - update_result = await req.task - assert update_result.startswith(req.expected_result_prefix()) - - -@dataclass -class UpdateRequest: - wf_handle: WorkflowHandle - update: UpdateMethodMultiParam - sequence_number: int - - def 
__post_init__(self): - self.task = asyncio.Task[Result]( - self.wf_handle.execute_update(self.update, args=[self.arg], id=self.id) - ) - - async def wait_until_admitted(self): - while True: - try: - return await self._poll_update_non_blocking() - except Exception as err: - logging.warning(err) - - async def _poll_update_non_blocking(self): - req = temporalio.api.workflowservice.v1.PollWorkflowExecutionUpdateRequest( - namespace=self.wf_handle._client.namespace, - update_ref=temporalio.api.update.v1.UpdateRef( - workflow_execution=temporalio.api.common.v1.WorkflowExecution( - workflow_id=self.wf_handle.id, - run_id="", - ), - update_id=self.id, - ), - identity=self.wf_handle._client.identity, - ) - res = await self.wf_handle._client.workflow_service.poll_workflow_execution_update( - req - ) - # TODO: @cretz how do we work with these raw proto objects? - assert "stage: UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ADMITTED" in str(res) - - @property - def arg(self) -> str: - return str(self.sequence_number) - - @property - def id(self) -> str: - return str(self.sequence_number) - - def expected_result_prefix(self) -> str: - # TODO: Currently the server does not send updates to the worker in order of admission When - # this is fixed (https://github.com/temporalio/temporal/pull/5831), we can make a stronger - # assertion about the activity numbers used to construct each result. - return f"{self.arg}-result" diff --git a/update/avoid_races_between_handler_and_main_coroutine.py b/update/avoid_races_between_handler_and_main_coroutine.py deleted file mode 100644 index 97e7f8f3..00000000 --- a/update/avoid_races_between_handler_and_main_coroutine.py +++ /dev/null @@ -1,158 +0,0 @@ -import asyncio -import logging -from datetime import timedelta - -from temporalio import activity, common, workflow -from temporalio.client import Client, WorkflowHandle -from temporalio.worker import Worker - -# Problem: You have the buggy workflow below. You need to fix it so that the workflow state is no -# longer garbled due to interleaving of the handler with the main workflow. -# -# Solution 1: Use a sync handler and process the message in the main workflow coroutine. -# -# Solution 2: Await a custom “handler complete” condition. - - -class WorkflowBase: - def __init__(self) -> None: - self.letters = [] - - async def do_multiple_async_tasks_that_mutate_workflow_state(self, text: str): - for i in range(len(text)): - letter = await workflow.execute_activity( - get_letter, - args=[text, i], - start_to_close_timeout=timedelta(seconds=10), - ) - self.letters.append(letter) - - -@workflow.defn -class AvoidHandlerAndWorkflowInterleavingIncorrect(WorkflowBase): - """ - This workflow implementation is incorrect: the handler execution interleaves with the main - workflow coroutine. - """ - - def __init__(self) -> None: - super().__init__() - self.handler_started = False - self.handler_finished = False - - @workflow.run - async def run(self) -> str: - await workflow.wait_condition(lambda: self.handler_started) - await self.do_multiple_async_tasks_that_mutate_workflow_state( - "world!" 
- ) # Bug: handler and main wf are now interleaving - - await workflow.wait_condition(lambda: self.handler_finished) - return "".join(self.letters) - - @workflow.update - async def update_that_does_multiple_async_tasks_that_mutate_workflow_state( - self, text: str - ): - self.handler_started = True - await self.do_multiple_async_tasks_that_mutate_workflow_state(text) - self.handler_finished = True - - -@workflow.defn -class AvoidHandlerAndWorkflowInterleavingCorrect1(WorkflowBase): - """ - Solution 1: sync handler enqueues work; splice work into the main wf coroutine so that it cannot - interleave with work of main wf coroutine. - """ - - def __init__(self) -> None: - super().__init__() - self.handler_text = asyncio.Future[str]() - self.handler_finished = False - - @workflow.run - async def run(self) -> str: - handler_input = await self.handler_text - await self.do_multiple_async_tasks_that_mutate_workflow_state(handler_input) - await self.do_multiple_async_tasks_that_mutate_workflow_state("world!") - await workflow.wait_condition(lambda: self.handler_finished) - return "".join(self.letters) - - # Note: sync handler - @workflow.update - def update_that_does_multiple_async_tasks_that_mutate_workflow_state( - self, text: str - ): - self.handler_text.set_result(text) - self.handler_finished = True - - -@workflow.defn -class AvoidHandlerAndWorkflowInterleavingCorrect2(WorkflowBase): - """ - Solution 2: async handler notifies when complete; main wf coroutine waits for this to avoid - interleaving its own work. - """ - - def __init__(self) -> None: - super().__init__() - self.handler_finished = False - - @workflow.run - async def run(self) -> str: - await workflow.wait_condition(lambda: self.handler_finished) - await self.do_multiple_async_tasks_that_mutate_workflow_state("world!") - return "".join(self.letters) - - @workflow.update - async def update_that_does_multiple_async_tasks_that_mutate_workflow_state( - self, text: str - ): - await self.do_multiple_async_tasks_that_mutate_workflow_state(text) - self.handler_finished = True - - -@activity.defn -async def get_letter(text: str, i: int) -> str: - return text[i] - - -async def app(wf: WorkflowHandle): - await wf.execute_update( - AvoidHandlerAndWorkflowInterleavingCorrect1.update_that_does_multiple_async_tasks_that_mutate_workflow_state, - args=["Hello "], - ) - print(await wf.result()) - - -async def main(): - client = await Client.connect("localhost:7233") - - async with Worker( - client, - task_queue="tq", - workflows=[ - AvoidHandlerAndWorkflowInterleavingIncorrect, - AvoidHandlerAndWorkflowInterleavingCorrect1, - AvoidHandlerAndWorkflowInterleavingCorrect2, - ], - activities=[get_letter], - ): - for wf in [ - AvoidHandlerAndWorkflowInterleavingIncorrect, - AvoidHandlerAndWorkflowInterleavingCorrect1, - AvoidHandlerAndWorkflowInterleavingCorrect2, - ]: - handle = await client.start_workflow( - wf.run, - id="wid", - task_queue="tq", - id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, - ) - await app(handle) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - asyncio.run(main()) diff --git a/update/order_handling_of_n_messages.py b/update/order_handling_of_n_messages.py deleted file mode 100644 index 3169b74e..00000000 --- a/update/order_handling_of_n_messages.py +++ /dev/null @@ -1,116 +0,0 @@ -import asyncio -import logging -import random -from typing import Optional - -from temporalio import common, workflow -from temporalio.client import Client, WorkflowHandle -from temporalio.worker import 
Worker - -Payload = str -SerializedQueueState = tuple[int, list[tuple[int, Payload]]] - - -class OrderedQueue: - def __init__(self): - self._futures = {} - self.head = 0 - self.lock = asyncio.Lock() - - def add(self, item: Payload, position: int): - fut = self._futures.setdefault(position, asyncio.Future()) - if not fut.done(): - fut.set_result(item) - else: - workflow.logger.warn(f"duplicate message for position {position}") - - async def next(self) -> Payload: - async with self.lock: - payload = await self._futures.setdefault(self.head, asyncio.Future()) - # Note: user must delete the payload to avoid unbounded memory usage - del self._futures[self.head] - self.head += 1 - return payload - - def serialize(self) -> SerializedQueueState: - payloads = [(i, fut.result()) for i, fut in self._futures.items() if fut.done()] - return (self.head, payloads) - - # This is bad, but AFAICS it's the best we can do currently until we have a workflow init - # functionality in all SDKs (https://github.com/temporalio/features/issues/400). Currently the - # problem is: if a signal/update handler is sync, then it cannot wait for anything in the main - # wf coroutine. After CAN, a message may arrive in the first WFT, but the sync handler cannot - # wait for wf state to be initialized. So we are forced to update an *existing* queue with the - # carried-over state. - def update_from_serialized_state(self, serialized_state: SerializedQueueState): - head, payloads = serialized_state - self.head = head - for i, p in payloads: - if i in self._futures: - workflow.logger.error(f"duplicate message {i} encountered when deserializing state carried over CAN") - else: - self._futures[i] = resolved_future(p) - - -def resolved_future[X](x: X) -> asyncio.Future[X]: - fut = asyncio.Future[X]() - fut.set_result(x) - return fut - - - -@workflow.defn -class MessageProcessor: - def __init__(self) -> None: - self.queue = OrderedQueue() - - @workflow.run - async def run(self, serialized_queue_state: Optional[SerializedQueueState] = None): - # Initialize workflow state after CAN. Note that handler is sync, so it cannot wait for - # workflow initialization. 
- if serialized_queue_state: - self.queue.update_from_serialized_state(serialized_queue_state) - while True: - workflow.logger.info(f"waiting for msg {self.queue.head + 1}") - payload = await self.queue.next() - workflow.logger.info(payload) - if workflow.info().is_continue_as_new_suggested(): - workflow.logger.info("CAN") - workflow.continue_as_new(args=[self.queue.serialize()]) - - # Note: sync handler - @workflow.update - def process_message(self, sequence_number: int, payload: Payload): - self.queue.add(payload, sequence_number) - - -async def app(wf: WorkflowHandle): - sequence_numbers = list(range(100)) - random.shuffle(sequence_numbers) - for i in sequence_numbers: - print(f"sending update {i}") - await wf.execute_update( - MessageProcessor.process_message, args=[i, f"payload {i}"] - ) - - -async def main(): - client = await Client.connect("localhost:7233") - - async with Worker( - client, - task_queue="tq", - workflows=[MessageProcessor], - ): - wf = await client.start_workflow( - MessageProcessor.run, - id="wid", - task_queue="tq", - id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, - ) - await asyncio.gather(app(wf), wf.result()) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - asyncio.run(main()) diff --git a/update/order_handling_of_two_messages.py b/update/order_handling_of_two_messages.py deleted file mode 100644 index 0aa0234a..00000000 --- a/update/order_handling_of_two_messages.py +++ /dev/null @@ -1,79 +0,0 @@ -import asyncio -import logging -from dataclasses import dataclass -from typing import Optional - -from temporalio import common, workflow -from temporalio.client import Client, WorkflowHandle -from temporalio.worker import Worker - - -# Shows how to make a pair of update or signal handlers run in a certain order even -# if they are received out of order. 
-@workflow.defn -class WashAndDryCycle: - - @dataclass - class WashResults: - num_items: int - - @dataclass - class DryResults: - num_items: int - moisture_level: int - - def __init__(self) -> None: - self._wash_results: Optional[WashAndDryCycle.WashResults] = None - self._dry_results: Optional[WashAndDryCycle.DryResults] = None - - @workflow.run - async def run(self): - await workflow.wait_condition(lambda: self._dry_results is not None) - assert self._dry_results - workflow.logger.info( - f"Finished washing and drying {self._dry_results.num_items} items, moisture level: {self._dry_results.moisture_level}" - ) - - @workflow.update - async def wash(self, num_items) -> WashResults: - self._wash_results = WashAndDryCycle.WashResults(num_items=num_items) - return self._wash_results - - @workflow.update - async def dry(self) -> DryResults: - await workflow.wait_condition(lambda: self._wash_results is not None) - assert self._wash_results - self._dry_results = WashAndDryCycle.DryResults( - num_items=self._wash_results.num_items, moisture_level=3 - ) - return self._dry_results - - -async def app(wf: WorkflowHandle): - # In normal operation, wash comes before dry, but here we simulate out-of-order receipt of messages - await asyncio.gather( - wf.execute_update(WashAndDryCycle.dry), - wf.execute_update(WashAndDryCycle.wash, 10), - ) - - -async def main(): - client = await Client.connect("localhost:7233") - - async with Worker( - client, - task_queue="tq", - workflows=[WashAndDryCycle], - ): - handle = await client.start_workflow( - WashAndDryCycle.run, - id="wid", - task_queue="tq", - id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, - ) - await app(handle) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - asyncio.run(main()) diff --git a/update/order_handling_relative_to_main_workflow.py b/update/order_handling_relative_to_main_workflow.py deleted file mode 100644 index 44baa300..00000000 --- a/update/order_handling_relative_to_main_workflow.py +++ /dev/null @@ -1,83 +0,0 @@ -import asyncio -import logging -from datetime import timedelta - -from temporalio import activity, common, workflow -from temporalio.client import Client, WorkflowHandle -from temporalio.worker import Worker - - -@workflow.defn -class WashAndDryCycle: - - def __init__(self) -> None: - self.has_detergent = False - self.wash_complete = False - self.non_dryables_removed = False - self.dry_complete = False - - @workflow.run - async def run(self): - await workflow.execute_activity( - add_detergent, start_to_close_timeout=timedelta(seconds=10) - ) - self.has_detergent = True - await workflow.wait_condition(lambda: self.wash_complete) - await workflow.execute_activity( - remove_non_dryables, start_to_close_timeout=timedelta(seconds=10) - ) - self.non_dryables_removed = True - await workflow.wait_condition(lambda: self.dry_complete) - - @workflow.update - async def wash(self): - await workflow.wait_condition(lambda: self.has_detergent) - self.wash_complete = True - workflow.logger.info("washing") - - @workflow.update - async def dry(self): - await workflow.wait_condition( - lambda: self.wash_complete and self.non_dryables_removed - ) - self.dry_complete = True - workflow.logger.info("drying") - - -@activity.defn -async def add_detergent(): - print("adding detergent") - - -@activity.defn -async def remove_non_dryables(): - print("removing non-dryables") - - -async def app(wf: WorkflowHandle): - await asyncio.gather( - wf.execute_update(WashAndDryCycle.dry), 
wf.execute_update(WashAndDryCycle.wash) - ) - - -async def main(): - client = await Client.connect("localhost:7233") - - async with Worker( - client, - task_queue="tq", - workflows=[WashAndDryCycle], - activities=[add_detergent, remove_non_dryables], - ): - handle = await client.start_workflow( - WashAndDryCycle.run, - id="wid", - task_queue="tq", - id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, - ) - await app(handle) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - asyncio.run(main()) diff --git a/update/serialized_handling_of_n_messages.py b/update/serialized_handling_of_n_messages.py deleted file mode 100644 index 4c9a0d5c..00000000 --- a/update/serialized_handling_of_n_messages.py +++ /dev/null @@ -1,114 +0,0 @@ -import asyncio -import logging -from asyncio import Future -from collections import deque -from datetime import timedelta - -from temporalio import activity, common, workflow -from temporalio.client import Client, WorkflowHandle -from temporalio.worker import Worker - -Arg = str -Result = str - -# Problem: -# ------- -# - Your workflow receives an unbounded number of updates. -# - Each update must be processed by calling two activities. -# - The next update may not start processing until the previous has completed. - -# Solution: -# -------- -# Enqueue updates, and process items from the queue in a single coroutine (the main workflow -# coroutine). - -# Discussion: -# ---------- -# The queue is used because Temporal's async update & signal handlers will interleave if they -# contain multiple yield points. An alternative would be to use standard async handler functions, -# with handling being done with an asyncio.Lock held. The queue approach would be necessary if we -# need to process in an order other than arrival. - - -@workflow.defn -class MessageProcessor: - - def __init__(self): - self.queue = deque[tuple[Arg, Future[Result]]]() - - @workflow.run - async def run(self): - while True: - await workflow.wait_condition(lambda: len(self.queue) > 0) - while self.queue: - arg, fut = self.queue.popleft() - fut.set_result(await self.execute_processing_task(arg)) - if workflow.info().is_continue_as_new_suggested(): - # Footgun: If we don't let the event loop tick, then CAN will end the workflow - # before the update handler is notified that the result future has completed. - # See https://github.com/temporalio/features/issues/481 - await asyncio.sleep(0) # Let update handler complete - print("CAN") - return workflow.continue_as_new() - - # Note: handler must be async if we are both enqueuing, and returning an update result - # => We could add SDK APIs to manually complete updates. - @workflow.update - async def process_message(self, arg: Arg) -> Result: - # Footgun: handler may need to wait for workflow initialization after CAN - # See https://github.com/temporalio/features/issues/400 - # await workflow.wait_condition(lambda: hasattr(self, "queue")) - fut = Future[Result]() - self.queue.append((arg, fut)) # Note: update validation gates enqueue - return await fut - - async def execute_processing_task(self, arg: Arg) -> Result: - # The purpose of the two activities and the result string format is to permit checks that - # the activities of different tasks do not interleave. 
- t1, t2 = [ - await workflow.execute_activity( - get_current_time, start_to_close_timeout=timedelta(seconds=10) - ) - for _ in range(2) - ] - return f"{arg}-result-{t1}-{t2}" - - -time = 0 - - -@activity.defn -async def get_current_time() -> int: - global time - time += 1 - return time - - -async def app(wf: WorkflowHandle): - for i in range(20): - print(f"app(): sending update {i}") - result = await wf.execute_update(MessageProcessor.process_message, f"arg {i}") - print(f"app(): {result}") - - -async def main(): - client = await Client.connect("localhost:7233") - - async with Worker( - client, - task_queue="tq", - workflows=[MessageProcessor], - activities=[get_current_time], - ): - wf = await client.start_workflow( - MessageProcessor.run, - id="wid", - task_queue="tq", - id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, - ) - await asyncio.gather(app(wf), wf.result()) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - asyncio.run(main()) diff --git a/update/serialized_handling_of_n_messages_cretz.py b/update/serialized_handling_of_n_messages_cretz.py deleted file mode 100644 index 64bf2bf9..00000000 --- a/update/serialized_handling_of_n_messages_cretz.py +++ /dev/null @@ -1,48 +0,0 @@ -from collections import deque -from datetime import timedelta -from typing import Optional - -from temporalio import workflow - -# !!! -# This version requires update to complete with result and won't CAN until after -# everything is done -# !!! - - -class UpdateTask: - def __init__(self, arg: str) -> None: - self.arg = arg - self.result: Optional[str] = None - self.returned = False - - -@workflow.defn -class MessageProcessor: - def __init__(self) -> None: - self.queue: deque[UpdateTask] = deque() - - @workflow.run - async def run(self) -> None: - while not workflow.info().is_continue_as_new_suggested() or len(self.queue) > 0: - await workflow.wait_condition(lambda: len(self.queue) > 0) - await self.process_task(self.queue.popleft()) - workflow.continue_as_new() - - @workflow.update - async def do_task(self, arg: str) -> str: - # Add task and wait on result - task = UpdateTask(arg) - try: - self.queue.append(task) - await workflow.wait_condition(lambda: task.result is not None) - assert task.result - return task.result - finally: - task.returned = True - - async def process_task(self, task: UpdateTask) -> None: - task.result = await workflow.execute_activity( - "some_activity", task.arg, start_to_close_timeout=timedelta(seconds=10) - ) - await workflow.wait_condition(lambda: task.returned) diff --git a/update/serialized_handling_of_n_messages_cretz_with_new_apis.py b/update/serialized_handling_of_n_messages_cretz_with_new_apis.py deleted file mode 100644 index a60cf0c0..00000000 --- a/update/serialized_handling_of_n_messages_cretz_with_new_apis.py +++ /dev/null @@ -1,41 +0,0 @@ -from collections import deque -from datetime import timedelta -from typing import Optional - -from temporalio import workflow - -# !!! -# This version requires update to complete with result and won't CAN until after -# everything is done -# !!! 
- - -class UpdateTask: - def __init__(self, arg: str, update_id: str) -> None: - self.arg = arg - self.update_id: str - self.result: Optional[str] = None - - -@workflow.defn -class MessageProcessor: - def __init__(self) -> None: - self.queue: deque[UpdateTask] = deque() - - @workflow.run - async def run(self) -> None: - while not workflow.info().is_continue_as_new_suggested() or len(self.queue) > 0: - await workflow.wait_condition(lambda: len(self.queue) > 0) - await self.process_task(self.queue.popleft()) - workflow.continue_as_new() - - @workflow.update - async def do_task(self, arg: str) -> str: - task = UpdateTask(arg, update_id=workflow.current_update_id()) - self.queue.append(task) - await workflow.wait_condition(lambda: task.result is not None) - return task.result - - async def process_task(self, task: UpdateTask) -> None: - # execute_activity(...) - await workflow.wait_condition(lambda: workflow.update_completed(task.update_id)) From efd8a52ba5042868d8edac95b011f95658f0ff52 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sun, 26 May 2024 03:46:16 -0400 Subject: [PATCH 26/41] Job runner notes --- update/job_runner_notes.py | 409 +++++++++++++++++++++++++++++++++++++ 1 file changed, 409 insertions(+) create mode 100644 update/job_runner_notes.py diff --git a/update/job_runner_notes.py b/update/job_runner_notes.py new file mode 100644 index 00000000..e3b7999a --- /dev/null +++ b/update/job_runner_notes.py @@ -0,0 +1,409 @@ +import asyncio +from asyncio import Future +from collections import deque +from datetime import datetime, timedelta +import logging +from sys import version +from typing import Any, Iterator, Optional, TypedDict, Union + +from attr import dataclass +from temporalio import common, workflow, activity +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker + + +############################################################################### +## SDK internals +## +@dataclass +class Message: + type: str # maps to a handler if a matching handler exists + args: tuple[Any] # deserialized arg payloads + # Can expose other update/signal metadata + received_at: datetime + client_identity: str + + async def handle(self): + await workflow.call_handler(self.type, *self.args) + + +@dataclass +class Signal(Message): + pass + + +@dataclass +class Update(Message): + id: str # the update ID + + +# +# Raw incoming workflow events +# +# The incoming event stream contains newly delivered updates and signals. Perhaps +# ContinueAsNewSuggested could also be an event. +# +# We could have "UpdateReceived" as the incoming event, with UpdateAccepted/UpdateRejected emitted +# later. However, An alternative is that the SDK executes the update validators immediately, before +# the user has a chance to interact with the event stream. We'll adopt that version for now, since +# it involves fewer event types. +@dataclass +class SignalReceived: + signal: Signal + + +@dataclass +class UpdateRejected: + update: Update + + +@dataclass +class UpdateAccepted: + update: Update + + +@dataclass +class ContinueAsNewSuggested: + pass + + +IncomingWorkflowEvent = Union[ + UpdateAccepted, UpdateRejected, SignalReceived, ContinueAsNewSuggested +] + + +def workflow_incoming_event_stream() -> Iterator[IncomingWorkflowEvent]: ... 
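+# The stub above stands in for a generator that the SDK would supply; the `setattr` call below
+# monkey-patches it onto the `workflow` module so that code further down in this prototype can
+# call `workflow.incoming_event_stream()` as though the proposed API already existed.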
+ + +setattr(workflow, "incoming_event_stream", workflow_incoming_event_stream) +# +# Other events that are emitted automatically by a running workflow +# + + +@dataclass +class UpdateCompleted: + pass + + +class SignalHandlerReturned: + # This is tangential to work on updates. Introducing this event would introduce a new concept of + # "signal-processing finished", which could be used to help users wait for signal processing to + # finish before CAN / workflow return. The idea is that the event would be emitted when a signal + # handler returns. + pass + + +# +# Events that users can add to the event stream +# + + +class TimerFired: + pass + + +class CustomFutureResolved: + pass + + +EventStream = Iterator[ + Union[ + SignalReceived, + UpdateRejected, + UpdateAccepted, + ContinueAsNewSuggested, + UpdateCompleted, + SignalHandlerReturned, + TimerFired, + CustomFutureResolved, + ] +] + + +class Selector: + def get_event_stream(self) -> EventStream: ... + + +# By default, a workflow behaves as if it is doing the following: +def handle_incoming_events(): + for ev in workflow.incoming_event_stream(): + match ev: + case SignalReceived(signal): + asyncio.create_task(signal.handle()) + case UpdateAccepted(update): + asyncio.create_task(update.handle()) + + +HandlerType = str +UpdateID = str + + +# This class is just a toy prototype: this functionality will be implemented on WorkflowInstance in +# the SDK. +class WorkflowInternals: + def __init__(self): + self.incoming_event_stream = deque[IncomingWorkflowEvent]() + self.ready_to_execute_handler: dict[UpdateID, Future[None]] = {} + + def accept_update(self, update: Update): + # This will be done by the SDK prior to invoking handler, around + # https://github.com/temporalio/sdk-python/blob/11a97d1ab2ebfe8c973bf396b1e14077ec611e52/temporalio/worker/_workflow_instance.py#L506 + self.incoming_event_stream.append(UpdateAccepted(update)) + + # By default, handlers are ready to execute immediately after the update is accepted. + self.ready_to_execute_handler[update.id] = resolved_future(None) + + async def _wait_until_ready_to_execute(self, update_id: UpdateID): + await self.ready_to_execute_handler[update_id] + + +def resolved_future[X](result: X) -> Future[X]: + fut = Future() + fut.set_result(result) + return fut + + +workflow_internals = WorkflowInternals() + +############################################################################### +## +## User's code +## + +# A user may want to handle the event stream differently. + +# workflow API design must make the following two things convenient for users: +# - DrainBeforeWorkflowCompletion +# - NoInterleaving, optionally with CustomOrder + + +def make_event_stream() -> EventStream: + selector = Selector() + return selector.get_event_stream() + + +event_stream = make_event_stream() + +# +JobId = int +ClusterSlot = str + + +class Job(TypedDict): + update_id: UpdateID + depends_on: list[UpdateID] + after_time: Optional[int] + name: str + run: str + python_interpreter_version: Optional[str] + + +class JobOutput(TypedDict): + status: int + stdout: str + stderr: str + + +@workflow.defn +class JobRunner: + + def __init__(self): + self.jobs: dict[JobId, Job] = {} + + # Design notes + # ------------ + # Updates always have handler functions, and an update handler function is still just a normal + # handler function: it implements the handling logic and the return value. + # + # Every workflow will have an underlying event stream. 
By default, this yields the following + # events: + # + # - UpdateRejected + # - UpdateAccepted (UpdateEnqueued) + # - UpdateDequeued + # - UpdateCompleted + # + # The SDK will provide a default implementation of the event stream, looking something like this: + + # + # The handler will be invoked automatically by the SDK when the underlying event stream yields + # an UpdateDequeued event for this update ID. The user does not have to know anything about + # this: by default, handlers are executed before other workflow code, in order of update + # arrival. + + # The SDK is capable of automatically draining the event stream before workflow return / CAN, + # including an option for this draining to result in serial execution of the handlers (i.e. + # waiting for all async work scheduled by the handler to complete before the next handler is + # invoked, and not allowing the workflow to complete until all such work is complete.) Default + # behavior TBD. + # + # The event stream thus provides a way for users to wait until a specific message handler has + # completed, or until all message handlers have completed. These can be exposed via convenient + # `wait_for_X()` APIs, rather than interacting with the raw event stream + # + # Furthermore, users can optionally implement the EventStream themselves. This gives them + # precise control over the ordering of handler invocation with respect to all other workflow + # events (e.g. other update completions, and custom futures such as timers and + # workflow.wait_condition). + # + # TODO: does handler invocation remain automatic on yielding Dequeue, or is that too magical. An + # alternative would be for users to be able to call update.handle() on an update object obtained + # from an event object yielded by the event stream. + + @workflow.update + async def run_shell_script_job(self, job: Job) -> JobOutput: + """ + To be executed in order dictated by job dependency graph (see `jobs.depends_on`) and not + before `job.after_time`. + """ + ## SDK internals: please pretend this is implemented in the SDK + await workflow_internals._wait_until_ready_to_execute(job["update_id"]) + ## + + if security_errors := await workflow.execute_activity( + run_shell_script_security_linter, + args=[job["run"]], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput(status=1, stdout="", stderr=security_errors) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + ) # SDK emits UpdateCompleted + return job_output + + @workflow.update + async def run_python_job(self, job: Job) -> JobOutput: + """ + To be executed in order dictated by job dependency graph (see `jobs.depends_on`) and not + before `job.after_time`. 
+ """ + ## SDK internals: please pretend this is implemented in the SDK + await workflow_internals._wait_until_ready_to_execute(job["update_id"]) + ## + + if not await workflow.execute_activity( + check_python_interpreter_version, + args=[job["python_interpreter_version"]], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput( + status=1, + stdout="", + stderr=f"Python interpreter version {version} is not available", + ) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + ) # SDK emits UpdateCompleted + return job_output + + @run_shell_script_job.validator + def validate_shell_script_job(self, job: Job): + ## SDK internals: please pretend this is implemented in the SDK + workflow_internals.accept_update( + Update( + type="run_shell_script_job", + args=(job,), + client_identity="some-client-id", + id=job["update_id"], + received_at=workflow.now(), + ) + ) + ## + + @run_python_job.validator + def validate_python_job(self, job: Job): + ## SDK internals: please pretend this is implemented in the SDK + workflow_internals.accept_update( + Update( + type="run_python_job", + args=(job,), + client_identity="some-client-id", + id=job["update_id"], + received_at=workflow.now(), + ) + ) + ## + + @workflow.run + async def run(self): + while not workflow.info().is_continue_as_new_suggested(): + await workflow.wait_condition(lambda: len(self.jobs) > 0) + workflow.continue_as_new() + + +@activity.defn +async def run_job(job: Job) -> JobOutput: + await asyncio.sleep(0.1) + stdout = f"Ran job {job["name"]} at {datetime.now()}" + print(stdout) + return JobOutput(status=0, stdout=stdout, stderr="") + + +@activity.defn +async def request_cluster_slot(job: Job) -> ClusterSlot: + await asyncio.sleep(0.1) + return "cluster-slot-token-abc123" + + +@activity.defn +async def run_shell_script_security_linter(code: str) -> str: + # The user's organization requires that all shell scripts pass an in-house linter that checks + # for shell scripting constructions deemed insecure. 
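+    # The sleep and empty return value below stand in for a real linter invocation; the
+    # `run_shell_script_job` handler treats an empty string as "no security errors found".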
+ await asyncio.sleep(0.1) + return "" + + +@activity.defn +async def check_python_interpreter_version(version: str) -> bool: + await asyncio.sleep(0.1) + version_is_available = True + return version_is_available + + +async def app(wf: WorkflowHandle): + job_1 = Job( + update_id="1", + depends_on=[], + after_time=None, + name="should-run-first", + run="echo 'Hello world 1!'", + python_interpreter_version=None, + ) + job_2 = Job( + update_id="2", + depends_on=["1"], + after_time=None, + name="should-run-second", + run="print('Hello world 2!')", + python_interpreter_version=None, + ) + job_2 = await wf.execute_update(JobRunner.run_python_job, job_2) + job_1 = await wf.execute_update(JobRunner.run_shell_script_job, job_1) + + +async def main(): + client = await Client.connect("localhost:7233") + async with Worker( + client, + task_queue="tq", + workflows=[JobRunner], + activities=[ + run_job, + run_shell_script_security_linter, + check_python_interpreter_version, + request_cluster_slot, + ], + ): + wf = await client.start_workflow( + JobRunner.run, + id="wid", + task_queue="tq", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + ) + await app(wf) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + asyncio.run(main()) From 00c77a80df0e4434d03001c74604f4ea5ef6e264 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Tue, 28 May 2024 14:07:55 -0400 Subject: [PATCH 27/41] Job runner base --- update/job_runner_I1.py | 146 ++++++++++++++++++++++++++++++++++++++ update/job_runner_I2.py | 146 ++++++++++++++++++++++++++++++++++++++ update/job_runner_base.py | 146 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 438 insertions(+) create mode 100644 update/job_runner_I1.py create mode 100644 update/job_runner_I2.py create mode 100644 update/job_runner_base.py diff --git a/update/job_runner_I1.py b/update/job_runner_I1.py new file mode 100644 index 00000000..47f0a513 --- /dev/null +++ b/update/job_runner_I1.py @@ -0,0 +1,146 @@ +import asyncio +from dataclasses import dataclass +from datetime import datetime, timedelta +import logging +from typing import Optional + +from temporalio import common, workflow, activity +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker + + +JobID = str + + +@dataclass +class Job: + id: JobID + depends_on: list[JobID] + after_time: Optional[int] + name: str + run: str + python_interpreter_version: Optional[str] + + +@dataclass +class JobOutput: + status: int + stdout: str + stderr: str + + +@workflow.defn +class JobRunner: + """ + Jobs must be executed in order dictated by job dependency graph (see `job.depends_on`) and + not before `job.after_time`. 
+ """ + + @workflow.run + async def run(self): + await workflow.wait_condition( + lambda: workflow.info().is_continue_as_new_suggested() + ) + workflow.continue_as_new() + + @workflow.update + async def run_shell_script_job(self, job: Job) -> JobOutput: + if security_errors := await workflow.execute_activity( + run_shell_script_security_linter, + args=[job.run], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput(status=1, stdout="", stderr=security_errors) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + ) + return job_output + + @workflow.update + async def run_python_job(self, job: Job) -> JobOutput: + if not await workflow.execute_activity( + check_python_interpreter_version, + args=[job.python_interpreter_version], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput( + status=1, + stdout="", + stderr=f"Python interpreter version {job.python_interpreter_version} is not available", + ) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + ) + return job_output + + +@activity.defn +async def run_job(job: Job) -> JobOutput: + await asyncio.sleep(0.1) + stdout = f"Ran job {job.name} at {datetime.now()}" + print(stdout) + return JobOutput(status=0, stdout=stdout, stderr="") + + +@activity.defn +async def run_shell_script_security_linter(code: str) -> str: + # The user's organization requires that all shell scripts pass an in-house linter that checks + # for shell scripting constructions deemed insecure. + await asyncio.sleep(0.1) + return "" + + +@activity.defn +async def check_python_interpreter_version(version: str) -> bool: + await asyncio.sleep(0.1) + version_is_available = True + return version_is_available + + +async def app(wf: WorkflowHandle): + job_1 = Job( + id="1", + depends_on=[], + after_time=None, + name="should-run-first", + run="echo 'Hello world 1!'", + python_interpreter_version=None, + ) + job_2 = Job( + id="2", + depends_on=["1"], + after_time=None, + name="should-run-second", + run="print('Hello world 2!')", + python_interpreter_version=None, + ) + await asyncio.gather( + wf.execute_update(JobRunner.run_python_job, job_2), + wf.execute_update(JobRunner.run_shell_script_job, job_1), + ) + + +async def main(): + client = await Client.connect("localhost:7233") + async with Worker( + client, + task_queue="tq", + workflows=[JobRunner], + activities=[ + run_job, + run_shell_script_security_linter, + check_python_interpreter_version, + ], + ): + wf = await client.start_workflow( + JobRunner.run, + id="wid", + task_queue="tq", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + ) + await app(wf) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + asyncio.run(main()) diff --git a/update/job_runner_I2.py b/update/job_runner_I2.py new file mode 100644 index 00000000..47f0a513 --- /dev/null +++ b/update/job_runner_I2.py @@ -0,0 +1,146 @@ +import asyncio +from dataclasses import dataclass +from datetime import datetime, timedelta +import logging +from typing import Optional + +from temporalio import common, workflow, activity +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker + + +JobID = str + + +@dataclass +class Job: + id: JobID + depends_on: list[JobID] + after_time: Optional[int] + name: str + run: str + python_interpreter_version: Optional[str] + + +@dataclass +class JobOutput: + status: int + stdout: str + stderr: str + + 
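+# `Job.run` carries the source to execute: a shell command when the job is submitted via
+# `run_shell_script_job`, or Python source when it is submitted via `run_python_job` (see the
+# example jobs constructed in `app` below).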
+@workflow.defn +class JobRunner: + """ + Jobs must be executed in order dictated by job dependency graph (see `job.depends_on`) and + not before `job.after_time`. + """ + + @workflow.run + async def run(self): + await workflow.wait_condition( + lambda: workflow.info().is_continue_as_new_suggested() + ) + workflow.continue_as_new() + + @workflow.update + async def run_shell_script_job(self, job: Job) -> JobOutput: + if security_errors := await workflow.execute_activity( + run_shell_script_security_linter, + args=[job.run], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput(status=1, stdout="", stderr=security_errors) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + ) + return job_output + + @workflow.update + async def run_python_job(self, job: Job) -> JobOutput: + if not await workflow.execute_activity( + check_python_interpreter_version, + args=[job.python_interpreter_version], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput( + status=1, + stdout="", + stderr=f"Python interpreter version {job.python_interpreter_version} is not available", + ) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + ) + return job_output + + +@activity.defn +async def run_job(job: Job) -> JobOutput: + await asyncio.sleep(0.1) + stdout = f"Ran job {job.name} at {datetime.now()}" + print(stdout) + return JobOutput(status=0, stdout=stdout, stderr="") + + +@activity.defn +async def run_shell_script_security_linter(code: str) -> str: + # The user's organization requires that all shell scripts pass an in-house linter that checks + # for shell scripting constructions deemed insecure. + await asyncio.sleep(0.1) + return "" + + +@activity.defn +async def check_python_interpreter_version(version: str) -> bool: + await asyncio.sleep(0.1) + version_is_available = True + return version_is_available + + +async def app(wf: WorkflowHandle): + job_1 = Job( + id="1", + depends_on=[], + after_time=None, + name="should-run-first", + run="echo 'Hello world 1!'", + python_interpreter_version=None, + ) + job_2 = Job( + id="2", + depends_on=["1"], + after_time=None, + name="should-run-second", + run="print('Hello world 2!')", + python_interpreter_version=None, + ) + await asyncio.gather( + wf.execute_update(JobRunner.run_python_job, job_2), + wf.execute_update(JobRunner.run_shell_script_job, job_1), + ) + + +async def main(): + client = await Client.connect("localhost:7233") + async with Worker( + client, + task_queue="tq", + workflows=[JobRunner], + activities=[ + run_job, + run_shell_script_security_linter, + check_python_interpreter_version, + ], + ): + wf = await client.start_workflow( + JobRunner.run, + id="wid", + task_queue="tq", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + ) + await app(wf) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + asyncio.run(main()) diff --git a/update/job_runner_base.py b/update/job_runner_base.py new file mode 100644 index 00000000..47f0a513 --- /dev/null +++ b/update/job_runner_base.py @@ -0,0 +1,146 @@ +import asyncio +from dataclasses import dataclass +from datetime import datetime, timedelta +import logging +from typing import Optional + +from temporalio import common, workflow, activity +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker + + +JobID = str + + +@dataclass +class Job: + id: JobID + depends_on: list[JobID] + 
after_time: Optional[int] + name: str + run: str + python_interpreter_version: Optional[str] + + +@dataclass +class JobOutput: + status: int + stdout: str + stderr: str + + +@workflow.defn +class JobRunner: + """ + Jobs must be executed in order dictated by job dependency graph (see `job.depends_on`) and + not before `job.after_time`. + """ + + @workflow.run + async def run(self): + await workflow.wait_condition( + lambda: workflow.info().is_continue_as_new_suggested() + ) + workflow.continue_as_new() + + @workflow.update + async def run_shell_script_job(self, job: Job) -> JobOutput: + if security_errors := await workflow.execute_activity( + run_shell_script_security_linter, + args=[job.run], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput(status=1, stdout="", stderr=security_errors) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + ) + return job_output + + @workflow.update + async def run_python_job(self, job: Job) -> JobOutput: + if not await workflow.execute_activity( + check_python_interpreter_version, + args=[job.python_interpreter_version], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput( + status=1, + stdout="", + stderr=f"Python interpreter version {job.python_interpreter_version} is not available", + ) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + ) + return job_output + + +@activity.defn +async def run_job(job: Job) -> JobOutput: + await asyncio.sleep(0.1) + stdout = f"Ran job {job.name} at {datetime.now()}" + print(stdout) + return JobOutput(status=0, stdout=stdout, stderr="") + + +@activity.defn +async def run_shell_script_security_linter(code: str) -> str: + # The user's organization requires that all shell scripts pass an in-house linter that checks + # for shell scripting constructions deemed insecure. 
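+    # This stub simulates the linter call; returning an empty string means no security errors
+    # were found, and any non-empty return value is surfaced to the caller as stderr.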
+ await asyncio.sleep(0.1) + return "" + + +@activity.defn +async def check_python_interpreter_version(version: str) -> bool: + await asyncio.sleep(0.1) + version_is_available = True + return version_is_available + + +async def app(wf: WorkflowHandle): + job_1 = Job( + id="1", + depends_on=[], + after_time=None, + name="should-run-first", + run="echo 'Hello world 1!'", + python_interpreter_version=None, + ) + job_2 = Job( + id="2", + depends_on=["1"], + after_time=None, + name="should-run-second", + run="print('Hello world 2!')", + python_interpreter_version=None, + ) + await asyncio.gather( + wf.execute_update(JobRunner.run_python_job, job_2), + wf.execute_update(JobRunner.run_shell_script_job, job_1), + ) + + +async def main(): + client = await Client.connect("localhost:7233") + async with Worker( + client, + task_queue="tq", + workflows=[JobRunner], + activities=[ + run_job, + run_shell_script_security_linter, + check_python_interpreter_version, + ], + ): + wf = await client.start_workflow( + JobRunner.run, + id="wid", + task_queue="tq", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + ) + await app(wf) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + asyncio.run(main()) From 16835afdb38b371f0cb86ac72e38912bbe2b82a8 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Tue, 28 May 2024 14:08:08 -0400 Subject: [PATCH 28/41] Job runner I1 --- update/job_runner_I1.py | 85 +++++++++++++++++++++++++++++------------ 1 file changed, 61 insertions(+), 24 deletions(-) diff --git a/update/job_runner_I1.py b/update/job_runner_I1.py index 47f0a513..05eb4646 100644 --- a/update/job_runner_I1.py +++ b/update/job_runner_I1.py @@ -36,42 +36,79 @@ class JobRunner: not before `job.after_time`. """ + def __init__(self) -> None: + self._pending_tasks = 0 + self.completed_tasks = set[JobID]() + self.handler_mutex = asyncio.Lock() + + def all_handlers_completed(self) -> bool: + # We are considering adding an API like `all_handlers_completed` to SDKs. We've added + # self._pending tasks to this workflow in lieu of it being built into the SDKs. 
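+        # A non-zero count means at least one update handler is still executing, so the main
+        # workflow coroutine should not continue-as-new yet.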
+ return not self._pending_tasks + @workflow.run async def run(self): await workflow.wait_condition( - lambda: workflow.info().is_continue_as_new_suggested() + lambda: ( + workflow.info().is_continue_as_new_suggested() + and self.all_handlers_completed() + ) ) workflow.continue_as_new() + def ready_to_execute(self, job: Job) -> bool: + if not set(job.depends_on) <= self.completed_tasks: + return False + if after_time := job.after_time: + if float(after_time) > workflow.now().timestamp(): + return False + return True + @workflow.update async def run_shell_script_job(self, job: Job) -> JobOutput: - if security_errors := await workflow.execute_activity( - run_shell_script_security_linter, - args=[job.run], - start_to_close_timeout=timedelta(seconds=10), - ): - return JobOutput(status=1, stdout="", stderr=security_errors) - job_output = await workflow.execute_activity( - run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) - ) - return job_output + self._pending_tasks += 1 + await workflow.wait_condition(lambda: self.ready_to_execute(job)) + await self.handler_mutex.acquire() + + try: + if security_errors := await workflow.execute_activity( + run_shell_script_security_linter, + args=[job.run], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput(status=1, stdout="", stderr=security_errors) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + ) + return job_output + finally: + # FIXME: unbounded memory usage + self.completed_tasks.add(job.id) + self.handler_mutex.release() + self._pending_tasks -= 1 @workflow.update async def run_python_job(self, job: Job) -> JobOutput: - if not await workflow.execute_activity( - check_python_interpreter_version, - args=[job.python_interpreter_version], - start_to_close_timeout=timedelta(seconds=10), - ): - return JobOutput( - status=1, - stdout="", - stderr=f"Python interpreter version {job.python_interpreter_version} is not available", + await workflow.wait_condition(lambda: self.ready_to_execute(job)) + await self.handler_mutex.acquire() + + try: + if not await workflow.execute_activity( + check_python_interpreter_version, + args=[job.python_interpreter_version], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput( + status=1, + stdout="", + stderr=f"Python interpreter version {job.python_interpreter_version} is not available", + ) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) ) - job_output = await workflow.execute_activity( - run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) - ) - return job_output + return job_output + finally: + self.handler_mutex.release() @activity.defn From 887b729db6a916c079656e78eec49bc5d01239cc Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Wed, 29 May 2024 05:28:24 -0400 Subject: [PATCH 29/41] Job runner I2 --- update/job_runner_I2.py | 86 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 4 deletions(-) diff --git a/update/job_runner_I2.py b/update/job_runner_I2.py index 47f0a513..921bdbfe 100644 --- a/update/job_runner_I2.py +++ b/update/job_runner_I2.py @@ -1,8 +1,10 @@ import asyncio +from collections import OrderedDict from dataclasses import dataclass from datetime import datetime, timedelta +from enum import Enum import logging -from typing import Optional +from typing import Awaitable, Callable, Optional from temporalio import common, workflow, activity from temporalio.client import Client, 
WorkflowHandle @@ -29,6 +31,19 @@ class JobOutput: stderr: str +class TaskStatus(Enum): + BLOCKED = 1 + UNBLOCKED = 2 + + +@dataclass +class Task: + input: Job + handler: Callable[["JobRunner", Job], Awaitable[JobOutput]] + status: TaskStatus = TaskStatus.BLOCKED + output: Optional[JobOutput] = None + + @workflow.defn class JobRunner: """ @@ -36,15 +51,73 @@ class JobRunner: not before `job.after_time`. """ + def __init__(self) -> None: + self.task_queue = OrderedDict[JobID, Task]() + self.completed_tasks = set[JobID]() + + def all_handlers_completed(self): + # We are considering adding an API like `all_handlers_completed` to SDKs. In this particular + # case, the user doesn't actually need the new API, since they are forced to track pending + # tasks in their queue implementation. + return not self.task_queue + + # Note some undesirable things: + # 1. The update handler functions have become generic enqueuers; the "real" handler functions + # are some other methods that don't have the @workflow.update decorator. + # 2. The update handler functions have to store a reference to the real handler in the queue. + # 3. The workflow `run` method is *much* more complicated and bug-prone here, compared to + # I1:WaitUntilReadyToExecuteHandler + @workflow.run async def run(self): - await workflow.wait_condition( - lambda: workflow.info().is_continue_as_new_suggested() - ) + """ + Process all tasks in the queue serially, in the main workflow coroutine. + """ + # Note: there are many mistakes a user will make while trying to implement this workflow. + while not ( + workflow.info().is_continue_as_new_suggested() + and self.all_handlers_completed() + ): + await workflow.wait_condition(lambda: bool(self.task_queue)) + for id, task in list(self.task_queue.items()): + if task.status == TaskStatus.UNBLOCKED: + await task.handler(self, task.input) + del self.task_queue[id] + self.completed_tasks.add(id) + for id, task in self.task_queue.items(): + if task.status == TaskStatus.BLOCKED and self.ready_to_execute( + task.input + ): + task.status = TaskStatus.UNBLOCKED workflow.continue_as_new() + def ready_to_execute(self, job: Job) -> bool: + if not set(job.depends_on) <= self.completed_tasks: + return False + if after_time := job.after_time: + if float(after_time) > workflow.now().timestamp(): + return False + return True + + async def _enqueue_job_and_wait_for_result( + self, job: Job, handler: Callable[["JobRunner", Job], Awaitable[JobOutput]] + ) -> JobOutput: + task = Task(job, handler) + self.task_queue[job.id] = task + await workflow.wait_condition(lambda: task.output is not None) + # Footgun: a user might well think that they can record task completion here, but in fact it + # deadlocks. 
+ # self.completed_tasks.add(job.id) + assert task.output + return task.output + @workflow.update async def run_shell_script_job(self, job: Job) -> JobOutput: + return await self._enqueue_job_and_wait_for_result( + job, JobRunner._actually_run_shell_script_job + ) + + async def _actually_run_shell_script_job(self, job: Job) -> JobOutput: if security_errors := await workflow.execute_activity( run_shell_script_security_linter, args=[job.run], @@ -58,6 +131,11 @@ async def run_shell_script_job(self, job: Job) -> JobOutput: @workflow.update async def run_python_job(self, job: Job) -> JobOutput: + return await self._enqueue_job_and_wait_for_result( + job, JobRunner._actually_run_python_job + ) + + async def _actually_run_python_job(self, job: Job) -> JobOutput: if not await workflow.execute_activity( check_python_interpreter_version, args=[job.python_interpreter_version], From 4f86b4cfe37eefda6225fd73a04bb12ce4e33b8d Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 30 May 2024 04:20:33 -0400 Subject: [PATCH 30/41] Modify I2 for native --- update/job_runner_I2.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/update/job_runner_I2.py b/update/job_runner_I2.py index 921bdbfe..b33a68c8 100644 --- a/update/job_runner_I2.py +++ b/update/job_runner_I2.py @@ -14,6 +14,11 @@ JobID = str +class JobStatus(Enum): + BLOCKED = 1 + UNBLOCKED = 2 + + @dataclass class Job: id: JobID @@ -22,6 +27,16 @@ class Job: name: str run: str python_interpreter_version: Optional[str] + # TODO: How to handle enums in dataclasses with Temporal's ser/de. + status_value: int = JobStatus.BLOCKED.value + + @property + def status(self): + return JobStatus(self.status_value) + + @status.setter + def status(self, status: JobStatus): + self.status_value = status.value @dataclass @@ -31,16 +46,10 @@ class JobOutput: stderr: str -class TaskStatus(Enum): - BLOCKED = 1 - UNBLOCKED = 2 - - @dataclass class Task: input: Job handler: Callable[["JobRunner", Job], Awaitable[JobOutput]] - status: TaskStatus = TaskStatus.BLOCKED output: Optional[JobOutput] = None @@ -80,15 +89,14 @@ async def run(self): ): await workflow.wait_condition(lambda: bool(self.task_queue)) for id, task in list(self.task_queue.items()): - if task.status == TaskStatus.UNBLOCKED: - await task.handler(self, task.input) + job = task.input + if job.status == JobStatus.UNBLOCKED: + await task.handler(self, job) del self.task_queue[id] self.completed_tasks.add(id) for id, task in self.task_queue.items(): - if task.status == TaskStatus.BLOCKED and self.ready_to_execute( - task.input - ): - task.status = TaskStatus.UNBLOCKED + if job.status == JobStatus.BLOCKED and self.ready_to_execute(job): + job.status = JobStatus.UNBLOCKED workflow.continue_as_new() def ready_to_execute(self, job: Job) -> bool: From a75fb69504ed32f513641efe680008a0bdc92fb7 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 30 May 2024 04:23:38 -0400 Subject: [PATCH 31/41] Job runner I2 native base --- update/job_runner_I2_native.py | 232 +++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 update/job_runner_I2_native.py diff --git a/update/job_runner_I2_native.py b/update/job_runner_I2_native.py new file mode 100644 index 00000000..b33a68c8 --- /dev/null +++ b/update/job_runner_I2_native.py @@ -0,0 +1,232 @@ +import asyncio +from collections import OrderedDict +from dataclasses import dataclass +from datetime import datetime, timedelta +from enum import Enum +import logging +from typing import Awaitable, 
Callable, Optional + +from temporalio import common, workflow, activity +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker + + +JobID = str + + +class JobStatus(Enum): + BLOCKED = 1 + UNBLOCKED = 2 + + +@dataclass +class Job: + id: JobID + depends_on: list[JobID] + after_time: Optional[int] + name: str + run: str + python_interpreter_version: Optional[str] + # TODO: How to handle enums in dataclasses with Temporal's ser/de. + status_value: int = JobStatus.BLOCKED.value + + @property + def status(self): + return JobStatus(self.status_value) + + @status.setter + def status(self, status: JobStatus): + self.status_value = status.value + + +@dataclass +class JobOutput: + status: int + stdout: str + stderr: str + + +@dataclass +class Task: + input: Job + handler: Callable[["JobRunner", Job], Awaitable[JobOutput]] + output: Optional[JobOutput] = None + + +@workflow.defn +class JobRunner: + """ + Jobs must be executed in order dictated by job dependency graph (see `job.depends_on`) and + not before `job.after_time`. + """ + + def __init__(self) -> None: + self.task_queue = OrderedDict[JobID, Task]() + self.completed_tasks = set[JobID]() + + def all_handlers_completed(self): + # We are considering adding an API like `all_handlers_completed` to SDKs. In this particular + # case, the user doesn't actually need the new API, since they are forced to track pending + # tasks in their queue implementation. + return not self.task_queue + + # Note some undesirable things: + # 1. The update handler functions have become generic enqueuers; the "real" handler functions + # are some other methods that don't have the @workflow.update decorator. + # 2. The update handler functions have to store a reference to the real handler in the queue. + # 3. The workflow `run` method is *much* more complicated and bug-prone here, compared to + # I1:WaitUntilReadyToExecuteHandler + + @workflow.run + async def run(self): + """ + Process all tasks in the queue serially, in the main workflow coroutine. + """ + # Note: there are many mistakes a user will make while trying to implement this workflow. + while not ( + workflow.info().is_continue_as_new_suggested() + and self.all_handlers_completed() + ): + await workflow.wait_condition(lambda: bool(self.task_queue)) + for id, task in list(self.task_queue.items()): + job = task.input + if job.status == JobStatus.UNBLOCKED: + await task.handler(self, job) + del self.task_queue[id] + self.completed_tasks.add(id) + for id, task in self.task_queue.items(): + if job.status == JobStatus.BLOCKED and self.ready_to_execute(job): + job.status = JobStatus.UNBLOCKED + workflow.continue_as_new() + + def ready_to_execute(self, job: Job) -> bool: + if not set(job.depends_on) <= self.completed_tasks: + return False + if after_time := job.after_time: + if float(after_time) > workflow.now().timestamp(): + return False + return True + + async def _enqueue_job_and_wait_for_result( + self, job: Job, handler: Callable[["JobRunner", Job], Awaitable[JobOutput]] + ) -> JobOutput: + task = Task(job, handler) + self.task_queue[job.id] = task + await workflow.wait_condition(lambda: task.output is not None) + # Footgun: a user might well think that they can record task completion here, but in fact it + # deadlocks. 
+ # self.completed_tasks.add(job.id) + assert task.output + return task.output + + @workflow.update + async def run_shell_script_job(self, job: Job) -> JobOutput: + return await self._enqueue_job_and_wait_for_result( + job, JobRunner._actually_run_shell_script_job + ) + + async def _actually_run_shell_script_job(self, job: Job) -> JobOutput: + if security_errors := await workflow.execute_activity( + run_shell_script_security_linter, + args=[job.run], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput(status=1, stdout="", stderr=security_errors) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + ) + return job_output + + @workflow.update + async def run_python_job(self, job: Job) -> JobOutput: + return await self._enqueue_job_and_wait_for_result( + job, JobRunner._actually_run_python_job + ) + + async def _actually_run_python_job(self, job: Job) -> JobOutput: + if not await workflow.execute_activity( + check_python_interpreter_version, + args=[job.python_interpreter_version], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput( + status=1, + stdout="", + stderr=f"Python interpreter version {job.python_interpreter_version} is not available", + ) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + ) + return job_output + + +@activity.defn +async def run_job(job: Job) -> JobOutput: + await asyncio.sleep(0.1) + stdout = f"Ran job {job.name} at {datetime.now()}" + print(stdout) + return JobOutput(status=0, stdout=stdout, stderr="") + + +@activity.defn +async def run_shell_script_security_linter(code: str) -> str: + # The user's organization requires that all shell scripts pass an in-house linter that checks + # for shell scripting constructions deemed insecure. 
+ await asyncio.sleep(0.1) + return "" + + +@activity.defn +async def check_python_interpreter_version(version: str) -> bool: + await asyncio.sleep(0.1) + version_is_available = True + return version_is_available + + +async def app(wf: WorkflowHandle): + job_1 = Job( + id="1", + depends_on=[], + after_time=None, + name="should-run-first", + run="echo 'Hello world 1!'", + python_interpreter_version=None, + ) + job_2 = Job( + id="2", + depends_on=["1"], + after_time=None, + name="should-run-second", + run="print('Hello world 2!')", + python_interpreter_version=None, + ) + await asyncio.gather( + wf.execute_update(JobRunner.run_python_job, job_2), + wf.execute_update(JobRunner.run_shell_script_job, job_1), + ) + + +async def main(): + client = await Client.connect("localhost:7233") + async with Worker( + client, + task_queue="tq", + workflows=[JobRunner], + activities=[ + run_job, + run_shell_script_security_linter, + check_python_interpreter_version, + ], + ): + wf = await client.start_workflow( + JobRunner.run, + id="wid", + task_queue="tq", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + ) + await app(wf) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + asyncio.run(main()) From 158eb78a3880fa3735400159130c25cd768d5a06 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 30 May 2024 04:02:48 -0400 Subject: [PATCH 32/41] Start sketching I2 native --- update/job_runner_I2_native.py | 163 ++++++++++++++++++++++----------- 1 file changed, 111 insertions(+), 52 deletions(-) diff --git a/update/job_runner_I2_native.py b/update/job_runner_I2_native.py index b33a68c8..86d60375 100644 --- a/update/job_runner_I2_native.py +++ b/update/job_runner_I2_native.py @@ -3,14 +3,20 @@ from dataclasses import dataclass from datetime import datetime, timedelta from enum import Enum +import inspect import logging -from typing import Awaitable, Callable, Optional +from typing import Awaitable, Callable, Optional, Type from temporalio import common, workflow, activity from temporalio.client import Client, WorkflowHandle from temporalio.worker import Worker +## +## user code +## + + JobID = str @@ -46,35 +52,90 @@ class JobOutput: stderr: str +## +## SDK internals toy prototype +## + +# TODO: use generics to satisfy serializable interface. Faking for now by using user-defined classes. +I = Job +O = JobOutput + +UpdateID = str +Workflow = Type + + @dataclass -class Task: - input: Job - handler: Callable[["JobRunner", Job], Awaitable[JobOutput]] - output: Optional[JobOutput] = None +class Update: + arg: I # real implementation will support multiple args + handler: Callable[[Workflow, I], Awaitable[O]] + output: Optional[O] = None + + @property + def id(self): + # In our real implementation the SDK will have native access to the update ID. Currently + # this example is assuming the user passes it in the update arg. 
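+        # Consequently the job ID doubles as the update ID here, so callers must keep job IDs
+        # unique across the updates they send to this workflow.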
+ return self.arg.id + + async def handle(self, wf: Workflow) -> O: + # TODO: error-handling + # TODO: prevent handling an update twice + update_result = await self.handler(wf, self.arg) + del workflow.update_queue[self.id] + return update_result + + +async def _sdk_internals_enqueue_job_and_wait_for_result( + arg: I, handler: Callable[[Type, I], Awaitable[O]] +) -> O: + update = Update(arg, handler) + workflow.update_queue[update.id] = update + await workflow.wait_condition(lambda: update.output is not None) + assert update.output + return update.output + + +class SDKInternals: + # Here, the SDK is wrapping the user's update handlers with the required enqueue-and-wait + # functionality. This is a fake implementation: the real implementation will automatically + # inspect and wrap the user's declared update handlers. + + @workflow.update + async def run_shell_script_job(self, arg: I) -> O: + handler = getattr(self.__class__, "_" + inspect.currentframe().f_code.co_name) + return await _sdk_internals_enqueue_job_and_wait_for_result(arg, handler) + + @workflow.update + async def run_python_job(self, arg: I) -> O: + handler = getattr(self.__class__, "_" + inspect.currentframe().f_code.co_name) + return await _sdk_internals_enqueue_job_and_wait_for_result(arg, handler) + + +# Monkey-patch proposed new public API +setattr(workflow, "update_queue", OrderedDict[UpdateID, Update]()) +# The queue-processing style doesn't need an `all_handlers_completed` API: this condition is true +# iff workflow.update_queue is empty. + +## +## END SDK internals prototype +## @workflow.defn -class JobRunner: +class JobRunner(SDKInternals): """ Jobs must be executed in order dictated by job dependency graph (see `job.depends_on`) and not before `job.after_time`. """ def __init__(self) -> None: - self.task_queue = OrderedDict[JobID, Task]() + super().__init__() self.completed_tasks = set[JobID]() - def all_handlers_completed(self): - # We are considering adding an API like `all_handlers_completed` to SDKs. In this particular - # case, the user doesn't actually need the new API, since they are forced to track pending - # tasks in their queue implementation. - return not self.task_queue + # Note some desirable things: + # 1. The update handler functions are now "real" handler functions # Note some undesirable things: - # 1. The update handler functions have become generic enqueuers; the "real" handler functions - # are some other methods that don't have the @workflow.update decorator. - # 2. The update handler functions have to store a reference to the real handler in the queue. - # 3. The workflow `run` method is *much* more complicated and bug-prone here, compared to + # 1. The workflow `run` method is still *much* more complicated and bug-prone here, compared to # I1:WaitUntilReadyToExecuteHandler @workflow.run @@ -82,19 +143,30 @@ async def run(self): """ Process all tasks in the queue serially, in the main workflow coroutine. """ - # Note: there are many mistakes a user will make while trying to implement this workflow. 
- while not ( - workflow.info().is_continue_as_new_suggested() - and self.all_handlers_completed() + # Note: a user will make mistakes while trying to implement this workflow, due to the + # unblocking algorithm that this particular example seems to require when implemented via + # queue-processing in the main workflow coroutine (this example is simpler to implement by + # making each handler invocation wait until it should execute, and allowing the execution to + # take place in the handler coroutine, with a mutex held. See job_runner_I1.py and + # job_runner_I1_native.py) + while ( + workflow.update_queue or not workflow.info().is_continue_as_new_suggested() ): - await workflow.wait_condition(lambda: bool(self.task_queue)) - for id, task in list(self.task_queue.items()): - job = task.input + await workflow.wait_condition(lambda: bool(workflow.update_queue)) + for id, update in list(workflow.update_queue.items()): + job = update.arg if job.status == JobStatus.UNBLOCKED: - await task.handler(self, job) - del self.task_queue[id] + # This is how a user manually handles an update. Note that it takes a reference + # to the workflow instance, since an update handler has access to the workflow + # instance. + await update.handle(self) + + # FIXME: unbounded memory usage; this example use-case needs to know which + # updates have completed. Perhaps the real problem here lies with the example, + # i.e. the example needs to be made more realistic. self.completed_tasks.add(id) - for id, task in self.task_queue.items(): + for id, update in workflow.update_queue.items(): + job = update.arg if job.status == JobStatus.BLOCKED and self.ready_to_execute(job): job.status = JobStatus.UNBLOCKED workflow.continue_as_new() @@ -107,25 +179,17 @@ def ready_to_execute(self, job: Job) -> bool: return False return True - async def _enqueue_job_and_wait_for_result( - self, job: Job, handler: Callable[["JobRunner", Job], Awaitable[JobOutput]] - ) -> JobOutput: - task = Task(job, handler) - self.task_queue[job.id] = task - await workflow.wait_condition(lambda: task.output is not None) - # Footgun: a user might well think that they can record task completion here, but in fact it - # deadlocks. - # self.completed_tasks.add(job.id) - assert task.output - return task.output - - @workflow.update - async def run_shell_script_job(self, job: Job) -> JobOutput: - return await self._enqueue_job_and_wait_for_result( - job, JobRunner._actually_run_shell_script_job - ) - - async def _actually_run_shell_script_job(self, job: Job) -> JobOutput: + # These are the real handler functions. When we implement SDK support, these will use the + # @workflow.update decorator and will not use an underscore prefix. 
+ # TBD update decorator argument name: + # queue=True + # enqueue=True + # auto=False + # auto_handle=False + # manual=True + + # @workflow.update(queue=True) + async def _run_shell_script_job(self, job: Job) -> JobOutput: if security_errors := await workflow.execute_activity( run_shell_script_security_linter, args=[job.run], @@ -137,13 +201,8 @@ async def _actually_run_shell_script_job(self, job: Job) -> JobOutput: ) return job_output - @workflow.update - async def run_python_job(self, job: Job) -> JobOutput: - return await self._enqueue_job_and_wait_for_result( - job, JobRunner._actually_run_python_job - ) - - async def _actually_run_python_job(self, job: Job) -> JobOutput: + # @workflow.update(queue=True) + async def _run_python_job(self, job: Job) -> JobOutput: if not await workflow.execute_activity( check_python_interpreter_version, args=[job.python_interpreter_version], From 87052fb67d2c07466b8c1cb2eac9b47b4df0206f Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 30 May 2024 07:04:16 -0400 Subject: [PATCH 33/41] Job runner I1 native base --- update/job_runner_I1_native.py | 183 +++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 update/job_runner_I1_native.py diff --git a/update/job_runner_I1_native.py b/update/job_runner_I1_native.py new file mode 100644 index 00000000..05eb4646 --- /dev/null +++ b/update/job_runner_I1_native.py @@ -0,0 +1,183 @@ +import asyncio +from dataclasses import dataclass +from datetime import datetime, timedelta +import logging +from typing import Optional + +from temporalio import common, workflow, activity +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker + + +JobID = str + + +@dataclass +class Job: + id: JobID + depends_on: list[JobID] + after_time: Optional[int] + name: str + run: str + python_interpreter_version: Optional[str] + + +@dataclass +class JobOutput: + status: int + stdout: str + stderr: str + + +@workflow.defn +class JobRunner: + """ + Jobs must be executed in order dictated by job dependency graph (see `job.depends_on`) and + not before `job.after_time`. + """ + + def __init__(self) -> None: + self._pending_tasks = 0 + self.completed_tasks = set[JobID]() + self.handler_mutex = asyncio.Lock() + + def all_handlers_completed(self) -> bool: + # We are considering adding an API like `all_handlers_completed` to SDKs. We've added + # self._pending tasks to this workflow in lieu of it being built into the SDKs. 
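+        # If the SDK version in use exposes a built-in check (for example
+        # `workflow.all_handlers_finished()` in newer Python SDK releases), it can replace this
+        # hand-rolled counter.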
+ return not self._pending_tasks + + @workflow.run + async def run(self): + await workflow.wait_condition( + lambda: ( + workflow.info().is_continue_as_new_suggested() + and self.all_handlers_completed() + ) + ) + workflow.continue_as_new() + + def ready_to_execute(self, job: Job) -> bool: + if not set(job.depends_on) <= self.completed_tasks: + return False + if after_time := job.after_time: + if float(after_time) > workflow.now().timestamp(): + return False + return True + + @workflow.update + async def run_shell_script_job(self, job: Job) -> JobOutput: + self._pending_tasks += 1 + await workflow.wait_condition(lambda: self.ready_to_execute(job)) + await self.handler_mutex.acquire() + + try: + if security_errors := await workflow.execute_activity( + run_shell_script_security_linter, + args=[job.run], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput(status=1, stdout="", stderr=security_errors) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + ) + return job_output + finally: + # FIXME: unbounded memory usage + self.completed_tasks.add(job.id) + self.handler_mutex.release() + self._pending_tasks -= 1 + + @workflow.update + async def run_python_job(self, job: Job) -> JobOutput: + await workflow.wait_condition(lambda: self.ready_to_execute(job)) + await self.handler_mutex.acquire() + + try: + if not await workflow.execute_activity( + check_python_interpreter_version, + args=[job.python_interpreter_version], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput( + status=1, + stdout="", + stderr=f"Python interpreter version {job.python_interpreter_version} is not available", + ) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + ) + return job_output + finally: + self.handler_mutex.release() + + +@activity.defn +async def run_job(job: Job) -> JobOutput: + await asyncio.sleep(0.1) + stdout = f"Ran job {job.name} at {datetime.now()}" + print(stdout) + return JobOutput(status=0, stdout=stdout, stderr="") + + +@activity.defn +async def run_shell_script_security_linter(code: str) -> str: + # The user's organization requires that all shell scripts pass an in-house linter that checks + # for shell scripting constructions deemed insecure. 
+ await asyncio.sleep(0.1) + return "" + + +@activity.defn +async def check_python_interpreter_version(version: str) -> bool: + await asyncio.sleep(0.1) + version_is_available = True + return version_is_available + + +async def app(wf: WorkflowHandle): + job_1 = Job( + id="1", + depends_on=[], + after_time=None, + name="should-run-first", + run="echo 'Hello world 1!'", + python_interpreter_version=None, + ) + job_2 = Job( + id="2", + depends_on=["1"], + after_time=None, + name="should-run-second", + run="print('Hello world 2!')", + python_interpreter_version=None, + ) + await asyncio.gather( + wf.execute_update(JobRunner.run_python_job, job_2), + wf.execute_update(JobRunner.run_shell_script_job, job_1), + ) + + +async def main(): + client = await Client.connect("localhost:7233") + async with Worker( + client, + task_queue="tq", + workflows=[JobRunner], + activities=[ + run_job, + run_shell_script_security_linter, + check_python_interpreter_version, + ], + ): + wf = await client.start_workflow( + JobRunner.run, + id="wid", + task_queue="tq", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + ) + await app(wf) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + asyncio.run(main()) From 1473af6207467adf270b9b009cd236fdf6a8ae51 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 30 May 2024 07:32:20 -0400 Subject: [PATCH 34/41] Start sketching I1 native --- update/job_runner_I1_native.py | 105 +++++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 23 deletions(-) diff --git a/update/job_runner_I1_native.py b/update/job_runner_I1_native.py index 05eb4646..4f88557f 100644 --- a/update/job_runner_I1_native.py +++ b/update/job_runner_I1_native.py @@ -1,8 +1,10 @@ import asyncio +from contextlib import asynccontextmanager from dataclasses import dataclass from datetime import datetime, timedelta +import inspect import logging -from typing import Optional +from typing import Callable, Optional, Type from temporalio import common, workflow, activity from temporalio.client import Client, WorkflowHandle @@ -29,29 +31,91 @@ class JobOutput: stderr: str +## +## SDK internals toy prototype +## + +# TODO: use generics to satisfy serializable interface. Faking for now by using user-defined classes. +I = Job +O = JobOutput + +UpdateID = str +Workflow = Type + +_sdk_internals_pending_tasks_count = 0 +_sdk_internals_handler_mutex = asyncio.Lock() + + +def _sdk_internals_all_handlers_completed(self) -> bool: + # We are considering adding an API like `all_handlers_completed` to SDKs. We've added + # self._pending tasks to this workflow in lieu of it being built into the SDKs. + return not _sdk_internals_pending_tasks_count + + +@asynccontextmanager +async def _sdk_internals__track_pending__wait_until_ready__serialize_execution( + ready_condition: Callable[[], bool] +): + global _sdk_internals_pending_tasks_count + _sdk_internals_pending_tasks_count += 1 + await workflow.wait_condition(ready_condition) + await _sdk_internals_handler_mutex.acquire() + try: + yield + finally: + _sdk_internals_handler_mutex.release() + _sdk_internals_pending_tasks_count -= 1 + + +class SDKInternals: + # Here, the SDK is wrapping the user's update handlers with the required wait-until-ready, + # pending tasks tracking, and synchronization functionality. This is a fake implementation: the + # real implementation will automatically inspect and wrap the user's declared update handlers. 
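+    # The fake wrappers below locate the user's real handler by prepending an underscore to the
+    # update name, e.g. run_shell_script_job dispatches to _run_shell_script_job.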
+ + def ready_to_execute(self, arg: I) -> bool: + # Implemented by user + return True + + @workflow.update + async def run_shell_script_job(self, arg: I) -> O: + handler = getattr(self, "_" + inspect.currentframe().f_code.co_name) + async with _sdk_internals__track_pending__wait_until_ready__serialize_execution( + lambda: self.ready_to_execute(arg) + ): + return await handler(arg) + + @workflow.update + async def run_python_job(self, arg: I) -> O: + handler = getattr(self, "_" + inspect.currentframe().f_code.co_name) + async with _sdk_internals__track_pending__wait_until_ready__serialize_execution( + lambda: self.ready_to_execute(arg) + ): + return await handler(arg) + + +# Monkey-patch proposed new public API +setattr(workflow, "all_handlers_completed", _sdk_internals_all_handlers_completed) +## +## END SDK internals prototype +## + + @workflow.defn -class JobRunner: +class JobRunner(SDKInternals): """ Jobs must be executed in order dictated by job dependency graph (see `job.depends_on`) and not before `job.after_time`. """ def __init__(self) -> None: - self._pending_tasks = 0 self.completed_tasks = set[JobID]() - self.handler_mutex = asyncio.Lock() - - def all_handlers_completed(self) -> bool: - # We are considering adding an API like `all_handlers_completed` to SDKs. We've added - # self._pending tasks to this workflow in lieu of it being built into the SDKs. - return not self._pending_tasks @workflow.run async def run(self): await workflow.wait_condition( lambda: ( workflow.info().is_continue_as_new_suggested() - and self.all_handlers_completed() + and workflow.all_handlers_completed() ) ) workflow.continue_as_new() @@ -64,12 +128,11 @@ def ready_to_execute(self, job: Job) -> bool: return False return True - @workflow.update - async def run_shell_script_job(self, job: Job) -> JobOutput: - self._pending_tasks += 1 - await workflow.wait_condition(lambda: self.ready_to_execute(job)) - await self.handler_mutex.acquire() + # These are the real handler functions. When we implement SDK support, these will use the + # @workflow.update decorator and will not use an underscore prefix. 
+ # @workflow.update + async def _run_shell_script_job(self, job: Job) -> JobOutput: try: if security_errors := await workflow.execute_activity( run_shell_script_security_linter, @@ -84,14 +147,9 @@ async def run_shell_script_job(self, job: Job) -> JobOutput: finally: # FIXME: unbounded memory usage self.completed_tasks.add(job.id) - self.handler_mutex.release() - self._pending_tasks -= 1 - - @workflow.update - async def run_python_job(self, job: Job) -> JobOutput: - await workflow.wait_condition(lambda: self.ready_to_execute(job)) - await self.handler_mutex.acquire() + # @workflow.update + async def _run_python_job(self, job: Job) -> JobOutput: try: if not await workflow.execute_activity( check_python_interpreter_version, @@ -108,7 +166,8 @@ async def run_python_job(self, job: Job) -> JobOutput: ) return job_output finally: - self.handler_mutex.release() + # FIXME: unbounded memory usage + self.completed_tasks.add(job.id) @activity.defn From c73b8b77a62b14b1f360376a3dfbe40c611a79b7 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 30 May 2024 09:28:08 -0400 Subject: [PATCH 35/41] Try to prototype new decorator --- update/job_runner_I1_native.py | 77 +++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/update/job_runner_I1_native.py b/update/job_runner_I1_native.py index 4f88557f..4df48668 100644 --- a/update/job_runner_I1_native.py +++ b/update/job_runner_I1_native.py @@ -2,13 +2,13 @@ from contextlib import asynccontextmanager from dataclasses import dataclass from datetime import datetime, timedelta -import inspect import logging from typing import Callable, Optional, Type from temporalio import common, workflow, activity from temporalio.client import Client, WorkflowHandle from temporalio.worker import Worker +from temporalio.workflow import UpdateMethodMultiParam, _UpdateDefinition JobID = str @@ -42,6 +42,13 @@ class JobOutput: UpdateID = str Workflow = Type + +@dataclass +class Update: + id: UpdateID + arg: I + + _sdk_internals_pending_tasks_count = 0 _sdk_internals_handler_mutex = asyncio.Lock() @@ -67,41 +74,51 @@ async def _sdk_internals__track_pending__wait_until_ready__serialize_execution( _sdk_internals_pending_tasks_count -= 1 -class SDKInternals: - # Here, the SDK is wrapping the user's update handlers with the required wait-until-ready, - # pending tasks tracking, and synchronization functionality. This is a fake implementation: the - # real implementation will automatically inspect and wrap the user's declared update handlers. 
+_original_workflow_update_decorator = workflow.update - def ready_to_execute(self, arg: I) -> bool: - # Implemented by user - return True - @workflow.update - async def run_shell_script_job(self, arg: I) -> O: - handler = getattr(self, "_" + inspect.currentframe().f_code.co_name) - async with _sdk_internals__track_pending__wait_until_ready__serialize_execution( - lambda: self.ready_to_execute(arg) - ): - return await handler(arg) +def _new_workflow_update_decorator( + execute_condition: Callable[[Workflow, Update], bool], **kwargs +) -> Callable[ + [Callable[[Workflow, I], Awaitable[O]]], + UpdateMethodMultiParam[[Workflow, I], O], +]: + def decorator( + handler: Callable[[Workflow, I], Awaitable[O]] + ) -> UpdateMethodMultiParam: + async def wrapped_handler(self: Workflow, arg: I): + async with _sdk_internals__track_pending__wait_until_ready__serialize_execution( + lambda: execute_condition(self, Update(arg.id, arg)) + ): + return await handler(self, arg) + + dec = ( + _original_workflow_update_decorator(**kwargs) + if kwargs + else _original_workflow_update_decorator + ) + wrapped_handler.__name__ = handler.__name__ + fn = dec(wrapped_handler) + setattr( + fn, + "_defn", + _UpdateDefinition(name=handler.__name__, fn=fn, is_method=True), + ) + return fn - @workflow.update - async def run_python_job(self, arg: I) -> O: - handler = getattr(self, "_" + inspect.currentframe().f_code.co_name) - async with _sdk_internals__track_pending__wait_until_ready__serialize_execution( - lambda: self.ready_to_execute(arg) - ): - return await handler(arg) + return decorator # Monkey-patch proposed new public API setattr(workflow, "all_handlers_completed", _sdk_internals_all_handlers_completed) +setattr(workflow, "update", _new_workflow_update_decorator) ## ## END SDK internals prototype ## @workflow.defn -class JobRunner(SDKInternals): +class JobRunner: """ Jobs must be executed in order dictated by job dependency graph (see `job.depends_on`) and not before `job.after_time`. @@ -120,7 +137,8 @@ async def run(self): ) workflow.continue_as_new() - def ready_to_execute(self, job: Job) -> bool: + def ready_to_execute(self, update: Update) -> bool: + job = update.arg if not set(job.depends_on) <= self.completed_tasks: return False if after_time := job.after_time: @@ -128,11 +146,10 @@ def ready_to_execute(self, job: Job) -> bool: return False return True - # These are the real handler functions. When we implement SDK support, these will use the - # @workflow.update decorator and will not use an underscore prefix. + # These are the real handler functions. - # @workflow.update - async def _run_shell_script_job(self, job: Job) -> JobOutput: + @workflow.update(execute_condition=ready_to_execute) + async def run_shell_script_job(self, job: Job) -> JobOutput: try: if security_errors := await workflow.execute_activity( run_shell_script_security_linter, @@ -148,8 +165,8 @@ async def _run_shell_script_job(self, job: Job) -> JobOutput: # FIXME: unbounded memory usage self.completed_tasks.add(job.id) - # @workflow.update - async def _run_python_job(self, job: Job) -> JobOutput: + @workflow.update(execute_condition=ready_to_execute) + async def run_python_job(self, job: Job) -> JobOutput: try: if not await workflow.execute_activity( check_python_interpreter_version, From 7064b7ac3ecb691333ca6a7405d49ffbaa0bba08 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 30 May 2024 10:03:23 -0400 Subject: [PATCH 36/41] Revert "Try to prototype new decorator" This reverts commit cdfa4c9a50436642047116a7792fdcb3d2650ac3. 
--- update/job_runner_I1_native.py | 79 +++++++++++++--------------------- 1 file changed, 31 insertions(+), 48 deletions(-) diff --git a/update/job_runner_I1_native.py b/update/job_runner_I1_native.py index 4df48668..494798b3 100644 --- a/update/job_runner_I1_native.py +++ b/update/job_runner_I1_native.py @@ -2,13 +2,13 @@ from contextlib import asynccontextmanager from dataclasses import dataclass from datetime import datetime, timedelta +import inspect import logging from typing import Callable, Optional, Type from temporalio import common, workflow, activity from temporalio.client import Client, WorkflowHandle from temporalio.worker import Worker -from temporalio.workflow import UpdateMethodMultiParam, _UpdateDefinition JobID = str @@ -42,13 +42,6 @@ class JobOutput: UpdateID = str Workflow = Type - -@dataclass -class Update: - id: UpdateID - arg: I - - _sdk_internals_pending_tasks_count = 0 _sdk_internals_handler_mutex = asyncio.Lock() @@ -74,51 +67,41 @@ async def _sdk_internals__track_pending__wait_until_ready__serialize_execution( _sdk_internals_pending_tasks_count -= 1 -_original_workflow_update_decorator = workflow.update - +class SDKInternals: + # Here, the SDK is wrapping the user's update handlers with the required wait-until-ready, + # pending tasks tracking, and synchronization functionality. This is a fake implementation: the + # real implementation will automatically inspect and wrap the user's declared update handlers. -def _new_workflow_update_decorator( - execute_condition: Callable[[Workflow, Update], bool], **kwargs -) -> Callable[ - [Callable[[Workflow, I], Awaitable[O]]], - UpdateMethodMultiParam[[Workflow, I], O], -]: - def decorator( - handler: Callable[[Workflow, I], Awaitable[O]] - ) -> UpdateMethodMultiParam: - async def wrapped_handler(self: Workflow, arg: I): - async with _sdk_internals__track_pending__wait_until_ready__serialize_execution( - lambda: execute_condition(self, Update(arg.id, arg)) - ): - return await handler(self, arg) + def ready_to_execute(self, arg: I) -> bool: + # Implemented by user + return True - dec = ( - _original_workflow_update_decorator(**kwargs) - if kwargs - else _original_workflow_update_decorator - ) - wrapped_handler.__name__ = handler.__name__ - fn = dec(wrapped_handler) - setattr( - fn, - "_defn", - _UpdateDefinition(name=handler.__name__, fn=fn, is_method=True), - ) - return fn + @workflow.update + async def run_shell_script_job(self, arg: I) -> O: + handler = getattr(self, "_" + inspect.currentframe().f_code.co_name) + async with _sdk_internals__track_pending__wait_until_ready__serialize_execution( + lambda: self.ready_to_execute(arg) + ): + return await handler(arg) - return decorator + @workflow.update + async def run_python_job(self, arg: I) -> O: + handler = getattr(self, "_" + inspect.currentframe().f_code.co_name) + async with _sdk_internals__track_pending__wait_until_ready__serialize_execution( + lambda: self.ready_to_execute(arg) + ): + return await handler(arg) # Monkey-patch proposed new public API setattr(workflow, "all_handlers_completed", _sdk_internals_all_handlers_completed) -setattr(workflow, "update", _new_workflow_update_decorator) ## ## END SDK internals prototype ## @workflow.defn -class JobRunner: +class JobRunner(SDKInternals): """ Jobs must be executed in order dictated by job dependency graph (see `job.depends_on`) and not before `job.after_time`. 
@@ -132,13 +115,12 @@ async def run(self): await workflow.wait_condition( lambda: ( workflow.info().is_continue_as_new_suggested() - and workflow.all_handlers_completed() + and self.all_handlers_completed() ) ) workflow.continue_as_new() - def ready_to_execute(self, update: Update) -> bool: - job = update.arg + def ready_to_execute(self, job: Job) -> bool: if not set(job.depends_on) <= self.completed_tasks: return False if after_time := job.after_time: @@ -146,10 +128,11 @@ def ready_to_execute(self, update: Update) -> bool: return False return True - # These are the real handler functions. + # These are the real handler functions. When we implement SDK support, these will use the + # @workflow.update decorator and will not use an underscore prefix. - @workflow.update(execute_condition=ready_to_execute) - async def run_shell_script_job(self, job: Job) -> JobOutput: + # @workflow.update + async def _run_shell_script_job(self, job: Job) -> JobOutput: try: if security_errors := await workflow.execute_activity( run_shell_script_security_linter, @@ -165,8 +148,8 @@ async def run_shell_script_job(self, job: Job) -> JobOutput: # FIXME: unbounded memory usage self.completed_tasks.add(job.id) - @workflow.update(execute_condition=ready_to_execute) - async def run_python_job(self, job: Job) -> JobOutput: + # @workflow.update + async def _run_python_job(self, job: Job) -> JobOutput: try: if not await workflow.execute_activity( check_python_interpreter_version, From 0f335b4383b6f843f078ea483b424992308313bb Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 30 May 2024 10:10:34 -0400 Subject: [PATCH 37/41] Continue sketching --- update/job_runner_I1_native.py | 89 ++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/update/job_runner_I1_native.py b/update/job_runner_I1_native.py index 494798b3..74a6d9ca 100644 --- a/update/job_runner_I1_native.py +++ b/update/job_runner_I1_native.py @@ -42,6 +42,13 @@ class JobOutput: UpdateID = str Workflow = Type + +@dataclass +class Update: + id: UpdateID + arg: I + + _sdk_internals_pending_tasks_count = 0 _sdk_internals_handler_mutex = asyncio.Lock() @@ -54,11 +61,11 @@ def _sdk_internals_all_handlers_completed(self) -> bool: @asynccontextmanager async def _sdk_internals__track_pending__wait_until_ready__serialize_execution( - ready_condition: Callable[[], bool] + execute_condition: Callable[[], bool] ): global _sdk_internals_pending_tasks_count _sdk_internals_pending_tasks_count += 1 - await workflow.wait_condition(ready_condition) + await workflow.wait_condition(execute_condition) await _sdk_internals_handler_mutex.acquire() try: yield @@ -72,15 +79,15 @@ class SDKInternals: # pending tasks tracking, and synchronization functionality. This is a fake implementation: the # real implementation will automatically inspect and wrap the user's declared update handlers. 
- def ready_to_execute(self, arg: I) -> bool: - # Implemented by user + def ready_to_execute(self, update: Update) -> bool: + # Overridden by users who wish to control order of execution return True @workflow.update async def run_shell_script_job(self, arg: I) -> O: handler = getattr(self, "_" + inspect.currentframe().f_code.co_name) async with _sdk_internals__track_pending__wait_until_ready__serialize_execution( - lambda: self.ready_to_execute(arg) + lambda: self.ready_to_execute(Update(arg.id, arg)) ): return await handler(arg) @@ -88,13 +95,14 @@ async def run_shell_script_job(self, arg: I) -> O: async def run_python_job(self, arg: I) -> O: handler = getattr(self, "_" + inspect.currentframe().f_code.co_name) async with _sdk_internals__track_pending__wait_until_ready__serialize_execution( - lambda: self.ready_to_execute(arg) + lambda: self.ready_to_execute(Update(arg.id, arg)) ): return await handler(arg) # Monkey-patch proposed new public API setattr(workflow, "all_handlers_completed", _sdk_internals_all_handlers_completed) +setattr(workflow, "Update", Update) ## ## END SDK internals prototype ## @@ -115,12 +123,13 @@ async def run(self): await workflow.wait_condition( lambda: ( workflow.info().is_continue_as_new_suggested() - and self.all_handlers_completed() + and workflow.all_handlers_completed() ) ) workflow.continue_as_new() - def ready_to_execute(self, job: Job) -> bool: + def ready_to_execute(self, update: workflow.Update) -> bool: + job = update.arg if not set(job.depends_on) <= self.completed_tasks: return False if after_time := job.after_time: @@ -129,45 +138,41 @@ def ready_to_execute(self, job: Job) -> bool: return True # These are the real handler functions. When we implement SDK support, these will use the - # @workflow.update decorator and will not use an underscore prefix. + # decorator form commented out below, and will not use an underscore prefix. 
- # @workflow.update + # @workflow.update(execute_condition=ready_to_execute) async def _run_shell_script_job(self, job: Job) -> JobOutput: - try: - if security_errors := await workflow.execute_activity( - run_shell_script_security_linter, - args=[job.run], - start_to_close_timeout=timedelta(seconds=10), - ): - return JobOutput(status=1, stdout="", stderr=security_errors) - job_output = await workflow.execute_activity( - run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) - ) - return job_output - finally: - # FIXME: unbounded memory usage - self.completed_tasks.add(job.id) + if security_errors := await workflow.execute_activity( + run_shell_script_security_linter, + args=[job.run], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput(status=1, stdout="", stderr=security_errors) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + ) + # FIXME: unbounded memory usage + self.completed_tasks.add(job.id) + return job_output - # @workflow.update + # @workflow.update(execute_condition=ready_to_execute) async def _run_python_job(self, job: Job) -> JobOutput: - try: - if not await workflow.execute_activity( - check_python_interpreter_version, - args=[job.python_interpreter_version], - start_to_close_timeout=timedelta(seconds=10), - ): - return JobOutput( - status=1, - stdout="", - stderr=f"Python interpreter version {job.python_interpreter_version} is not available", - ) - job_output = await workflow.execute_activity( - run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + if not await workflow.execute_activity( + check_python_interpreter_version, + args=[job.python_interpreter_version], + start_to_close_timeout=timedelta(seconds=10), + ): + return JobOutput( + status=1, + stdout="", + stderr=f"Python interpreter version {job.python_interpreter_version} is not available", ) - return job_output - finally: - # FIXME: unbounded memory usage - self.completed_tasks.add(job.id) + job_output = await workflow.execute_activity( + run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) + ) + # FIXME: unbounded memory usage + self.completed_tasks.add(job.id) + return job_output @activity.defn From d2cf21fc95753d28426611bb2106786820841c24 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 30 May 2024 12:05:02 -0400 Subject: [PATCH 38/41] Add top-level explanatory comments to files --- update/job_runner_I1_native.py | 4 ++++ update/job_runner_I2_native.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/update/job_runner_I1_native.py b/update/job_runner_I1_native.py index 74a6d9ca..762a647d 100644 --- a/update/job_runner_I1_native.py +++ b/update/job_runner_I1_native.py @@ -10,6 +10,10 @@ from temporalio.client import Client, WorkflowHandle from temporalio.worker import Worker +# This file contains a proposal for how the Python SDK could provide I1:WaitUntilReadyToExecute +# functionality to help users defer processing, control interleaving of handler coroutines, and +# ensure processing is complete before workflow completion. 
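+# In short: each wrapped handler waits for a user-defined readiness condition, runs under a shared
+# lock so handler coroutines do not interleave, and is counted so the main workflow coroutine can
+# wait for all handlers to finish before calling continue-as-new.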
+ JobID = str diff --git a/update/job_runner_I2_native.py b/update/job_runner_I2_native.py index 86d60375..e7a439b4 100644 --- a/update/job_runner_I2_native.py +++ b/update/job_runner_I2_native.py @@ -11,6 +11,9 @@ from temporalio.client import Client, WorkflowHandle from temporalio.worker import Worker +# This file contains a proposal for how the Python SDK could provide a native update queue +# (I2:PushToQueue) to help users defer processing, control interleaving of handler coroutines, and +# ensure processing is complete before workflow completion. ## ## user code From bd9fc670d175320717c0a109b56145a75719d532 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 30 May 2024 12:09:47 -0400 Subject: [PATCH 39/41] rm notes --- update/job_runner_notes.py | 409 ------------------------------------- 1 file changed, 409 deletions(-) delete mode 100644 update/job_runner_notes.py diff --git a/update/job_runner_notes.py b/update/job_runner_notes.py deleted file mode 100644 index e3b7999a..00000000 --- a/update/job_runner_notes.py +++ /dev/null @@ -1,409 +0,0 @@ -import asyncio -from asyncio import Future -from collections import deque -from datetime import datetime, timedelta -import logging -from sys import version -from typing import Any, Iterator, Optional, TypedDict, Union - -from attr import dataclass -from temporalio import common, workflow, activity -from temporalio.client import Client, WorkflowHandle -from temporalio.worker import Worker - - -############################################################################### -## SDK internals -## -@dataclass -class Message: - type: str # maps to a handler if a matching handler exists - args: tuple[Any] # deserialized arg payloads - # Can expose other update/signal metadata - received_at: datetime - client_identity: str - - async def handle(self): - await workflow.call_handler(self.type, *self.args) - - -@dataclass -class Signal(Message): - pass - - -@dataclass -class Update(Message): - id: str # the update ID - - -# -# Raw incoming workflow events -# -# The incoming event stream contains newly delivered updates and signals. Perhaps -# ContinueAsNewSuggested could also be an event. -# -# We could have "UpdateReceived" as the incoming event, with UpdateAccepted/UpdateRejected emitted -# later. However, An alternative is that the SDK executes the update validators immediately, before -# the user has a chance to interact with the event stream. We'll adopt that version for now, since -# it involves fewer event types. -@dataclass -class SignalReceived: - signal: Signal - - -@dataclass -class UpdateRejected: - update: Update - - -@dataclass -class UpdateAccepted: - update: Update - - -@dataclass -class ContinueAsNewSuggested: - pass - - -IncomingWorkflowEvent = Union[ - UpdateAccepted, UpdateRejected, SignalReceived, ContinueAsNewSuggested -] - - -def workflow_incoming_event_stream() -> Iterator[IncomingWorkflowEvent]: ... - - -setattr(workflow, "incoming_event_stream", workflow_incoming_event_stream) -# -# Other events that are emitted automatically by a running workflow -# - - -@dataclass -class UpdateCompleted: - pass - - -class SignalHandlerReturned: - # This is tangential to work on updates. Introducing this event would introduce a new concept of - # "signal-processing finished", which could be used to help users wait for signal processing to - # finish before CAN / workflow return. The idea is that the event would be emitted when a signal - # handler returns. 
- pass - - -# -# Events that users can add to the event stream -# - - -class TimerFired: - pass - - -class CustomFutureResolved: - pass - - -EventStream = Iterator[ - Union[ - SignalReceived, - UpdateRejected, - UpdateAccepted, - ContinueAsNewSuggested, - UpdateCompleted, - SignalHandlerReturned, - TimerFired, - CustomFutureResolved, - ] -] - - -class Selector: - def get_event_stream(self) -> EventStream: ... - - -# By default, a workflow behaves as if it is doing the following: -def handle_incoming_events(): - for ev in workflow.incoming_event_stream(): - match ev: - case SignalReceived(signal): - asyncio.create_task(signal.handle()) - case UpdateAccepted(update): - asyncio.create_task(update.handle()) - - -HandlerType = str -UpdateID = str - - -# This class is just a toy prototype: this functionality will be implemented on WorkflowInstance in -# the SDK. -class WorkflowInternals: - def __init__(self): - self.incoming_event_stream = deque[IncomingWorkflowEvent]() - self.ready_to_execute_handler: dict[UpdateID, Future[None]] = {} - - def accept_update(self, update: Update): - # This will be done by the SDK prior to invoking handler, around - # https://github.com/temporalio/sdk-python/blob/11a97d1ab2ebfe8c973bf396b1e14077ec611e52/temporalio/worker/_workflow_instance.py#L506 - self.incoming_event_stream.append(UpdateAccepted(update)) - - # By default, handlers are ready to execute immediately after the update is accepted. - self.ready_to_execute_handler[update.id] = resolved_future(None) - - async def _wait_until_ready_to_execute(self, update_id: UpdateID): - await self.ready_to_execute_handler[update_id] - - -def resolved_future[X](result: X) -> Future[X]: - fut = Future() - fut.set_result(result) - return fut - - -workflow_internals = WorkflowInternals() - -############################################################################### -## -## User's code -## - -# A user may want to handle the event stream differently. - -# workflow API design must make the following two things convenient for users: -# - DrainBeforeWorkflowCompletion -# - NoInterleaving, optionally with CustomOrder - - -def make_event_stream() -> EventStream: - selector = Selector() - return selector.get_event_stream() - - -event_stream = make_event_stream() - -# -JobId = int -ClusterSlot = str - - -class Job(TypedDict): - update_id: UpdateID - depends_on: list[UpdateID] - after_time: Optional[int] - name: str - run: str - python_interpreter_version: Optional[str] - - -class JobOutput(TypedDict): - status: int - stdout: str - stderr: str - - -@workflow.defn -class JobRunner: - - def __init__(self): - self.jobs: dict[JobId, Job] = {} - - # Design notes - # ------------ - # Updates always have handler functions, and an update handler function is still just a normal - # handler function: it implements the handling logic and the return value. - # - # Every workflow will have an underlying event stream. By default, this yields the following - # events: - # - # - UpdateRejected - # - UpdateAccepted (UpdateEnqueued) - # - UpdateDequeued - # - UpdateCompleted - # - # The SDK will provide a default implementation of the event stream, looking something like this: - - # - # The handler will be invoked automatically by the SDK when the underlying event stream yields - # an UpdateDequeued event for this update ID. The user does not have to know anything about - # this: by default, handlers are executed before other workflow code, in order of update - # arrival. 
- - # The SDK is capable of automatically draining the event stream before workflow return / CAN, - # including an option for this draining to result in serial execution of the handlers (i.e. - # waiting for all async work scheduled by the handler to complete before the next handler is - # invoked, and not allowing the workflow to complete until all such work is complete.) Default - # behavior TBD. - # - # The event stream thus provides a way for users to wait until a specific message handler has - # completed, or until all message handlers have completed. These can be exposed via convenient - # `wait_for_X()` APIs, rather than interacting with the raw event stream - # - # Furthermore, users can optionally implement the EventStream themselves. This gives them - # precise control over the ordering of handler invocation with respect to all other workflow - # events (e.g. other update completions, and custom futures such as timers and - # workflow.wait_condition). - # - # TODO: does handler invocation remain automatic on yielding Dequeue, or is that too magical. An - # alternative would be for users to be able to call update.handle() on an update object obtained - # from an event object yielded by the event stream. - - @workflow.update - async def run_shell_script_job(self, job: Job) -> JobOutput: - """ - To be executed in order dictated by job dependency graph (see `jobs.depends_on`) and not - before `job.after_time`. - """ - ## SDK internals: please pretend this is implemented in the SDK - await workflow_internals._wait_until_ready_to_execute(job["update_id"]) - ## - - if security_errors := await workflow.execute_activity( - run_shell_script_security_linter, - args=[job["run"]], - start_to_close_timeout=timedelta(seconds=10), - ): - return JobOutput(status=1, stdout="", stderr=security_errors) - job_output = await workflow.execute_activity( - run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) - ) # SDK emits UpdateCompleted - return job_output - - @workflow.update - async def run_python_job(self, job: Job) -> JobOutput: - """ - To be executed in order dictated by job dependency graph (see `jobs.depends_on`) and not - before `job.after_time`. 
- """ - ## SDK internals: please pretend this is implemented in the SDK - await workflow_internals._wait_until_ready_to_execute(job["update_id"]) - ## - - if not await workflow.execute_activity( - check_python_interpreter_version, - args=[job["python_interpreter_version"]], - start_to_close_timeout=timedelta(seconds=10), - ): - return JobOutput( - status=1, - stdout="", - stderr=f"Python interpreter version {version} is not available", - ) - job_output = await workflow.execute_activity( - run_job, args=[job], start_to_close_timeout=timedelta(seconds=10) - ) # SDK emits UpdateCompleted - return job_output - - @run_shell_script_job.validator - def validate_shell_script_job(self, job: Job): - ## SDK internals: please pretend this is implemented in the SDK - workflow_internals.accept_update( - Update( - type="run_shell_script_job", - args=(job,), - client_identity="some-client-id", - id=job["update_id"], - received_at=workflow.now(), - ) - ) - ## - - @run_python_job.validator - def validate_python_job(self, job: Job): - ## SDK internals: please pretend this is implemented in the SDK - workflow_internals.accept_update( - Update( - type="run_python_job", - args=(job,), - client_identity="some-client-id", - id=job["update_id"], - received_at=workflow.now(), - ) - ) - ## - - @workflow.run - async def run(self): - while not workflow.info().is_continue_as_new_suggested(): - await workflow.wait_condition(lambda: len(self.jobs) > 0) - workflow.continue_as_new() - - -@activity.defn -async def run_job(job: Job) -> JobOutput: - await asyncio.sleep(0.1) - stdout = f"Ran job {job["name"]} at {datetime.now()}" - print(stdout) - return JobOutput(status=0, stdout=stdout, stderr="") - - -@activity.defn -async def request_cluster_slot(job: Job) -> ClusterSlot: - await asyncio.sleep(0.1) - return "cluster-slot-token-abc123" - - -@activity.defn -async def run_shell_script_security_linter(code: str) -> str: - # The user's organization requires that all shell scripts pass an in-house linter that checks - # for shell scripting constructions deemed insecure. 
- await asyncio.sleep(0.1) - return "" - - -@activity.defn -async def check_python_interpreter_version(version: str) -> bool: - await asyncio.sleep(0.1) - version_is_available = True - return version_is_available - - -async def app(wf: WorkflowHandle): - job_1 = Job( - update_id="1", - depends_on=[], - after_time=None, - name="should-run-first", - run="echo 'Hello world 1!'", - python_interpreter_version=None, - ) - job_2 = Job( - update_id="2", - depends_on=["1"], - after_time=None, - name="should-run-second", - run="print('Hello world 2!')", - python_interpreter_version=None, - ) - job_2 = await wf.execute_update(JobRunner.run_python_job, job_2) - job_1 = await wf.execute_update(JobRunner.run_shell_script_job, job_1) - - -async def main(): - client = await Client.connect("localhost:7233") - async with Worker( - client, - task_queue="tq", - workflows=[JobRunner], - activities=[ - run_job, - run_shell_script_security_linter, - check_python_interpreter_version, - request_cluster_slot, - ], - ): - wf = await client.start_workflow( - JobRunner.run, - id="wid", - task_queue="tq", - id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, - ) - await app(wf) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - asyncio.run(main()) From 569eed18f77d0f1f61802c4332f6e40e300e9427 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Thu, 30 May 2024 12:26:47 -0400 Subject: [PATCH 40/41] Introduce max_concurrent --- update/job_runner_I1_native.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/update/job_runner_I1_native.py b/update/job_runner_I1_native.py index 762a647d..72807f24 100644 --- a/update/job_runner_I1_native.py +++ b/update/job_runner_I1_native.py @@ -64,12 +64,13 @@ def _sdk_internals_all_handlers_completed(self) -> bool: @asynccontextmanager -async def _sdk_internals__track_pending__wait_until_ready__serialize_execution( +async def _sdk_internals__track_pending__wait_until_ready__synchronize( execute_condition: Callable[[], bool] ): global _sdk_internals_pending_tasks_count _sdk_internals_pending_tasks_count += 1 await workflow.wait_condition(execute_condition) + # TODO: honor max_concurrent, using a semaphore (if not None). await _sdk_internals_handler_mutex.acquire() try: yield @@ -90,7 +91,7 @@ def ready_to_execute(self, update: Update) -> bool: @workflow.update async def run_shell_script_job(self, arg: I) -> O: handler = getattr(self, "_" + inspect.currentframe().f_code.co_name) - async with _sdk_internals__track_pending__wait_until_ready__serialize_execution( + async with _sdk_internals__track_pending__wait_until_ready__synchronize( lambda: self.ready_to_execute(Update(arg.id, arg)) ): return await handler(arg) @@ -98,7 +99,7 @@ async def run_shell_script_job(self, arg: I) -> O: @workflow.update async def run_python_job(self, arg: I) -> O: handler = getattr(self, "_" + inspect.currentframe().f_code.co_name) - async with _sdk_internals__track_pending__wait_until_ready__serialize_execution( + async with _sdk_internals__track_pending__wait_until_ready__synchronize( lambda: self.ready_to_execute(Update(arg.id, arg)) ): return await handler(arg) @@ -144,7 +145,7 @@ def ready_to_execute(self, update: workflow.Update) -> bool: # These are the real handler functions. When we implement SDK support, these will use the # decorator form commented out below, and will not use an underscore prefix. 
- # @workflow.update(execute_condition=ready_to_execute) + # @workflow.update(max_concurrent=1, execute_condition=ready_to_execute) async def _run_shell_script_job(self, job: Job) -> JobOutput: if security_errors := await workflow.execute_activity( run_shell_script_security_linter, @@ -159,7 +160,7 @@ async def _run_shell_script_job(self, job: Job) -> JobOutput: self.completed_tasks.add(job.id) return job_output - # @workflow.update(execute_condition=ready_to_execute) + # @workflow.update(max_concurrent=1, execute_condition=ready_to_execute) async def _run_python_job(self, job: Job) -> JobOutput: if not await workflow.execute_activity( check_python_interpreter_version, From 139100d2b72e9861a43eae159cd2eec0736e0c0a Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Fri, 31 May 2024 17:07:53 -0400 Subject: [PATCH 41/41] Resurrect potentially useful additional sample with test cc @drewhoskins --- .../serialized_handling_of_n_messages.py | 95 +++++++++++++++ update/serialized_handling_of_n_messages.py | 114 ++++++++++++++++++ 2 files changed, 209 insertions(+) create mode 100644 tests/update/serialized_handling_of_n_messages.py create mode 100644 update/serialized_handling_of_n_messages.py diff --git a/tests/update/serialized_handling_of_n_messages.py b/tests/update/serialized_handling_of_n_messages.py new file mode 100644 index 00000000..5c78af37 --- /dev/null +++ b/tests/update/serialized_handling_of_n_messages.py @@ -0,0 +1,95 @@ +import asyncio +import logging +import uuid +from dataclasses import dataclass +from unittest.mock import patch + +import temporalio.api.common.v1 +import temporalio.api.enums.v1 +import temporalio.api.update.v1 +import temporalio.api.workflowservice.v1 +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker +from temporalio.workflow import UpdateMethodMultiParam + +from update.serialized_handling_of_n_messages import ( + MessageProcessor, + Result, + get_current_time, +) + + +async def test_continue_as_new_doesnt_lose_updates(client: Client): + with patch( + "temporalio.workflow.Info.is_continue_as_new_suggested", return_value=True + ): + tq = str(uuid.uuid4()) + wf = await client.start_workflow( + MessageProcessor.run, id=str(uuid.uuid4()), task_queue=tq + ) + update_requests = [ + UpdateRequest(wf, MessageProcessor.process_message, i) for i in range(10) + ] + for req in update_requests: + await req.wait_until_admitted() + + async with Worker( + client, + task_queue=tq, + workflows=[MessageProcessor], + activities=[get_current_time], + ): + for req in update_requests: + update_result = await req.task + assert update_result.startswith(req.expected_result_prefix()) + + +@dataclass +class UpdateRequest: + wf_handle: WorkflowHandle + update: UpdateMethodMultiParam + sequence_number: int + + def __post_init__(self): + self.task = asyncio.Task[Result]( + self.wf_handle.execute_update(self.update, args=[self.arg], id=self.id) + ) + + async def wait_until_admitted(self): + while True: + try: + return await self._poll_update_non_blocking() + except Exception as err: + logging.warning(err) + + async def _poll_update_non_blocking(self): + req = temporalio.api.workflowservice.v1.PollWorkflowExecutionUpdateRequest( + namespace=self.wf_handle._client.namespace, + update_ref=temporalio.api.update.v1.UpdateRef( + workflow_execution=temporalio.api.common.v1.WorkflowExecution( + workflow_id=self.wf_handle.id, + run_id="", + ), + update_id=self.id, + ), + identity=self.wf_handle._client.identity, + ) + res = await 
self.wf_handle._client.workflow_service.poll_workflow_execution_update(
+            req
+        )
+        # TODO: @cretz how do we work with these raw proto objects?
+        assert "stage: UPDATE_WORKFLOW_EXECUTION_LIFECYCLE_STAGE_ADMITTED" in str(res)
+
+    @property
+    def arg(self) -> str:
+        return str(self.sequence_number)
+
+    @property
+    def id(self) -> str:
+        return str(self.sequence_number)
+
+    def expected_result_prefix(self) -> str:
+        # TODO: Currently the server does not send updates to the worker in order of admission. When
+        # this is fixed (https://github.com/temporalio/temporal/pull/5831), we can make a stronger
+        # assertion about the activity numbers used to construct each result.
+        return f"{self.arg}-result"
diff --git a/update/serialized_handling_of_n_messages.py b/update/serialized_handling_of_n_messages.py
new file mode 100644
index 00000000..4c9a0d5c
--- /dev/null
+++ b/update/serialized_handling_of_n_messages.py
@@ -0,0 +1,114 @@
+import asyncio
+import logging
+from asyncio import Future
+from collections import deque
+from datetime import timedelta
+
+from temporalio import activity, common, workflow
+from temporalio.client import Client, WorkflowHandle
+from temporalio.worker import Worker
+
+Arg = str
+Result = str
+
+# Problem:
+# -------
+# - Your workflow receives an unbounded number of updates.
+# - Each update must be processed by calling two activities.
+# - The next update may not start processing until the previous has completed.
+
+# Solution:
+# --------
+# Enqueue updates, and process items from the queue in a single coroutine (the main workflow
+# coroutine).
+
+# Discussion:
+# ----------
+# The queue is used because Temporal's async update & signal handlers will interleave if they
+# contain multiple yield points. An alternative would be to use standard async handler functions,
+# with handling being done with an asyncio.Lock held. The queue approach would be necessary if we
+# need to process in an order other than arrival.
+
+
+@workflow.defn
+class MessageProcessor:
+
+    def __init__(self):
+        self.queue = deque[tuple[Arg, Future[Result]]]()
+
+    @workflow.run
+    async def run(self):
+        while True:
+            await workflow.wait_condition(lambda: len(self.queue) > 0)
+            while self.queue:
+                arg, fut = self.queue.popleft()
+                fut.set_result(await self.execute_processing_task(arg))
+            if workflow.info().is_continue_as_new_suggested():
+                # Footgun: If we don't let the event loop tick, then CAN will end the workflow
+                # before the update handler is notified that the result future has completed.
+                # See https://github.com/temporalio/features/issues/481
+                await asyncio.sleep(0)  # Let update handler complete
+                print("CAN")
+                return workflow.continue_as_new()
+
+    # Note: handler must be async if we are both enqueuing and returning an update result
+    # => We could add SDK APIs to manually complete updates.
+    @workflow.update
+    async def process_message(self, arg: Arg) -> Result:
+        # Footgun: handler may need to wait for workflow initialization after CAN
+        # See https://github.com/temporalio/features/issues/400
+        # await workflow.wait_condition(lambda: hasattr(self, "queue"))
+        fut = Future[Result]()
+        self.queue.append((arg, fut))  # Note: update validation gates enqueue
+        return await fut
+
+    async def execute_processing_task(self, arg: Arg) -> Result:
+        # The purpose of the two activities and the result string format is to permit checks that
+        # the activities of different tasks do not interleave.
+ t1, t2 = [ + await workflow.execute_activity( + get_current_time, start_to_close_timeout=timedelta(seconds=10) + ) + for _ in range(2) + ] + return f"{arg}-result-{t1}-{t2}" + + +time = 0 + + +@activity.defn +async def get_current_time() -> int: + global time + time += 1 + return time + + +async def app(wf: WorkflowHandle): + for i in range(20): + print(f"app(): sending update {i}") + result = await wf.execute_update(MessageProcessor.process_message, f"arg {i}") + print(f"app(): {result}") + + +async def main(): + client = await Client.connect("localhost:7233") + + async with Worker( + client, + task_queue="tq", + workflows=[MessageProcessor], + activities=[get_current_time], + ): + wf = await client.start_workflow( + MessageProcessor.run, + id="wid", + task_queue="tq", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + ) + await asyncio.gather(app(wf), wf.result()) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + asyncio.run(main())
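
A note on the TODO left in update/job_runner_I1_native.py ("honor max_concurrent, using a semaphore (if not None)"): one possible shape for that is sketched below. This is only a sketch, not SDK code; the helper name, the semaphore parameter, and the way the limit is threaded through are assumptions, and the real decorator would own this state per handler. asyncio.Semaphore, like asyncio.Lock, is deterministic and therefore safe to use inside workflow code.

import asyncio
from contextlib import asynccontextmanager
from typing import Callable, Optional

from temporalio import workflow

# Hypothetical sketch of the synchronization helper with max_concurrent honored.
# A semaphore of size max_concurrent bounds how many handler bodies run at once:
# max_concurrent=1 reproduces the current mutex-based serialization, and
# max_concurrent=None leaves handlers free to interleave once they are ready.
_pending_tasks_count = 0


@asynccontextmanager
async def _track_pending__wait_until_ready__synchronize(
    execute_condition: Callable[[], bool],
    semaphore: Optional[asyncio.Semaphore] = None,  # built from max_concurrent, if any
):
    global _pending_tasks_count
    _pending_tasks_count += 1
    try:
        # Defer execution until the user-supplied condition holds.
        await workflow.wait_condition(execute_condition)
        if semaphore is None:
            yield
        else:
            # Bound how many handler bodies run concurrently.
            async with semaphore:
                yield
    finally:
        _pending_tasks_count -= 1

With this shape, the wrapper generated for an update declared with max_concurrent=1 would pass asyncio.Semaphore(1), matching the current behavior, while larger limits would allow a bounded amount of interleaving between ready handlers.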
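
The Discussion comment in update/serialized_handling_of_n_messages.py mentions an alternative to the queue: ordinary async update handlers that do their processing while holding an asyncio.Lock. A minimal sketch of that alternative follows; the class name is invented, it reuses Arg, Result, and the get_current_time activity from the sample above, and it omits the continue-as-new and drain handling that the queue version deals with.

import asyncio
from datetime import timedelta

from temporalio import workflow

from update.serialized_handling_of_n_messages import Arg, Result, get_current_time


@workflow.defn
class LockingMessageProcessor:
    def __init__(self):
        # Held for the duration of each handler body, so handler executions
        # cannot interleave even though each one awaits activities.
        self.lock = asyncio.Lock()

    @workflow.run
    async def run(self):
        # Keep the workflow open; a real workflow would also handle
        # continue-as-new and wait for in-flight handlers before returning.
        await workflow.wait_condition(lambda: False)

    @workflow.update
    async def process_message(self, arg: Arg) -> Result:
        # Without the lock, the two activity calls of different updates could
        # interleave; with it, processing is serialized (in lock-acquisition
        # order, which is not necessarily arrival order).
        async with self.lock:
            t1 = await workflow.execute_activity(
                get_current_time, start_to_close_timeout=timedelta(seconds=10)
            )
            t2 = await workflow.execute_activity(
                get_current_time, start_to_close_timeout=timedelta(seconds=10)
            )
            return f"{arg}-result-{t1}-{t2}"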