Skip to content

Commit

Permalink
Allow proper stack trace on eviction deadlock (#530)
Browse files Browse the repository at this point in the history
Fixes #527
  • Loading branch information
cretz authored May 28, 2024
1 parent aa26503 commit afadc15
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 6 deletions.
5 changes: 4 additions & 1 deletion temporalio/worker/_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,10 @@ async def _handle_activation(
# TODO(cretz): Should we build a complex mechanism to continually
# try the eviction until it succeeds?
if cache_remove_job:
logger.exception("Failed running eviction job, not evicting")
logger.exception(
"Failed running eviction job, not evicting. "
+ "Since eviction could not be processed, this worker cannot complete and the slot will remain forever used."
)
self._could_not_evict_count += 1
return

Expand Down
4 changes: 2 additions & 2 deletions temporalio/worker/_workflow_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ def activate(
# have a different workflow/event-loop going.
if self._deleting and self._tasks:
raise RuntimeError(
f"Cache removal processed, but {len(self._tasks)} tasks remain. "
f"Eviction processed, but {len(self._tasks)} tasks remain. "
+ f"Stack traces below:\n\n{self._stack_trace()}"
)

Expand Down Expand Up @@ -1776,7 +1776,7 @@ async def _signal_external_workflow(

def _stack_trace(self) -> str:
stacks = []
for task in self._tasks:
for task in list(self._tasks):
# TODO(cretz): These stacks are not very clean currently
frames = []
for frame in task.get_stack():
Expand Down
2 changes: 1 addition & 1 deletion temporalio/worker/workflow_sandbox/_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,4 +173,4 @@ def _run_code(self, code: str, **extra_globals: Any) -> None:
finally:
temporalio.workflow.unsafe._set_in_sandbox(False)
for k, v in extra_globals.items():
del self.globals_and_locals[k]
self.globals_and_locals.pop(k, None)
2 changes: 1 addition & 1 deletion tests/helpers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ async def assert_eq_eventually(
await asyncio.sleep(interval.total_seconds())
assert (
expected == last_value
), "timed out waiting for equal, asserted against last value"
), f"timed out waiting for equal, asserted against last value of {last_value}"


async def worker_versioning_enabled(client: Client) -> bool:
Expand Down
62 changes: 61 additions & 1 deletion tests/worker/test_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2467,6 +2467,8 @@ async def test_workflow_deadlock(client: Client):
async with new_worker(
client, DeadlockedWorkflow, disable_safe_workflow_eviction=True
) as worker:
if worker._workflow_worker:
worker._workflow_worker._deadlock_timeout_seconds = 1
deadlock_thread_event.clear()
handle = await client.start_workflow(
DeadlockedWorkflow.run,
Expand All @@ -2488,7 +2490,7 @@ async def last_history_task_failure() -> str:

try:
await assert_eq_eventually(
"[TMPRL1101] Potential deadlock detected, workflow didn't yield within 2 second(s)",
"[TMPRL1101] Potential deadlock detected, workflow didn't yield within 1 second(s)",
last_history_task_failure,
timeout=timedelta(seconds=5),
interval=timedelta(seconds=1),
Expand All @@ -2497,6 +2499,64 @@ async def last_history_task_failure() -> str:
deadlock_thread_event.set()


@workflow.defn
class EvictionDeadlockWorkflow:
def __init__(self) -> None:
self.val = 1

async def wait_until_positive(self):
while True:
await workflow.wait_condition(lambda: self.val > 0)
self.val = -self.val

async def wait_until_negative(self):
while True:
await workflow.wait_condition(lambda: self.val < 0)
self.val = -self.val

@workflow.run
async def run(self):
await asyncio.gather(self.wait_until_negative(), self.wait_until_positive())


async def test_workflow_eviction_deadlock(client: Client):
# We are running the worker, but we can't ever shut it down on eviction
# error so we send shutdown in the background and leave this worker dangling
worker = new_worker(client, EvictionDeadlockWorkflow)
if worker._workflow_worker:
worker._workflow_worker._deadlock_timeout_seconds = 1
worker_task = asyncio.create_task(worker.run())

# Run workflow that deadlocks
handle = await client.start_workflow(
EvictionDeadlockWorkflow.run,
id=f"workflow-{uuid.uuid4()}",
task_queue=worker.task_queue,
)

async def last_history_task_failure() -> str:
resp = await client.workflow_service.get_workflow_execution_history(
GetWorkflowExecutionHistoryRequest(
namespace=client.namespace,
execution=WorkflowExecution(workflow_id=handle.id),
),
)
for event in reversed(resp.history.events):
if event.event_type == EventType.EVENT_TYPE_WORKFLOW_TASK_FAILED:
return event.workflow_task_failed_event_attributes.failure.message
return "<no failure>"

await assert_eq_eventually(
"[TMPRL1101] Potential deadlock detected, workflow didn't yield within 1 second(s)",
last_history_task_failure,
timeout=timedelta(seconds=5),
interval=timedelta(seconds=1),
)

# Send cancel but don't wait
worker_task.cancel()


class PatchWorkflowBase:
def __init__(self) -> None:
self._result = "<unset>"
Expand Down

0 comments on commit afadc15

Please sign in to comment.