Add enum for trainer actions
The enum is used for validation checks before triggering one of the trainer actions. Previously I was checking whether the command queue was alive, but that isn't enough: if you request resume while training is already resumed, for example, the queue is operational, yet the action shouldn't be valid.
thodkatz committed Dec 10, 2024
1 parent 5cb34c0 commit e4b53d5
Showing 4 changed files with 61 additions and 34 deletions.
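In short: a requested action must be validated against the current trainer state, not against whether the command queue is alive. A minimal, self-contained sketch of the idea (the names mirror the diff below, but the snippet is illustrative rather than the tiktorch implementation):

```python
from enum import Enum


class TrainerState(Enum):
    IDLE = 0
    RUNNING = 1
    PAUSED = 2


class TrainerAction(Enum):
    START = "start"
    PAUSE = "pause"
    RESUME = "resume"


# Each action is only valid from certain source states. A live queue cannot
# express this: e.g. RESUME while already RUNNING must be rejected even
# though the queue is operational.
VALID_SOURCE_STATES = {
    TrainerAction.START: {TrainerState.IDLE},
    TrainerAction.PAUSE: {TrainerState.RUNNING},
    TrainerAction.RESUME: {TrainerState.PAUSED},
}


def check_action(current: TrainerState, action: TrainerAction) -> None:
    if current not in VALID_SOURCE_STATES[action]:
        raise RuntimeError(f"Invalid action {action} in state {current}")
```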
34 changes: 17 additions & 17 deletions tests/test_server/test_grpc/test_training_servicer.py
@@ -9,14 +9,14 @@
import numpy as np
import pytest

from tiktorch.converters import trainer_state_to_pb
from tiktorch.converters import pb_state_to_trainer, trainer_state_to_pb
from tiktorch.proto import training_pb2, training_pb2_grpc
from tiktorch.server.device_pool import TorchDevicePool
from tiktorch.server.grpc import training_servicer
from tiktorch.server.session.backend.base import TrainerSessionBackend
from tiktorch.server.session.process import TrainerSessionProcess
from tiktorch.server.session_manager import SessionManager
from tiktorch.trainer import Callbacks, ShouldStopCallbacks, Trainer, TrainerState
from tiktorch.trainer import ShouldStopCallbacks, Trainer, TrainerState


@pytest.fixture(scope="module")
@@ -57,8 +57,8 @@ def unet2d_config_path(checkpoint_dir, train_data_dir, val_data_path, device: st
resume: null
validate_after_iters: 2
log_after_iters: 2
max_num_epochs: 1
max_num_iterations: 1
max_num_epochs: 1000
max_num_iterations: 10000
eval_score_higher_is_better: True
optimizer:
learning_rate: 0.0002
@@ -164,7 +164,7 @@ def create_random_dataset(shape, channel_per_class):
l_shape = (2,) + l_shape

f.create_dataset("raw", data=np.random.rand(*shape))
f.create_dataset("label", data=np.random.randint(0, 1, l_shape))
f.create_dataset("label", data=np.random.randint(0, 2, l_shape, dtype=np.int64))
f.create_dataset("weight_map", data=np.random.rand(*w_shape))

return tmp.name
@@ -188,7 +188,7 @@ def assert_state(self, grpc_stub, training_session_id: str, state_to_check: Trai

def poll_for_state_grpc(self, grpc_stub, session_id, expected_state: TrainerState, timeout=3, poll_interval=0.1):
def get_status(*args):
return trainer_state_to_pb[grpc_stub.GetStatus(session_id).state]
return pb_state_to_trainer[grpc_stub.GetStatus(session_id).state]

self.poll_for_state(get_status, expected_state, timeout, poll_interval)

@@ -285,12 +285,12 @@ def test_concurrent_state_transitions(self, grpc_stub):
thread.join()

def test_queueing_multiple_commands(self, grpc_stub):
init_response = grpc_stub.Init(training_pb2.TrainingConfig(yaml_content=prepare_unet2d_test_environment()))
training_session_id = training_pb2.TrainingSessionId(id=init_response.id)

def assert_state(state_to_check):
self.assert_state(grpc_stub, training_session_id, state_to_check)

init_response = grpc_stub.Init(training_pb2.TrainingConfig(yaml_content=prepare_unet2d_test_environment()))
training_session_id = training_pb2.TrainingSessionId(id=init_response.id)

grpc_stub.Start(training_session_id)
assert_state(TrainerState.RUNNING)

@@ -330,12 +330,12 @@ def test_error_handling_on_invalid_state_transitions_before_training_started(sel
# Attempt to resume before start
with pytest.raises(grpc.RpcError) as excinfo:
grpc_stub.Resume(training_session_id)
assert "Training hasn't started" in excinfo.value.details()
assert "Invalid state transition: TrainerState.IDLE -> TrainerState.RUNNING" in excinfo.value.details()

# Attempt to pause before start
with pytest.raises(grpc.RpcError) as excinfo:
grpc_stub.Pause(training_session_id)
assert "Training hasn't started" in excinfo.value.details()
assert "Invalid state transition: TrainerState.IDLE -> TrainerState.PAUSED" in excinfo.value.details()

def test_start_training_without_init(self, grpc_stub):
"""
@@ -347,20 +347,20 @@ def test_start_training_without_init(self, grpc_stub):
assert "trainer-session with id doesn't exist" in excinfo.value.details()

def test_recover_training_failed(self):
class MockedExceptionTrainer(Trainer):
class MockedExceptionTrainer:
def __init__(self):
self.should_stop_callbacks = Callbacks()
self.should_stop_callbacks = ShouldStopCallbacks()

def fit(self):
raise Exception("mocked exception")

class MockedNominalTrainer(Trainer):
class MockedNominalTrainer:
def __init__(self):
self.num_epochs = 0
self.max_num_epochs = 10
self.num_iterations = 0
self.max_num_iterations = 100
self.should_stop_callbacks = Callbacks()
self.should_stop_callbacks = ShouldStopCallbacks()

def fit(self):
for epoch in range(self.max_num_epochs):
@@ -397,9 +397,9 @@ def assert_error(func, expected_message: str):
func()
assert expected_message in str(excinfo.value)

class MockedExceptionTrainer(Trainer):
class MockedExceptionTrainer:
def __init__(self):
self.should_stop_callbacks = Callbacks()
self.should_stop_callbacks = ShouldStopCallbacks()

def fit(self):
raise Exception("mocked exception")
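A note on the converter swap in the test helper above: GetStatus returns a protobuf state, so polling needs the protobuf-to-TrainerState direction. Presumably the two converters are a forward mapping and its inverse; a sketch under that assumption (the protobuf enum member names here are made up, the real definitions live in tiktorch.converters and training_pb2):

```python
# Hypothetical sketch of the converter pair used by the tests above.
trainer_state_to_pb = {
    TrainerState.IDLE: training_pb2.TrainerState.Idle,        # assumed pb names
    TrainerState.RUNNING: training_pb2.TrainerState.Running,  # assumed pb names
    TrainerState.PAUSED: training_pb2.TrainerState.Paused,    # assumed pb names
}
# The reverse lookup is simply the inverted dict.
pb_state_to_trainer = {pb: state for state, pb in trainer_state_to_pb.items()}
```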
8 changes: 4 additions & 4 deletions tiktorch/server/session/backend/commands.py
@@ -8,7 +8,7 @@
from dataclasses import dataclass, field
from typing import Generic, Type, TypeVar

from tiktorch.trainer import TrainerState
from tiktorch.trainer import TrainerAction, TrainerState

if typing.TYPE_CHECKING:
from tiktorch.server.session.backend.supervisor import BioModelSupervisor, Supervisors, TrainerSupervisor

@@ -110,17 +110,17 @@ def execute(self, ctx: Context[TrainerSupervisor]) -> None:

class SetStartStateTrainingCmd(ICommand):
def execute(self, ctx: Context[TrainerSupervisor]) -> None:
ctx.session.transition_to_state(new_state=TrainerState.RUNNING, valid_states={TrainerState.IDLE})
ctx.session.transition_to_state(new_state=TrainerState.RUNNING, trainer_action=TrainerAction.START)


class SetPauseStateTrainingCmd(ICommand):
def execute(self, ctx: Context[TrainerSupervisor]) -> None:
ctx.session.transition_to_state(new_state=TrainerState.PAUSED, valid_states={TrainerState.RUNNING})
ctx.session.transition_to_state(new_state=TrainerState.PAUSED, trainer_action=TrainerAction.PAUSE)



class SetResumeStateTrainingCmd(ICommand):
def execute(self, ctx: Context[TrainerSupervisor]) -> None:
ctx.session.transition_to_state(new_state=TrainerState.RUNNING, valid_states={TrainerState.PAUSED})
ctx.session.transition_to_state(new_state=TrainerState.RUNNING, trainer_action=TrainerAction.RESUME)



class ShutdownCmd(ICommand):
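Design note: the three Set*StateTrainingCmd classes differ only in the (new_state, trainer_action) pair they forward. A sketch of a single parametrized command, assuming ICommand's constructor takes no required arguments and the imports already present in commands.py (illustrative, not part of this commit):

```python
class SetStateTrainingCmd(ICommand):
    """Hypothetical single command covering start/pause/resume."""

    def __init__(self, new_state: TrainerState, trainer_action: TrainerAction):
        super().__init__()  # assumes ICommand.__init__ needs no arguments
        self._new_state = new_state
        self._trainer_action = trainer_action

    def execute(self, ctx: Context[TrainerSupervisor]) -> None:
        ctx.session.transition_to_state(
            new_state=self._new_state, trainer_action=self._trainer_action
        )


# Usage mirroring the existing commands:
start_cmd = SetStateTrainingCmd(TrainerState.RUNNING, TrainerAction.START)
pause_cmd = SetStateTrainingCmd(TrainerState.PAUSED, TrainerAction.PAUSE)
```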
32 changes: 21 additions & 11 deletions tiktorch/server/session/backend/supervisor.py
@@ -6,7 +6,7 @@

from tiktorch.server.session.backend import commands
from tiktorch.server.session.backend.commands import CommandPriorityQueueUtils, ShutdownWithTeardownCmd
from tiktorch.trainer import BaseCallbacks, ErrorCallbacks, Trainer, TrainerState
from tiktorch.trainer import BaseCallbacks, ErrorCallbacks, Trainer, TrainerAction, TrainerState

logger = logging.getLogger(__name__)

@@ -57,10 +57,7 @@ def get_state(self) -> TrainerState:
return self._state

def start(self):
if self._state != TrainerState.IDLE:
raise StateTransitionError(
current_state=self._state, transitioning_state=TrainerState.RUNNING, valid_states={TrainerState.IDLE}
)
self._check_transition_to_start()
self._session_thread.start()
self._pause_triggered = False
start_cmd = commands.SetStartStateTrainingCmd()
@@ -98,23 +95,23 @@ def is_training_finished(self):
return (
self._trainer.num_epochs == self._trainer.max_num_epochs
or self._trainer.num_iterations == self._trainer.max_num_iterations
)
) or self._trainer.should_stop_model_criteria()

def _get_num_iterations_epochs(self) -> str:
iterations = f"Iterations[{self._trainer.num_iterations}/{self._trainer.max_num_iterations}]"
epochs = f"Epochs[{self._trainer.num_epochs}/{self._trainer.max_num_epochs}]"
return f"{iterations}, {epochs}"

@requires_queue_alive
def resume(self):
self._check_transition_to_resume()
self._pause_triggered = False
resume_cmd = commands.SetResumeStateTrainingCmd()
self._command_queue_utils.send_command(resume_cmd.awaitable)
resume_cmd.awaitable.wait() # make sure that the state has actually changed (acknowledge)
logger.info(f"Resume training: {self._get_num_iterations_epochs()}")


@requires_queue_alive
def pause(self):
self._check_transition_to_pause()
self._pause_triggered = True
pause_cmd = commands.SetPauseStateTrainingCmd()
self._command_queue_utils.send_command(pause_cmd.awaitable)
@@ -128,7 +125,6 @@ def shutdown(self):
self._command_queue_utils.send_command(commands.ShutdownCmd())
self._session_thread.join()

@requires_queue_alive
def forward(self, input_tensors):
self.pause()
self._trainer.forward(input_tensors)
@@ -143,14 +139,28 @@ def export(self):
def _should_stop(self):
return self._pause_triggered


def transition_to_state(self, new_state: TrainerState, valid_states: Set[TrainerState]):
def transition_to_state(self, new_state: TrainerState, trainer_action: TrainerAction):
"""
Should only be invoked via the ICommand implementations, so that state changes go through the command queue
"""
self._check_transition_to_state(new_state, valid_states)
if trainer_action == TrainerAction.START:
self._check_transition_to_start()
elif trainer_action == TrainerAction.PAUSE:
self._check_transition_to_pause()
elif trainer_action == TrainerAction.RESUME:
self._check_transition_to_resume()

logger.info(f"State transition: {self._state} -> {new_state}")
self._state = new_state

def _check_transition_to_start(self):
return self._check_transition_to_state(TrainerState.RUNNING, {TrainerState.IDLE})

def _check_transition_to_pause(self):
return self._check_transition_to_state(TrainerState.PAUSED, {TrainerState.RUNNING})

def _check_transition_to_resume(self):
return self._check_transition_to_state(TrainerState.RUNNING, {TrainerState.PAUSED})


def _check_transition_to_state(self, new_state: TrainerState, valid_states: Set[TrainerState]):
if self._state not in valid_states:
raise StateTransitionError(
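The if/elif dispatch in transition_to_state plus the three _check_transition_to_* helpers encode a small transition table: each action has a target state and a set of states it may be triggered from. The same rules written as data, as a sketch (the commit keeps the explicit helpers, which start(), pause(), and resume() also call directly):

```python
# (target state, valid source states) per action; the values are taken from
# the checks added in this commit, but the table itself is illustrative.
TRANSITIONS = {
    TrainerAction.START: (TrainerState.RUNNING, {TrainerState.IDLE}),
    TrainerAction.PAUSE: (TrainerState.PAUSED, {TrainerState.RUNNING}),
    TrainerAction.RESUME: (TrainerState.RUNNING, {TrainerState.PAUSED}),
}


def transition_to_state(self, new_state: TrainerState, trainer_action: TrainerAction):
    _, valid_states = TRANSITIONS[trainer_action]
    self._check_transition_to_state(new_state, valid_states)
    logger.info(f"State transition: {self._state} -> {new_state}")
    self._state = new_state
```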
21 changes: 19 additions & 2 deletions tiktorch/trainer.py
@@ -77,6 +77,13 @@ def __str__(self):
LogsCallbacks = Callbacks[Callable[[Logs], None]]


class TrainerAction(Enum):
START = "start"
PAUSE = "pause"
RESUME = "resume"
SHUTDOWN = "shutdown"


class TrainerState(Enum):
IDLE = 0
RUNNING = 1
@@ -148,8 +155,18 @@ def forward(self, input_tensors):
with torch.no_grad():
self.model(input_tensors)


def should_stop(self):
return self.should_stop_callbacks() or super().should_stop()
def should_stop(self) -> bool:
"""
Combine externally registered stop requests with the model's own stopping criteria.
"""
return self.should_stop_callbacks() or self.should_stop_model_criteria()


def should_stop_model_criteria(self) -> bool:
"""
Retain the stopping logic of the underlying model trainer,
e.g. stop once the learning rate falls below a threshold.
"""
return super().should_stop()

def _log_stats(self, phase, loss_avg, eval_score_avg):
logs = Logs(

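The should_stop split above gives the training loop two independent stop sources: externally registered callbacks (this is how the supervisor's pause request reaches the loop) and the stopping criteria of the underlying model trainer. A self-contained sketch of the pattern (class and attribute names are illustrative, not the tiktorch implementation):

```python
from typing import Callable, List


class StopSignal:
    """Stand-in for ShouldStopCallbacks: stop if any callback says so."""

    def __init__(self) -> None:
        self._callbacks: List[Callable[[], bool]] = []

    def register(self, cb: Callable[[], bool]) -> None:
        self._callbacks.append(cb)

    def __call__(self) -> bool:
        return any(cb() for cb in self._callbacks)


class SketchTrainer:
    def __init__(self) -> None:
        self.should_stop_callbacks = StopSignal()
        self.learning_rate = 1e-4
        self.min_learning_rate = 1e-6

    def should_stop_model_criteria(self) -> bool:
        # e.g. the wrapped trainer stops once the learning rate decays too far
        return self.learning_rate < self.min_learning_rate

    def should_stop(self) -> bool:
        # external requests (e.g. pause) act alongside the model's criteria
        return self.should_stop_callbacks() or self.should_stop_model_criteria()


# e.g. a supervisor wires its pause flag in as a callback:
pause_triggered = False
trainer = SketchTrainer()
trainer.should_stop_callbacks.register(lambda: pause_triggered)
```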
