You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Describe the bug
It looks like a DNS failure lasting roughly 10 minutes can cause a system failure while long-running training jobs are in progress. For example, it led to the following error:
train loss:-1 | 0.00% of 64x1... rate=0 Hz, eta=?, total=0:00:00
train loss:0.2479 | 100.00% of 64x1... rate=5.98 Hz, eta=0:00:00, total=0:00:10
train loss:0.2479 | 100.00% of 64x1... rate=5.98 Hz, eta=0:00:00, total=0:00:10
OperationalError: [Errno -3] Temporary failure in name resolution
File "/opt/dive/local/venv/lib/python3.8/site-packages/celery/app/trace.py", line 451, in trace_task
R = retval = fun(*args, **kwargs)
File "/opt/dive/local/venv/lib/python3.8/site-packages/girder_worker/task.py", line 154, in __call__
results = super(Task, self).__call__(*_t_args, **_t_kwargs)
File "/opt/dive/local/venv/lib/python3.8/site-packages/celery/app/trace.py", line 734, in protected_call
return self.run(*args, **kwargs)
File "/opt/dive/src/dive_tasks/tasks.py", line 371, in train_pipeline
utils.stream_subprocess(self, context, manager, popen_kwargs)
File "/opt/dive/src/dive_tasks/utils.py", line 101, in stream_subprocess
if check_canceled(task, context, force=False):
File "/opt/dive/src/dive_tasks/utils.py", line 54, in check_canceled
return task.canceled
File "/opt/dive/local/venv/lib/python3.8/site-packages/girder_worker/task.py", line 122, in canceled
return is_revoked(self)
File "/opt/dive/local/venv/lib/python3.8/site-packages/girder_worker/utils.py", line 118, in is_revoked
return task.request.id in _revoked_tasks(task)
File "/opt/dive/local/venv/lib/python3.8/site-packages/girder_worker/utils.py", line 60, in _revoked_tasks
_revoked = _worker_inspector(task).revoked()
File "/opt/dive/local/venv/lib/python3.8/site-packages/celery/app/control.py", line 254, in revoked
return self._request('revoked')
File "/opt/dive/local/venv/lib/python3.8/site-packages/celery/app/control.py", line 106, in _request
return self._prepare(self.app.control.broadcast(
File "/opt/dive/local/venv/lib/python3.8/site-packages/celery/app/control.py", line 741, in broadcast
return self.mailbox(conn)._broadcast(
File "/opt/dive/local/venv/lib/python3.8/site-packages/kombu/pidbox.py", line 328, in _broadcast
chan = channel or self.connection.default_channel
File "/opt/dive/local/venv/lib/python3.8/site-packages/kombu/connection.py", line 895, in default_channel
self._ensure_connection(**conn_opts)
File "/opt/dive/local/venv/lib/python3.8/site-packages/kombu/connection.py", line 433, in _ensure_connection
return retry_over_time(
File "/usr/lib/python3.8/contextlib.py", line 131, in __exit__
self.gen.throw(type, value, traceback)
File "/opt/dive/local/venv/lib/python3.8/site-packages/kombu/connection.py", line 450, in _reraise_as_library_errors
raise ConnectionError(str(exc)) from exc
Invalid state transition to '2', Current state is '4'.
Invalid state transition to '820', Current state is '4'.
StateTransitionException: ("Invalid state transition to '820', Current state is '4'.", HTTPError('400 Client Error: Bad Request for url: https://viame.kitware.com/api/v1/job/64b072770e4d81c00e1b7068'))
File "/opt/dive/local/venv/lib/python3.8/site-packages/celery/app/trace.py", line 451, in trace_task
R = retval = fun(*args, **kwargs)
File "/opt/dive/local/venv/lib/python3.8/site-packages/girder_worker/task.py", line 154, in __call__
results = super(Task, self).__call__(*_t_args, **_t_kwargs)
File "/opt/dive/local/venv/lib/python3.8/site-packages/celery/app/trace.py", line 734, in protected_call
return self.run(*args, **kwargs)
File "/opt/dive/src/dive_tasks/tasks.py", line 295, in train_pipeline
manager.updateStatus(JobStatus.FETCHING_INPUT)
File "/opt/dive/local/venv/lib/python3.8/site-packages/girder_worker/utils.py", line 315, in updateStatus
raise StateTransitionException(json_response['message'], hex)
The text was updated successfully, but these errors were encountered:
Describe the bug
It looks like a DNS failure lasting roughly 10 minutes can cause a system failure while long-running training jobs are in progress. For example, it led to the following error:
INFO: ACC FOR 'vali': 1.0
INFO: === finish epoch 19 / 100 : viame-frame-classifier ===
epoch lr:5e-05 | vloss: 0.0000 (n_bad=11, best=0.0000) 19/100... rate=0.07 Hz, eta=0:18:34, total=0:04:21, wall=2023-07-13 22:00 UTC
train loss:-1 | 0.00% of 64x1... rate=0 Hz, eta=?, total=0:00:00
train loss:0.2479 | 100.00% of 64x1... rate=5.98 Hz, eta=0:00:00, total=0:00:10
train loss:0.2479 | 100.00% of 64x1... rate=5.98 Hz, eta=0:00:00, total=0:00:10
OperationalError: [Errno -3] Temporary failure in name resolution
File "/opt/dive/local/venv/lib/python3.8/site-packages/celery/app/trace.py", line 451, in trace_task
R = retval = fun(*args, **kwargs)
File "/opt/dive/local/venv/lib/python3.8/site-packages/girder_worker/task.py", line 154, in __call__
results = super(Task, self).__call__(*_t_args, **_t_kwargs)
File "/opt/dive/local/venv/lib/python3.8/site-packages/celery/app/trace.py", line 734, in protected_call
return self.run(*args, **kwargs)
File "/opt/dive/src/dive_tasks/tasks.py", line 371, in train_pipeline
utils.stream_subprocess(self, context, manager, popen_kwargs)
File "/opt/dive/src/dive_tasks/utils.py", line 101, in stream_subprocess
if check_canceled(task, context, force=False):
File "/opt/dive/src/dive_tasks/utils.py", line 54, in check_canceled
return task.canceled
File "/opt/dive/local/venv/lib/python3.8/site-packages/girder_worker/task.py", line 122, in canceled
return is_revoked(self)
File "/opt/dive/local/venv/lib/python3.8/site-packages/girder_worker/utils.py", line 118, in is_revoked
return task.request.id in _revoked_tasks(task)
File "/opt/dive/local/venv/lib/python3.8/site-packages/girder_worker/utils.py", line 60, in _revoked_tasks
_revoked = _worker_inspector(task).revoked()
File "/opt/dive/local/venv/lib/python3.8/site-packages/celery/app/control.py", line 254, in revoked
return self._request('revoked')
File "/opt/dive/local/venv/lib/python3.8/site-packages/celery/app/control.py", line 106, in _request
return self._prepare(self.app.control.broadcast(
File "/opt/dive/local/venv/lib/python3.8/site-packages/celery/app/control.py", line 741, in broadcast
return self.mailbox(conn)._broadcast(
File "/opt/dive/local/venv/lib/python3.8/site-packages/kombu/pidbox.py", line 328, in _broadcast
chan = channel or self.connection.default_channel
File "/opt/dive/local/venv/lib/python3.8/site-packages/kombu/connection.py", line 895, in default_channel
self._ensure_connection(**conn_opts)
File "/opt/dive/local/venv/lib/python3.8/site-packages/kombu/connection.py", line 433, in _ensure_connection
return retry_over_time(
File "/usr/lib/python3.8/contextlib.py", line 131, in __exit__
self.gen.throw(type, value, traceback)
File "/opt/dive/local/venv/lib/python3.8/site-packages/kombu/connection.py", line 450, in _reraise_as_library_errors
raise ConnectionError(str(exc)) from exc
Invalid state transition to '2', Current state is '4'.
Invalid state transition to '820', Current state is '4'.
StateTransitionException: ("Invalid state transition to '820', Current state is '4'.", HTTPError('400 Client Error: Bad Request for url: https://viame.kitware.com/api/v1/job/64b072770e4d81c00e1b7068'))
File "/opt/dive/local/venv/lib/python3.8/site-packages/celery/app/trace.py", line 451, in trace_task
R = retval = fun(*args, **kwargs)
File "/opt/dive/local/venv/lib/python3.8/site-packages/girder_worker/task.py", line 154, in __call__
results = super(Task, self).__call__(*_t_args, **_t_kwargs)
File "/opt/dive/local/venv/lib/python3.8/site-packages/celery/app/trace.py", line 734, in protected_call
return self.run(*args, **kwargs)
File "/opt/dive/src/dive_tasks/tasks.py", line 295, in train_pipeline
manager.updateStatus(JobStatus.FETCHING_INPUT)
File "/opt/dive/local/venv/lib/python3.8/site-packages/girder_worker/utils.py", line 315, in updateStatus
raise StateTransitionException(json_response['message'], hex)
The text was updated successfully, but these errors were encountered: