Skip to content

Commit

Permalink
fix: Detect hanging in PENDING_REBOOT state
Browse files Browse the repository at this point in the history
If a reboot was requested but has not happened within a pre-defined
number of waiting iterations, we need to fail the deployment and
resume normal operation.

This also means we need to tell the scheduler to keep running the work
function even while it is just waiting for a reboot and has nothing
else to do, so that it can check whether it has been waiting for
too long.

Ticket: MEN-7555
Changelog: none
Signed-off-by: Vratislav Podzimek <[email protected]>
  • Loading branch information
vpodzime committed Nov 28, 2024
1 parent 50fdd39 commit 830dc33
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 3 deletions.
1 change: 1 addition & 0 deletions cmake/CMake_defaults.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@ set(CONFIG_MENDER_PROVIDES_DEPENDS ON CACHE BOOL "Provides depends")
set(CONFIG_MENDER_COMMIT_REQUIRE_AUTH ON CACHE BOOL "Authentication required for update commit")
set(CONFIG_MENDER_ALL_WARNINGS_AS_ERRORS ON CACHE BOOL "All warnings as errors")
set(CONFIG_MENDER_ERRORS_THRESHOLD_NET 10 CACHE STRING "Network errors threshold")
set(CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT 5 CACHE STRING "Reboot errors threshold")
50 changes: 47 additions & 3 deletions core/src/mender-client.c
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,13 @@ static mender_err_t mender_client_update_work_function(void);
*/
static mender_err_t mender_client_publish_deployment_status(const char *id, mender_deployment_status_t deployment_status);

/**
* @brief Set state in deployment data and store it in permanent storage
* @param state State to set and store
* @return MENDER_OK in case of success, error code otherwise
*/
static mender_err_t set_and_store_state(const mender_update_state_t state);

char *
mender_client_version(void) {

Expand Down Expand Up @@ -440,23 +447,60 @@ mender_client_work_function(void) {
switch (mender_client_state) {
case MENDER_CLIENT_STATE_PENDING_REBOOT:
mender_log_info("Waiting for a reboot");
/* nothing to do */
return MENDER_DONE;
if (MENDER_OK != mender_err_count_reboot_inc()) {
/* It appears we are stuck in this state. The only thing we can do is to mark the
deployment as failed and revert to normal operation. */
mender_log_error("Waiting for reboot for too long, giving up");

if (NULL == mender_client_deployment_data) {
mender_log_error("No deployment data to use for deployment abortion");
} else {
mender_update_state_t update_state;
if (MENDER_OK != mender_deployment_data_get_state(mender_client_deployment_data, &update_state)) {
mender_log_error("Failed to get current update state, going to ROLLBACK state");
update_state = MENDER_UPDATE_STATE_ROLLBACK;
} else {
update_state = update_state_transitions[update_state].failure;
}
if (MENDER_OK != set_and_store_state(update_state)) {
mender_log_error("Failed to save new state");
}
}

mender_client_state = MENDER_CLIENT_STATE_OPERATIONAL;
}
/* else:
Nothing to do, but let's make sure we have a chance to detect we are stuck in this
state (i.e. MENDER_OK, not MENDER_DONE which would tell the scheduler we are
done and don't need to run again). */
return MENDER_OK;
case MENDER_CLIENT_STATE_INITIALIZATION:
/* Perform initialization of the client */
mender_err_count_reboot_reset();
if (MENDER_DONE != mender_client_initialization_work_function()) {
return MENDER_FAIL;
}
mender_client_state = MENDER_CLIENT_STATE_OPERATIONAL;
/* fallthrough */
case MENDER_CLIENT_STATE_OPERATIONAL:
if (MENDER_FAIL == (ret = mender_client_update_work_function())) {
mender_err_count_reboot_reset();
ret = mender_client_update_work_function();
if (MENDER_FAIL == ret) {
if (MENDER_FAIL == mender_err_count_net_check()) {
/* Try to release network so that it gets set up again next
time. */
mender_client_network_release();
}
}
if (MENDER_DONE == ret) {
/* We should only be done when waiting for a reboot. */
assert(MENDER_CLIENT_STATE_PENDING_REBOOT == mender_client_state);

/* We don't want to tell the scheduler we are done because
otherwise we won't have a chance to detect that we are
waiting for a reboot forever. */
ret = MENDER_OK;
}
return ret;
}

Expand Down
25 changes: 25 additions & 0 deletions core/src/mender-error-counters.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,28 @@ mender_err_count_net_reset(void) {
return MENDER_OK;
}
#endif /* CONFIG_MENDER_ERRORS_THRESHOLD_NET > 0 */

#if CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT > 0

static uint8_t reboot_errors = 0;
#if CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT > UINT8_MAX
#error "CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT must be <= UINT8_MAX"
#endif

/**
 * @brief Increment the pending reboot counter
 * @note The counter saturates at CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT, so it cannot
 *       overflow its uint8_t storage even when the threshold equals UINT8_MAX.
 * @return MENDER_OK for the first CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT calls since the
 *         last reset, MENDER_FAIL for every call after that
 */
mender_err_t
mender_err_count_reboot_inc(void) {
    /* Check before incrementing: comparing a counter saturated at UINT8_MAX with
       '> threshold' could never report a failure for a threshold of UINT8_MAX. */
    if (reboot_errors >= CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT) {
        return MENDER_FAIL;
    }
    reboot_errors++;
    return MENDER_OK;
}

/**
 * @brief Reset the pending reboot counter
 * @return MENDER_OK always
 */
mender_err_t
mender_err_count_reboot_reset(void) {
    /* Forget all waiting iterations counted so far */
    reboot_errors = 0U;
    return MENDER_OK;
}
#endif /* CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT > 0 */
32 changes: 32 additions & 0 deletions include/mender-error-counters.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,38 @@ mender_err_count_net_reset(void) {

#endif /* CONFIG_MENDER_ERRORS_THRESHOLD_NET > 0 */

#ifndef CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT
#define CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT 0
#endif

#if CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT > 0

/**
* @brief Increment the pending reboot counter
* @return MENDER_OK if not too many errors, MENDER_FAIL if too many errors
*/
mender_err_t mender_err_count_reboot_inc(void);

/**
* @brief Reset the pending reboot counter
* @return MENDER_OK if successful, error otherwise
*/
mender_err_t mender_err_count_reboot_reset(void);

#else

/* Define the functions as inline noops so that the compiler can simply rule them out. */
/**
 * @brief No-op variant used when CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT is not greater than 0
 * @note Declared static inline: a plain inline definition in a header provides no
 *       external definition, so a call that is not inlined would fail to link.
 * @return MENDER_OK always
 */
static inline mender_err_t
mender_err_count_reboot_inc(void) {
    return MENDER_OK;
}
/**
 * @brief No-op variant used when CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT is not greater than 0
 * @note Declared static inline: a plain inline definition in a header provides no
 *       external definition, so a call that is not inlined would fail to link.
 * @return MENDER_OK always
 */
static inline mender_err_t
mender_err_count_reboot_reset(void) {
    return MENDER_OK;
}

#endif /* CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT > 0 */

#ifdef __cplusplus
}
#endif /* __cplusplus */
Expand Down
3 changes: 3 additions & 0 deletions target/posix/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ endif()
if (CONFIG_MENDER_ERRORS_THRESHOLD_NET)
target_compile_definitions(mender-mcu-client PUBLIC CONFIG_MENDER_ERRORS_THRESHOLD_NET=${CONFIG_MENDER_ERRORS_THRESHOLD_NET})
endif()
if (CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT)
target_compile_definitions(mender-mcu-client PUBLIC CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT=${CONFIG_MENDER_ERRORS_THRESHOLD_REBOOT})
endif()

find_package(PkgConfig REQUIRED)

Expand Down
15 changes: 15 additions & 0 deletions target/zephyr/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,21 @@ if MENDER_MCU_CLIENT
help
The number of errors triggering network reset.

config MENDER_DETECT_REBOOT_ERRORS
bool "Try to detect reboot errors and revert to normal operations instead of waiting for reboot forever"
default y
help
Whether Mender should try to detect it is waiting for too long after requesting a reboot.
Then it can mark the deployment (update) as failed and revert to normal operation, waiting for the next one.

config MENDER_ERRORS_THRESHOLD_REBOOT
int "Reboot waiting iterations threshold"
range 1 255
default 5
depends on MENDER_DETECT_REBOOT_ERRORS
help
The number of iterations to wait for a reboot.

endmenu

endmenu
Expand Down

0 comments on commit 830dc33

Please sign in to comment.