Skip to content

Commit

Permalink
Support graceful shutdown (#422)
Browse files Browse the repository at this point in the history
* implement graceful shutdown

* regenerate charts

* add termination perdiod to smoke test

* add docs

* adapt changelog

* fix import order

* reorganize imports

* improve commands

* improve imports
  • Loading branch information
maltesander authored Nov 16, 2023
1 parent fbcfa95 commit 74d6910
Show file tree
Hide file tree
Showing 10 changed files with 120 additions and 24 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
- Default stackableVersion to operator version ([#390]).
- Support PodDisruptionBudgets ([#407]).
- Added support for versions 2.1.1, 3.0.1 ([#415]).
- Support graceful shutdown ([#422]).

### Changed

Expand All @@ -29,6 +30,7 @@
[#396]: https://github.com/stackabletech/superset-operator/pull/396
[#407]: https://github.com/stackabletech/superset-operator/pull/407
[#415]: https://github.com/stackabletech/superset-operator/pull/415
[#422]: https://github.com/stackabletech/superset-operator/pull/422

## [23.7.0] - 2023-07-14

Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ clap = "4.3"
fnv = "1.0"
futures = { version = "0.3", features = ["compat"] }
indoc = "2.0"
product-config = { git = "https://github.com/stackabletech/product-config.git", tag = "0.6.0" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
serde_yaml = "0.9"
snafu = "0.7"
stackable-operator = { git = "https://github.com/stackabletech/operator-rs.git", tag = "0.56.0" }
product-config = { git = "https://github.com/stackabletech/product-config.git", tag = "0.6.0" }
strum = { version = "0.25", features = ["derive"] }
tokio = { version = "1.29", features = ["full"] }
tracing = "0.1"
Expand Down
8 changes: 8 additions & 0 deletions deploy/helm/superset-operator/crds/crds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,10 @@ spec:
type: array
type: object
type: object
gracefulShutdownTimeout:
description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details.
nullable: true
type: string
logging:
default:
enableVectorAgent: null
Expand Down Expand Up @@ -4079,6 +4083,10 @@ spec:
type: array
type: object
type: object
gracefulShutdownTimeout:
description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details.
nullable: true
type: string
logging:
default:
enableVectorAgent: null
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,20 @@
= Graceful shutdown

Graceful shutdown of Superset nodes is either not supported by the product itself
or we have not implemented it yet.
You can configure the graceful shutdown as described in xref:concepts:operations/graceful_shutdown.adoc[].

Outstanding implementation work for the graceful shutdowns of all products where this functionality is relevant is tracked in
https://github.com/stackabletech/issues/issues/357
== Nodes

As a default, Superset nodes have `2 minutes` to shut down gracefully.

The Superset node process will receive a `SIGTERM` signal when Kubernetes wants to terminate the Pod.
It will log the received signal as shown in the log below and initiate a graceful shutdown.
After the graceful shutdown timeout runs out, and the process still didn't exit, Kubernetes will issue a `SIGKILL` signal.

[source,text]
----
superset [2023-11-08 13:14:39 +0000] [206] [INFO] Handling signal: term
metrics ts=2023-11-08T13:14:39.818Z caller=main.go:553 level=info msg="Received os signal, exiting" signal=terminated
superset [2023-11-08 13:14:39 +0000] [207] [INFO] Worker exiting (pid: 207)
superset Loaded your LOCAL configuration at [/stackable/app/pythonpath/superset_config.py]
superset [2023-11-08 13:14:40 +0000] [206] [INFO] Shutting down: Master
----
14 changes: 11 additions & 3 deletions rust/crd/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use stackable_operator::{
role_utils::{GenericRoleConfig, Role, RoleGroup, RoleGroupRef},
schemars::{self, JsonSchema},
status::condition::{ClusterCondition, HasStatusCondition},
time::Duration,
};
use strum::{Display, EnumIter, EnumString, IntoEnumIterator};

Expand All @@ -32,16 +33,18 @@ pub mod authentication;
pub mod druidconnection;

pub const APP_NAME: &str = "superset";
pub const CONFIG_DIR: &str = "/stackable/config";
pub const LOG_CONFIG_DIR: &str = "/stackable/log_config";
pub const LOG_DIR: &str = "/stackable/log";
pub const STACKABLE_CONFIG_DIR: &str = "/stackable/config";
pub const STACKABLE_LOG_CONFIG_DIR: &str = "/stackable/log_config";
pub const STACKABLE_LOG_DIR: &str = "/stackable/log";
pub const PYTHONPATH: &str = "/stackable/app/pythonpath";
pub const SUPERSET_CONFIG_FILENAME: &str = "superset_config.py";
pub const MAX_LOG_FILES_SIZE: MemoryQuantity = MemoryQuantity {
value: 10.0,
unit: BinaryMultiple::Mebi,
};

const DEFAULT_NODE_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = Duration::from_minutes_unchecked(2);

#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("unknown Superset role found {role}. Should be one of {roles:?}"))]
Expand Down Expand Up @@ -313,6 +316,10 @@ pub struct SupersetConfig {

#[fragment_attrs(serde(default))]
pub affinity: StackableAffinity,

/// Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details.
#[fragment_attrs(serde(default))]
pub graceful_shutdown_timeout: Option<Duration>,
}

impl SupersetConfig {
Expand All @@ -334,6 +341,7 @@ impl SupersetConfig {
},
logging: product_logging::spec::default_logging(),
affinity: get_affinity(cluster_name, role),
graceful_shutdown_timeout: Some(DEFAULT_NODE_GRACEFUL_SHUTDOWN_TIMEOUT),
..Default::default()
}
}
Expand Down
26 changes: 26 additions & 0 deletions rust/operator-binary/src/operations/graceful_shutdown.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
use snafu::{ResultExt, Snafu};
use stackable_operator::builder::PodBuilder;
use stackable_superset_crd::SupersetConfig;

#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("Failed to set terminationGracePeriod"))]
SetTerminationGracePeriod {
source: stackable_operator::builder::pod::Error,
},
}

pub fn add_graceful_shutdown_config(
merged_config: &SupersetConfig,
pod_builder: &mut PodBuilder,
) -> Result<(), Error> {
// This must be always set by the merge mechanism, as we provide a default value,
// users can not disable graceful shutdown.
if let Some(graceful_shutdown_timeout) = merged_config.graceful_shutdown_timeout {
pod_builder
.termination_grace_period(&graceful_shutdown_timeout)
.context(SetTerminationGracePeriodSnafu)?;
}

Ok(())
}
1 change: 1 addition & 0 deletions rust/operator-binary/src/operations/mod.rs
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
pub mod graceful_shutdown;
pub mod pdb;
4 changes: 2 additions & 2 deletions rust/operator-binary/src/product_logging.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use stackable_operator::{
},
role_utils::RoleGroupRef,
};
use stackable_superset_crd::LOG_DIR;
use stackable_superset_crd::STACKABLE_LOG_DIR;

#[derive(Snafu, Debug)]
pub enum Error {
Expand Down Expand Up @@ -91,7 +91,7 @@ where
choice: Some(ContainerLogConfigChoice::Automatic(log_config)),
}) = logging.containers.get(main_container)
{
let log_dir = format!("{LOG_DIR}/{main_container}");
let log_dir = format!("{STACKABLE_LOG_DIR}/{main_container}");
cm_builder.add_data(
LOG_CONFIG_FILE,
create_superset_config(log_config, &log_dir),
Expand Down
62 changes: 48 additions & 14 deletions rust/operator-binary/src/superset_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,25 +34,31 @@ use stackable_operator::{
labels::{role_group_selector_labels, role_selector_labels},
logging::controller::ReconcilerError,
product_config_utils::{transform_all_roles_to_config, validate_all_roles_and_groups_config},
product_logging::{self, spec::Logging},
product_logging::{
self,
framework::{create_vector_shutdown_file_command, remove_vector_shutdown_file_command},
spec::Logging,
},
role_utils::{GenericRoleConfig, RoleGroupRef},
status::condition::{
compute_conditions, operations::ClusterOperationsConditionBuilder,
statefulset::StatefulSetConditionBuilder,
},
time::Duration,
utils::COMMON_BASH_TRAP_FUNCTIONS,
};
use stackable_superset_crd::{
authentication::SuperSetAuthenticationConfigResolved, Container, SupersetCluster,
SupersetClusterStatus, SupersetConfig, SupersetConfigOptions, SupersetRole, APP_NAME,
CONFIG_DIR, LOG_CONFIG_DIR, LOG_DIR, PYTHONPATH, SUPERSET_CONFIG_FILENAME,
PYTHONPATH, STACKABLE_CONFIG_DIR, STACKABLE_LOG_CONFIG_DIR, STACKABLE_LOG_DIR,
SUPERSET_CONFIG_FILENAME,
};
use strum::{EnumDiscriminants, IntoStaticStr};

use crate::{
config::{self, PYTHON_IMPORTS},
controller_commons::{self, CONFIG_VOLUME_NAME, LOG_CONFIG_VOLUME_NAME, LOG_VOLUME_NAME},
operations::pdb::add_pdbs,
operations::{graceful_shutdown::add_graceful_shutdown_config, pdb::add_pdbs},
product_logging::{
extend_config_map_with_log_config, resolve_vector_aggregator_address, LOG_CONFIG_FILE,
},
Expand Down Expand Up @@ -202,6 +208,11 @@ pub enum Error {
FailedToCreatePdb {
source: crate::operations::pdb::Error,
},

#[snafu(display("failed to configure graceful shutdown"))]
GracefulShutdown {
source: crate::operations::graceful_shutdown::Error,
},
}

type Result<T, E = Error> = std::result::Result<T, E>;
Expand Down Expand Up @@ -656,21 +667,24 @@ fn build_server_rolegroup_statefulset(
superset_cb
.image_from_product_image(resolved_product_image)
.add_container_port("http", APP_PORT.into())
.add_volume_mount(CONFIG_VOLUME_NAME, CONFIG_DIR)
.add_volume_mount(LOG_CONFIG_VOLUME_NAME, LOG_CONFIG_DIR)
.add_volume_mount(LOG_VOLUME_NAME, LOG_DIR)
.add_volume_mount(CONFIG_VOLUME_NAME, STACKABLE_CONFIG_DIR)
.add_volume_mount(LOG_CONFIG_VOLUME_NAME, STACKABLE_LOG_CONFIG_DIR)
.add_volume_mount(LOG_VOLUME_NAME, STACKABLE_LOG_DIR)
.add_env_var_from_secret("ADMIN_USERNAME", secret, "adminUser.username")
.add_env_var_from_secret("ADMIN_FIRSTNAME", secret, "adminUser.firstname")
.add_env_var_from_secret("ADMIN_LASTNAME", secret, "adminUser.lastname")
.add_env_var_from_secret("ADMIN_EMAIL", secret, "adminUser.email")
.add_env_var_from_secret("ADMIN_PASSWORD", secret, "adminUser.password")
.command(vec![
"/bin/sh".to_string(),
"/bin/bash".to_string(),
"-x".to_string(),
"-euo".to_string(),
"pipefail".to_string(),
"-c".to_string(),
formatdoc! {"
mkdir --parents {PYTHONPATH} && \
cp {CONFIG_DIR}/* {PYTHONPATH} && \
cp {LOG_CONFIG_DIR}/{LOG_CONFIG_FILE} {PYTHONPATH} && \
cp {STACKABLE_CONFIG_DIR}/* {PYTHONPATH} && \
cp {STACKABLE_LOG_CONFIG_DIR}/{LOG_CONFIG_FILE} {PYTHONPATH} && \
superset db upgrade && \
superset fab create-admin \
--username \"$ADMIN_USERNAME\" \
Expand All @@ -679,18 +693,26 @@ fn build_server_rolegroup_statefulset(
--email \"$ADMIN_EMAIL\" \
--password \"$ADMIN_PASSWORD\" && \
superset init && \
{COMMON_BASH_TRAP_FUNCTIONS}
{remove_vector_shutdown_file_command}
prepare_signal_handlers
gunicorn \
--bind 0.0.0.0:${{SUPERSET_PORT}} \
--worker-class gthread \
--threads 20 \
--timeout {webserver_timeout} \
--limit-request-line 0 \
--limit-request-field_size 0 \
'superset.app:create_app()'
"},
'superset.app:create_app()' &
wait_for_termination $!
{create_vector_shutdown_file_command}",
remove_vector_shutdown_file_command =
remove_vector_shutdown_file_command(STACKABLE_LOG_DIR),
create_vector_shutdown_file_command =
create_vector_shutdown_file_command(STACKABLE_LOG_DIR),
},
])
.resources(merged_config.resources.clone().into());

let probe = Probe {
http_get: Some(HTTPGetAction {
port: IntOrString::Int(APP_PORT.into()),
Expand All @@ -708,12 +730,24 @@ fn build_server_rolegroup_statefulset(
superset_cb.liveness_probe(probe);

pb.add_container(superset_cb.build());
add_graceful_shutdown_config(merged_config, &mut pb).context(GracefulShutdownSnafu)?;

let metrics_container = ContainerBuilder::new("metrics")
.context(InvalidContainerNameSnafu)?
.image_from_product_image(resolved_product_image)
.command(vec!["/bin/bash".to_string(), "-c".to_string()])
.args(vec!["/stackable/statsd_exporter".to_string()])
.command(vec![
"/bin/bash".to_string(),
"-x".to_string(),
"-euo".to_string(),
"pipefail".to_string(),
"-c".to_string(),
])
.args(vec![formatdoc! {"
{COMMON_BASH_TRAP_FUNCTIONS}
prepare_signal_handlers
/stackable/statsd_exporter &
wait_for_termination $!
"}])
.add_container_port(METRICS_PORT_NAME, METRICS_PORT)
.resources(
ResourceRequirementsBuilder::new()
Expand Down
4 changes: 4 additions & 0 deletions tests/templates/kuttl/smoke/30-assert.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ apiVersion: apps/v1
kind: StatefulSet
metadata:
name: superset-node-default
spec:
template:
spec:
terminationGracePeriodSeconds: 120
status:
readyReplicas: 1
replicas: 1
Expand Down

0 comments on commit 74d6910

Please sign in to comment.