Skip to content

Commit

Permalink
Change the liveness probes to use the web UI port and to fail after o…
Browse files Browse the repository at this point in the history
…ne minute (#491)

* Use the web UI port for liveness probes

* Use defined web UI pages for liveness probes
  • Loading branch information
siegfriedweber authored Mar 6, 2024
1 parent 8f0a7c4 commit c261516
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 18 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ All notable changes to this project will be documented in this file.
### Changed

- Use new label builders ([#454]).
- Change the liveness probes to use the web UI port and to fail after
one minute ([#491]).

### Removed

Expand All @@ -35,6 +37,7 @@ All notable changes to this project will be documented in this file.
[#462]: https://github.com/stackabletech/hdfs-operator/pull/462
[#474]: https://github.com/stackabletech/hdfs-operator/pull/474
[#475]: https://github.com/stackabletech/hdfs-operator/pull/475
[#491]: https://github.com/stackabletech/hdfs-operator/pull/491

## [23.11.0] - 2023-11-24

Expand Down
7 changes: 7 additions & 0 deletions rust/crd/src/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,13 @@ pub const DEFAULT_NAME_NODE_GRACEFUL_SHUTDOWN_TIMEOUT: Duration =
pub const DEFAULT_DATA_NODE_GRACEFUL_SHUTDOWN_TIMEOUT: Duration =
Duration::from_minutes_unchecked(30);

// Probe timing settings for the HDFS containers. The operator passes these
// values as the `initial_delay_seconds`, `period_seconds` and
// `failure_threshold` fields of the Kubernetes `Probe` objects it builds.

/// Value for `initial_delay_seconds` of the readiness probe.
pub const READINESS_PROBE_INITIAL_DELAY_SECONDS: i32 = 10;
/// Value for `period_seconds` of the readiness probe.
pub const READINESS_PROBE_PERIOD_SECONDS: i32 = 10;
/// Value for `failure_threshold` of the readiness probe.
pub const READINESS_PROBE_FAILURE_THRESHOLD: i32 = 3;
/// Value for `initial_delay_seconds` of the liveness probe.
pub const LIVENESS_PROBE_INITIAL_DELAY_SECONDS: i32 = 10;
/// Value for `period_seconds` of the liveness probe.
pub const LIVENESS_PROBE_PERIOD_SECONDS: i32 = 10;
/// Value for `failure_threshold` of the liveness probe.
/// NOTE(review): per the changelog entry for #491 the liveness probe is meant
/// to fail after about one minute; presumably that is the 10 s initial delay
/// plus 5 periods of 10 s — confirm against the Kubernetes probe semantics.
pub const LIVENESS_PROBE_FAILURE_THRESHOLD: i32 = 5;

// hdfs-site.xml
pub const DFS_NAMENODE_NAME_DIR: &str = "dfs.namenode.name.dir";
pub const DFS_NAMENODE_SHARED_EDITS_DIR: &str = "dfs.namenode.shared.edits.dir";
Expand Down
114 changes: 96 additions & 18 deletions rust/operator-binary/src/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@ use stackable_hdfs_crd::{
constants::{
DATANODE_ROOT_DATA_DIR_PREFIX, DEFAULT_DATA_NODE_METRICS_PORT,
DEFAULT_JOURNAL_NODE_METRICS_PORT, DEFAULT_NAME_NODE_METRICS_PORT,
JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_DIR, LISTENER_VOLUME_NAME, LOG4J_PROPERTIES,
NAMENODE_ROOT_DATA_DIR, SERVICE_PORT_NAME_IPC, SERVICE_PORT_NAME_RPC,
STACKABLE_ROOT_DATA_DIR,
JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_DIR, LISTENER_VOLUME_NAME,
LIVENESS_PROBE_FAILURE_THRESHOLD, LIVENESS_PROBE_INITIAL_DELAY_SECONDS,
LIVENESS_PROBE_PERIOD_SECONDS, LOG4J_PROPERTIES, NAMENODE_ROOT_DATA_DIR,
READINESS_PROBE_FAILURE_THRESHOLD, READINESS_PROBE_INITIAL_DELAY_SECONDS,
READINESS_PROBE_PERIOD_SECONDS, SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS,
SERVICE_PORT_NAME_IPC, SERVICE_PORT_NAME_RPC, STACKABLE_ROOT_DATA_DIR,
},
storage::DataNodeStorageConfig,
AnyNodeConfig, DataNodeContainer, HdfsCluster, HdfsPodRef, HdfsRole, NameNodeContainer,
Expand All @@ -35,8 +38,9 @@ use stackable_operator::{
k8s_openapi::{
api::core::v1::{
ConfigMapKeySelector, ConfigMapVolumeSource, Container, ContainerPort,
EmptyDirVolumeSource, EnvVar, EnvVarSource, ObjectFieldSelector, PersistentVolumeClaim,
Probe, ResourceRequirements, TCPSocketAction, Volume, VolumeMount,
EmptyDirVolumeSource, EnvVar, EnvVarSource, HTTPGetAction, ObjectFieldSelector,
PersistentVolumeClaim, Probe, ResourceRequirements, TCPSocketAction, Volume,
VolumeMount,
},
apimachinery::pkg::util::intstr::IntOrString,
},
Expand Down Expand Up @@ -114,8 +118,21 @@ pub enum ContainerConfig {
container_name: String,
/// Volume mounts for config and logging.
volume_mounts: ContainerVolumeDirs,
/// Readiness and liveness probe service port name.
tcp_socket_action_port_name: &'static str,
/// Port name of the IPC/RPC port, used for the readiness probe.
ipc_port_name: &'static str,
/// Port name of the web UI HTTP port, used for the liveness probe.
web_ui_http_port_name: &'static str,
/// Port name of the web UI HTTPS port, used for the liveness probe.
web_ui_https_port_name: &'static str,
/// Path of the web UI URL. In Kubernetes the probe path defaults to `/`
/// and the kubelet follows redirects. The default would work if the
/// `Location` header were set properly, but that is not the case for
/// the DataNode: on a TLS-enabled DataNode, calling
/// https://127.0.0.1:9865/ redirects to the non-TLS URL
/// http://127.0.0.1:9865/index.html, which causes the liveness
/// probe to fail. It is therefore best not to rely on the `Location`
/// header and to provide the resolved path directly instead.
web_ui_path: &'static str,
/// The JMX Exporter metrics port.
metrics_port: u16,
},
Expand Down Expand Up @@ -390,11 +407,23 @@ impl ContainerConfig {
cb.resources(resources);
}

if let Some(probe) = self.tcp_socket_action_probe(10, 10) {
cb.readiness_probe(probe.clone());
if let Some(probe) = self.web_ui_port_probe(
hdfs,
LIVENESS_PROBE_PERIOD_SECONDS,
LIVENESS_PROBE_INITIAL_DELAY_SECONDS,
LIVENESS_PROBE_FAILURE_THRESHOLD,
) {
cb.liveness_probe(probe);
}

if let Some(probe) = self.ipc_port_probe(
READINESS_PROBE_PERIOD_SECONDS,
READINESS_PROBE_INITIAL_DELAY_SECONDS,
READINESS_PROBE_FAILURE_THRESHOLD,
) {
cb.readiness_probe(probe.clone());
}

Ok(cb.build())
}

Expand Down Expand Up @@ -788,24 +817,64 @@ wait_for_termination $!
}
}

/// Creates a probe for [`stackable_operator::k8s_openapi::api::core::v1::TCPSocketAction`]
/// for liveness or readiness probes
fn tcp_socket_action_probe(
/// Creates an HTTP GET probe for the web UI port.
///
/// For [`ContainerConfig::Hdfs`], the probe targets the web UI HTTPS port
/// name with scheme `HTTPS` if `hdfs.has_https_enabled()` returns true, and
/// the web UI HTTP port name with scheme `HTTP` otherwise. In both cases the
/// request path is the variant's `web_ui_path`. The given `period_seconds`,
/// `initial_delay_seconds` and `failure_threshold` are set on the returned
/// probe; all remaining probe fields keep their defaults.
///
/// Returns `None` for every other container configuration.
fn web_ui_port_probe(
&self,
hdfs: &HdfsCluster,
period_seconds: i32,
initial_delay_seconds: i32,
failure_threshold: i32,
) -> Option<Probe> {
match self {
ContainerConfig::Hdfs {
tcp_socket_action_port_name,
web_ui_http_port_name,
web_ui_https_port_name,
web_ui_path,
..
} => Some(Probe {
} => {
// The port is referenced by name, so the probe picks whichever
// web UI port (HTTPS or HTTP) matches the cluster's TLS setting.
let http_get_action = if hdfs.has_https_enabled() {
HTTPGetAction {
port: IntOrString::String(web_ui_https_port_name.to_string()),
scheme: Some("HTTPS".into()),
// Explicit page path instead of the default `/`.
path: Some(web_ui_path.to_string()),
..HTTPGetAction::default()
}
} else {
HTTPGetAction {
port: IntOrString::String(web_ui_http_port_name.to_string()),
scheme: Some("HTTP".into()),
// Explicit page path instead of the default `/`.
path: Some(web_ui_path.to_string()),
..HTTPGetAction::default()
}
};
Some(Probe {
http_get: Some(http_get_action),
period_seconds: Some(period_seconds),
initial_delay_seconds: Some(initial_delay_seconds),
failure_threshold: Some(failure_threshold),
..Probe::default()
})
}
// Only the HDFS main container has a web UI to probe.
_ => None,
}
}

/// Creates a probe for the IPC/RPC port
fn ipc_port_probe(
&self,
period_seconds: i32,
initial_delay_seconds: i32,
failure_threshold: i32,
) -> Option<Probe> {
match self {
ContainerConfig::Hdfs { ipc_port_name, .. } => Some(Probe {
tcp_socket: Some(TCPSocketAction {
port: IntOrString::String(String::from(*tcp_socket_action_port_name)),
port: IntOrString::String(ipc_port_name.to_string()),
..TCPSocketAction::default()
}),
period_seconds: Some(period_seconds),
initial_delay_seconds: Some(initial_delay_seconds),
failure_threshold: Some(failure_threshold),
..Probe::default()
}),
_ => None,
Expand Down Expand Up @@ -1177,21 +1246,30 @@ impl From<HdfsRole> for ContainerConfig {
role: role.clone(),
container_name: role.to_string(),
volume_mounts: ContainerVolumeDirs::from(role),
tcp_socket_action_port_name: SERVICE_PORT_NAME_RPC,
ipc_port_name: SERVICE_PORT_NAME_RPC,
web_ui_http_port_name: SERVICE_PORT_NAME_HTTP,
web_ui_https_port_name: SERVICE_PORT_NAME_HTTPS,
web_ui_path: "/dfshealth.html",
metrics_port: DEFAULT_NAME_NODE_METRICS_PORT,
},
HdfsRole::DataNode => Self::Hdfs {
role: role.clone(),
container_name: role.to_string(),
volume_mounts: ContainerVolumeDirs::from(role),
tcp_socket_action_port_name: SERVICE_PORT_NAME_IPC,
ipc_port_name: SERVICE_PORT_NAME_IPC,
web_ui_http_port_name: SERVICE_PORT_NAME_HTTP,
web_ui_https_port_name: SERVICE_PORT_NAME_HTTPS,
web_ui_path: "/datanode.html",
metrics_port: DEFAULT_DATA_NODE_METRICS_PORT,
},
HdfsRole::JournalNode => Self::Hdfs {
role: role.clone(),
container_name: role.to_string(),
volume_mounts: ContainerVolumeDirs::from(role),
tcp_socket_action_port_name: SERVICE_PORT_NAME_RPC,
ipc_port_name: SERVICE_PORT_NAME_RPC,
web_ui_http_port_name: SERVICE_PORT_NAME_HTTP,
web_ui_https_port_name: SERVICE_PORT_NAME_HTTPS,
web_ui_path: "/journalnode.html",
metrics_port: DEFAULT_JOURNAL_NODE_METRICS_PORT,
},
}
Expand Down

0 comments on commit c261516

Please sign in to comment.