From c261516c33bfb360d781c8d012e970a2f2de7198 Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Wed, 6 Mar 2024 17:03:12 +0100 Subject: [PATCH] Change the liveness probes to use the web UI port and to fail after one minute (#491) * Use the web UI port for liveness probes * Use defined web UI pages for liveness probes --- CHANGELOG.md | 3 + rust/crd/src/constants.rs | 7 ++ rust/operator-binary/src/container.rs | 114 ++++++++++++++++++++++---- 3 files changed, 106 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 536c1541..fc26cf1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ All notable changes to this project will be documented in this file. ### Changed - Use new label builders ([#454]). +- Change the liveness probes to use the web UI port and to fail after + one minute ([#491]). ### Removed @@ -35,6 +37,7 @@ All notable changes to this project will be documented in this file. [#462]: https://github.com/stackabletech/hdfs-operator/pull/462 [#474]: https://github.com/stackabletech/hdfs-operator/pull/474 [#475]: https://github.com/stackabletech/hdfs-operator/pull/475 +[#491]: https://github.com/stackabletech/hdfs-operator/pull/491 ## [23.11.0] - 2023-11-24 diff --git a/rust/crd/src/constants.rs b/rust/crd/src/constants.rs index d1fda8bd..0a70807c 100644 --- a/rust/crd/src/constants.rs +++ b/rust/crd/src/constants.rs @@ -49,6 +49,13 @@ pub const DEFAULT_NAME_NODE_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = pub const DEFAULT_DATA_NODE_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = Duration::from_minutes_unchecked(30); +pub const READINESS_PROBE_INITIAL_DELAY_SECONDS: i32 = 10; +pub const READINESS_PROBE_PERIOD_SECONDS: i32 = 10; +pub const READINESS_PROBE_FAILURE_THRESHOLD: i32 = 3; +pub const LIVENESS_PROBE_INITIAL_DELAY_SECONDS: i32 = 10; +pub const LIVENESS_PROBE_PERIOD_SECONDS: i32 = 10; +pub const LIVENESS_PROBE_FAILURE_THRESHOLD: i32 = 5; + // hdfs-site.xml pub const DFS_NAMENODE_NAME_DIR: &str = "dfs.namenode.name.dir"; pub 
const DFS_NAMENODE_SHARED_EDITS_DIR: &str = "dfs.namenode.shared.edits.dir"; diff --git a/rust/operator-binary/src/container.rs b/rust/operator-binary/src/container.rs index 5fe61492..896a2fff 100644 --- a/rust/operator-binary/src/container.rs +++ b/rust/operator-binary/src/container.rs @@ -17,9 +17,12 @@ use stackable_hdfs_crd::{ constants::{ DATANODE_ROOT_DATA_DIR_PREFIX, DEFAULT_DATA_NODE_METRICS_PORT, DEFAULT_JOURNAL_NODE_METRICS_PORT, DEFAULT_NAME_NODE_METRICS_PORT, - JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_DIR, LISTENER_VOLUME_NAME, LOG4J_PROPERTIES, - NAMENODE_ROOT_DATA_DIR, SERVICE_PORT_NAME_IPC, SERVICE_PORT_NAME_RPC, - STACKABLE_ROOT_DATA_DIR, + JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_DIR, LISTENER_VOLUME_NAME, + LIVENESS_PROBE_FAILURE_THRESHOLD, LIVENESS_PROBE_INITIAL_DELAY_SECONDS, + LIVENESS_PROBE_PERIOD_SECONDS, LOG4J_PROPERTIES, NAMENODE_ROOT_DATA_DIR, + READINESS_PROBE_FAILURE_THRESHOLD, READINESS_PROBE_INITIAL_DELAY_SECONDS, + READINESS_PROBE_PERIOD_SECONDS, SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, + SERVICE_PORT_NAME_IPC, SERVICE_PORT_NAME_RPC, STACKABLE_ROOT_DATA_DIR, }, storage::DataNodeStorageConfig, AnyNodeConfig, DataNodeContainer, HdfsCluster, HdfsPodRef, HdfsRole, NameNodeContainer, @@ -35,8 +38,9 @@ use stackable_operator::{ k8s_openapi::{ api::core::v1::{ ConfigMapKeySelector, ConfigMapVolumeSource, Container, ContainerPort, - EmptyDirVolumeSource, EnvVar, EnvVarSource, ObjectFieldSelector, PersistentVolumeClaim, - Probe, ResourceRequirements, TCPSocketAction, Volume, VolumeMount, + EmptyDirVolumeSource, EnvVar, EnvVarSource, HTTPGetAction, ObjectFieldSelector, + PersistentVolumeClaim, Probe, ResourceRequirements, TCPSocketAction, Volume, + VolumeMount, }, apimachinery::pkg::util::intstr::IntOrString, }, @@ -114,8 +118,21 @@ pub enum ContainerConfig { container_name: String, /// Volume mounts for config and logging. volume_mounts: ContainerVolumeDirs, - /// Readiness and liveness probe service port name. 
- tcp_socket_action_port_name: &'static str, + /// Port name of the IPC/RPC port, used for the readiness probe. + ipc_port_name: &'static str, + /// Port name of the web UI HTTP port, used for the liveness probe. + web_ui_http_port_name: &'static str, + /// Port name of the web UI HTTPS port, used for the liveness probe. + web_ui_https_port_name: &'static str, + /// Path of the web UI URL; The path defaults to / in Kubernetes + /// and the kubelet follows redirects. The default would work if + /// the location header is set properly but that is not the case + /// for the DataNode. On a TLS-enabled DataNode, calling + /// https://127.0.0.1:9865/ redirects to the non-TLS URL + /// http://127.0.0.1:9865/index.html which causes the liveness + /// probe to fail. So it is best to not rely on the location + /// header but instead provide the resolved path directly. + web_ui_path: &'static str, /// The JMX Exporter metrics port. metrics_port: u16, }, @@ -390,11 +407,23 @@ impl ContainerConfig { cb.resources(resources); } - if let Some(probe) = self.tcp_socket_action_probe(10, 10) { - cb.readiness_probe(probe.clone()); + if let Some(probe) = self.web_ui_port_probe( + hdfs, + LIVENESS_PROBE_PERIOD_SECONDS, + LIVENESS_PROBE_INITIAL_DELAY_SECONDS, + LIVENESS_PROBE_FAILURE_THRESHOLD, + ) { cb.liveness_probe(probe); } + if let Some(probe) = self.ipc_port_probe( + READINESS_PROBE_PERIOD_SECONDS, + READINESS_PROBE_INITIAL_DELAY_SECONDS, + READINESS_PROBE_FAILURE_THRESHOLD, + ) { + cb.readiness_probe(probe.clone()); + } + Ok(cb.build()) } @@ -788,24 +817,64 @@ wait_for_termination $! 
} } - /// Creates a probe for [`stackable_operator::k8s_openapi::api::core::v1::TCPSocketAction`] - /// for liveness or readiness probes - fn tcp_socket_action_probe( + /// Creates a probe for the web UI port + fn web_ui_port_probe( &self, + hdfs: &HdfsCluster, period_seconds: i32, initial_delay_seconds: i32, + failure_threshold: i32, ) -> Option<Probe> { match self { ContainerConfig::Hdfs { - tcp_socket_action_port_name, + web_ui_http_port_name, + web_ui_https_port_name, + web_ui_path, .. - } => Some(Probe { + } => { + let http_get_action = if hdfs.has_https_enabled() { + HTTPGetAction { + port: IntOrString::String(web_ui_https_port_name.to_string()), + scheme: Some("HTTPS".into()), + path: Some(web_ui_path.to_string()), + ..HTTPGetAction::default() + } + } else { + HTTPGetAction { + port: IntOrString::String(web_ui_http_port_name.to_string()), + scheme: Some("HTTP".into()), + path: Some(web_ui_path.to_string()), + ..HTTPGetAction::default() + } + }; + Some(Probe { + http_get: Some(http_get_action), + period_seconds: Some(period_seconds), + initial_delay_seconds: Some(initial_delay_seconds), + failure_threshold: Some(failure_threshold), + ..Probe::default() + }) + } + _ => None, + } + } + + /// Creates a probe for the IPC/RPC port + fn ipc_port_probe( + &self, + period_seconds: i32, + initial_delay_seconds: i32, + failure_threshold: i32, + ) -> Option<Probe> { + match self { + ContainerConfig::Hdfs { ipc_port_name, .. 
} => Some(Probe { tcp_socket: Some(TCPSocketAction { - port: IntOrString::String(String::from(*tcp_socket_action_port_name)), + port: IntOrString::String(ipc_port_name.to_string()), ..TCPSocketAction::default() }), period_seconds: Some(period_seconds), initial_delay_seconds: Some(initial_delay_seconds), + failure_threshold: Some(failure_threshold), ..Probe::default() }), _ => None, @@ -1177,21 +1246,30 @@ impl From<HdfsRole> for ContainerConfig { role: role.clone(), container_name: role.to_string(), volume_mounts: ContainerVolumeDirs::from(role), - tcp_socket_action_port_name: SERVICE_PORT_NAME_RPC, + ipc_port_name: SERVICE_PORT_NAME_RPC, + web_ui_http_port_name: SERVICE_PORT_NAME_HTTP, + web_ui_https_port_name: SERVICE_PORT_NAME_HTTPS, + web_ui_path: "/dfshealth.html", metrics_port: DEFAULT_NAME_NODE_METRICS_PORT, }, HdfsRole::DataNode => Self::Hdfs { role: role.clone(), container_name: role.to_string(), volume_mounts: ContainerVolumeDirs::from(role), - tcp_socket_action_port_name: SERVICE_PORT_NAME_IPC, + ipc_port_name: SERVICE_PORT_NAME_IPC, + web_ui_http_port_name: SERVICE_PORT_NAME_HTTP, + web_ui_https_port_name: SERVICE_PORT_NAME_HTTPS, + web_ui_path: "/datanode.html", metrics_port: DEFAULT_DATA_NODE_METRICS_PORT, }, HdfsRole::JournalNode => Self::Hdfs { role: role.clone(), container_name: role.to_string(), volume_mounts: ContainerVolumeDirs::from(role), - tcp_socket_action_port_name: SERVICE_PORT_NAME_RPC, + ipc_port_name: SERVICE_PORT_NAME_RPC, + web_ui_http_port_name: SERVICE_PORT_NAME_HTTP, + web_ui_https_port_name: SERVICE_PORT_NAME_HTTPS, + web_ui_path: "/journalnode.html", metrics_port: DEFAULT_JOURNAL_NODE_METRICS_PORT, }, }