feat: custom log directory (#479)
* Use a generic ResolvedLogDir instead of the concrete S3LogDir

* Add a test for customLogDirectory

* Document the property customLogDirectory

* Use "log directory" instead of "S3 log directory" when the resolved log directory structure is used

* Update CRD docs

* Update docs/modules/spark-k8s/pages/index.adoc

Co-authored-by: Razvan-Daniel Mihai <[email protected]>

* Update CHANGELOG.md

Co-authored-by: Razvan-Daniel Mihai <[email protected]>

---------

Co-authored-by: Razvan-Daniel Mihai <[email protected]>
siegfriedweber and razvan authored Oct 21, 2024
1 parent 2d68f63 commit 45c1e7a
Showing 27 changed files with 619 additions and 124 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ All notable changes to this project will be documented in this file.
 ### Added

 - Make spark-env.sh configurable via `configOverrides` ([#473]).
+- The Spark history server can now serve logs from HDFS-compatible systems ([#479]).

 ### Changed

@@ -33,6 +34,7 @@ All notable changes to this project will be documented in this file.
 [#460]: https://github.com/stackabletech/spark-k8s-operator/pull/460
 [#472]: https://github.com/stackabletech/spark-k8s-operator/pull/472
 [#473]: https://github.com/stackabletech/spark-k8s-operator/pull/473
+[#479]: https://github.com/stackabletech/spark-k8s-operator/pull/479

 ## [24.7.0] - 2024-07-24
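For context on the first changelog entry above (from #473), a minimal sketch of how `spark-env.sh` entries might be supplied through `configOverrides` follows. This assumes the usual Stackable pattern of keying overrides by file name on the `nodes` role; the variable shown is purely illustrative and not part of this commit:

[source,yaml]
----
apiVersion: spark.stackable.tech/v1alpha1
kind: SparkHistoryServer
spec:
  nodes:
    configOverrides:
      spark-env.sh:
        SPARK_DAEMON_MEMORY: 2g  # hypothetical entry; any spark-env.sh variable can be set
----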
16 changes: 14 additions & 2 deletions deploy/helm/spark-k8s-operator/crds/crds.yaml
@@ -615,13 +615,19 @@ spec:
           x-kubernetes-preserve-unknown-fields: true
           type: object
         logFileDirectory:
-          description: The log file directory definition used by the Spark history server. Currently only S3 buckets are supported.
+          description: The log file directory definition used by the Spark history server.
           nullable: true
           oneOf:
           - required:
             - s3
+          - required:
+            - customLogDirectory
           properties:
+            customLogDirectory:
+              description: A custom log directory
+              type: string
             s3:
+              description: An S3 bucket storing the log events
               properties:
                 bucket:
                   oneOf:
@@ -1065,12 +1071,18 @@ spec:
             type: string
           type: object
         logFileDirectory:
-          description: The log file directory definition used by the Spark history server. Currently only S3 buckets are supported.
+          description: The log file directory definition used by the Spark history server.
           oneOf:
           - required:
             - s3
+          - required:
+            - customLogDirectory
           properties:
+            customLogDirectory:
+              description: A custom log directory
+              type: string
             s3:
+              description: An S3 bucket storing the log events
               properties:
                 bucket:
                   oneOf:
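The `oneOf` constraint above means a manifest must specify exactly one variant of `logFileDirectory`. As a brief sketch (not part of this commit; the bucket reference name is hypothetical), the two accepted shapes are:

[source,yaml]
----
# Existing variant: an S3 bucket
logFileDirectory:
  s3:
    prefix: eventlogs/
    bucket:
      reference: spark-history-s3-bucket  # hypothetical S3Bucket resource name
---
# New variant: a custom log directory, passed through as-is
logFileDirectory:
  customLogDirectory: hdfs://simple-hdfs/eventlogs/
----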
4 changes: 2 additions & 2 deletions docs/modules/spark-k8s/pages/index.adoc
@@ -37,8 +37,8 @@ The SparkApplication resource is the main point of interaction with the operator
 An exhaustive list of options is given in the {crd}[SparkApplication CRD reference {external-link-icon}^].

 The xref:usage-guide/history-server.adoc[SparkHistoryServer] has a single `node` role.
-It is used to deploy a https://spark.apache.org/docs/latest/monitoring.html#viewing-after-the-fact[Spark history server] that displays application logs from S3 buckets.
-Of course, your applications need to write their logs to the same buckets.
+It is used to deploy a https://spark.apache.org/docs/latest/monitoring.html#viewing-after-the-fact[Spark history server] that displays application logs.
+Of course, your applications need to write their logs to the same location.

 === Kubernetes resources
88 changes: 85 additions & 3 deletions docs/modules/spark-k8s/pages/usage-guide/history-server.adoc
@@ -17,9 +17,7 @@ For more details on how the Stackable Data Platform manages S3 resources see the
 include::example$example-history-server.yaml[]
 ----

-<1> The location of the event logs.
-Must be an S3 bucket.
-Future implementations might add support for other shared filesystems such as HDFS.
+<1> The location of the event logs; see <<log-dir-variants>> for other options.
 <2> Directory within the S3 bucket where the log files are located.
 This directory is required and must exist before setting up the history server.
 <3> The S3 bucket definition, here provided in-line.
@@ -56,7 +54,91 @@ include::example$example-history-app.yaml[]
 <5> Bucket to store logs. This must match the bucket used by the history server.
 <6> Credentials used to write event logs. These can, of course, differ from the credentials used to process data.

+[#log-dir-variants]
+== Supported file systems for storing log events
+
+=== S3
+
+As already shown in the example above, the event logs can be stored in an S3 bucket:
+
+[source,yaml]
+----
+---
+apiVersion: spark.stackable.tech/v1alpha1
+kind: SparkHistoryServer
+spec:
+  logFileDirectory:
+    s3:
+      prefix: eventlogs/
+      bucket:
+        ...
+---
+apiVersion: spark.stackable.tech/v1alpha1
+kind: SparkApplication
+spec:
+  logFileDirectory:
+    s3:
+      prefix: eventlogs/
+      bucket:
+        ...
+----
+
+=== Custom log directory
+
+If the operator provides no dedicated structure for the desired file system, the log directory can nevertheless be set with the property `customLogDirectory`.
+Additional configuration overrides may be necessary in this case.
+
+For instance, to store the Spark event logs in HDFS, the following configuration could be used:
+
+[source,yaml]
+----
+---
+apiVersion: spark.stackable.tech/v1alpha1
+kind: SparkHistoryServer
+spec:
+  logFileDirectory:
+    customLogDirectory: hdfs://simple-hdfs/eventlogs/ # <1>
+  nodes:
+    envOverrides:
+      HADOOP_CONF_DIR: /stackable/hdfs-config # <2>
+    podOverrides:
+      spec:
+        containers:
+          - name: spark-history
+            volumeMounts:
+              - name: hdfs-config
+                mountPath: /stackable/hdfs-config
+        volumes:
+          - name: hdfs-config
+            configMap:
+              name: hdfs # <3>
+---
+apiVersion: spark.stackable.tech/v1alpha1
+kind: SparkApplication
+spec:
+  logFileDirectory:
+    customLogDirectory: hdfs://simple-hdfs/eventlogs/ # <4>
+  sparkConf:
+    spark.driver.extraClassPath: /stackable/hdfs-config # <5>
+  driver:
+    config:
+      volumeMounts:
+        - name: hdfs-config
+          mountPath: /stackable/hdfs-config
+  volumes:
+    - name: hdfs-config
+      configMap:
+        name: hdfs
+----
+
+<1> A custom log directory that is used for the Spark option `spark.history.fs.logDirectory`.
+The required dependencies must be on the class path.
+This is the case for HDFS.
+<2> The Spark History Server looks for the Hadoop configuration in the directory defined by the environment variable `HADOOP_CONF_DIR`.
+<3> The ConfigMap containing the Hadoop configuration files `core-site.xml` and `hdfs-site.xml`.
+<4> A custom log directory that is used for the Spark option `spark.eventLog.dir`.
+Additionally, the Spark option `spark.eventLog.enabled` is set to `true`.
+<5> The Spark driver looks for the Hadoop configuration on the class path.
+
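Putting callouts <1> and <4> together: conceptually, `customLogDirectory` surfaces as the following Spark settings (an illustrative rendering; the operator generates these internally):

[source,properties]
----
# History server, derived from <1>
spark.history.fs.logDirectory=hdfs://simple-hdfs/eventlogs/
# Application, derived from <4>
spark.eventLog.enabled=true
spark.eventLog.dir=hdfs://simple-hdfs/eventlogs/
----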
== History Web UI

19 changes: 11 additions & 8 deletions rust/crd/src/history.rs
@@ -1,5 +1,4 @@
-use crate::s3logdir::S3LogDir;
-use crate::tlscerts;
+use crate::logdir::ResolvedLogDir;
 use crate::{affinity::history_affinity, constants::*};

 use product_config::{types::PropertyNameKind, ProductConfigManager};
@@ -78,7 +77,6 @@ pub struct SparkHistoryServerSpec {
     pub vector_aggregator_config_map_name: Option<String>,

     /// The log file directory definition used by the Spark history server.
-    /// Currently only S3 buckets are supported.
     pub log_file_directory: LogFileDirectorySpec,

     /// A map of key/value strings that will be passed directly to Spark when deploying the history server.
@@ -235,7 +233,7 @@ impl SparkHistoryServer {

     pub fn merged_env(
         &self,
-        s3logdir: &S3LogDir,
+        logdir: &ResolvedLogDir,
         role_group_env_overrides: HashMap<String, String>,
     ) -> Vec<EnvVar> {
         // Maps env var name to env var object. This allows env_overrides to work
@@ -271,7 +269,7 @@ impl SparkHistoryServer {
         ];

         // if TLS is enabled build truststore
-        if tlscerts::tls_secret_name(&s3logdir.bucket.connection).is_some() {
+        if logdir.tls_enabled() {
             history_opts.extend(vec![
                 format!("-Djavax.net.ssl.trustStore={STACKABLE_TRUST_STORE}/truststore.p12"),
                 format!("-Djavax.net.ssl.trustStorePassword={STACKABLE_TLS_STORE_PASSWORD}"),
@@ -327,8 +325,11 @@ impl SparkHistoryServer {
 #[derive(Clone, Debug, Deserialize, JsonSchema, Serialize, Display)]
 #[serde(rename_all = "camelCase")]
 pub enum LogFileDirectorySpec {
+    /// An S3 bucket storing the log events
     #[strum(serialize = "s3")]
     S3(S3LogFileDirectorySpec),
+    /// A custom log directory
+    CustomLogDirectory(String),
 }

 #[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)]
Expand Down Expand Up @@ -456,6 +457,8 @@ impl Configuration for HistoryConfigFragment {

#[cfg(test)]
mod test {
use crate::logdir::S3LogDir;

use super::*;
use indoc::indoc;
use stackable_operator::commons::{
@@ -495,7 +498,7 @@ mod test {
         let history: SparkHistoryServer =
             serde_yaml::with::singleton_map_recursive::deserialize(deserializer).unwrap();

-        let s3_log_dir: S3LogDir = S3LogDir {
+        let log_dir = ResolvedLogDir::S3(S3LogDir {
             bucket: ResolvedS3Bucket {
                 bucket_name: "my-bucket".to_string(),
                 connection: ResolvedS3Connection {
@@ -507,10 +510,10 @@ mod test {
                 },
             },
             prefix: "prefix".to_string(),
-        };
+        });

         let merged_env = history.merged_env(
-            &s3_log_dir,
+            &log_dir,
             history
                 .spec
                 .nodes
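The new rust/crd/src/logdir.rs module is not shown in this excerpt. Based only on the usages visible above — the `ResolvedLogDir::S3(S3LogDir)` constructor in the test and the `logdir.tls_enabled()` call in `merged_env` — a minimal sketch of its shape might look like the following. The `CustomLogDirectory` variant and the method body are assumptions mirroring `LogFileDirectorySpec`, not the actual implementation:

[source,rust]
----
use crate::tlscerts;

/// A log directory whose references (e.g. the S3 bucket) have already been resolved.
pub enum ResolvedLogDir {
    /// An S3 bucket storing the log events
    S3(S3LogDir),
    /// A custom log directory such as "hdfs://simple-hdfs/eventlogs/"
    CustomLogDirectory(String),
}

impl ResolvedLogDir {
    /// Whether a TLS truststore must be built for the history server JVM.
    pub fn tls_enabled(&self) -> bool {
        match self {
            // An S3 connection may carry a TLS secret class, see tlscerts
            ResolvedLogDir::S3(s3_log_dir) => {
                tlscerts::tls_secret_name(&s3_log_dir.bucket.connection).is_some()
            }
            // A plain directory string carries no TLS information
            ResolvedLogDir::CustomLogDirectory(_) => false,
        }
    }
}
----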