From a49b4294c54ee163305b0bf48ddcab208ec83aca Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Fri, 20 Dec 2024 10:45:06 +0100 Subject: [PATCH] Test metrics in the smoke test (#635) * test: Fix metrics in smoke test * test: Update metrics configuration in smoke test * test: Check metrics in smoke test * test: Remove the custom JMX exporter config * test: Fix smoke test for Hadoop 3.4.0 * chore: Format Python code in the smoke test --- tests/templates/kuttl/smoke/40-assert.yaml | 2 +- ...bhdfs.yaml => 40-install-test-runner.yaml} | 10 +- tests/templates/kuttl/smoke/50-assert.yaml | 2 +- .../templates/kuttl/smoke/50-create-file.yaml | 6 +- tests/templates/kuttl/smoke/51-assert.yaml.j2 | 12 ++ .../smoke/51-copy-metrics-test-script.yaml | 5 + tests/templates/kuttl/smoke/test_metrics.py | 124 ++++++++++++++++++ 7 files changed, 151 insertions(+), 10 deletions(-) rename tests/templates/kuttl/smoke/{40-webhdfs.yaml => 40-install-test-runner.yaml} (82%) create mode 100644 tests/templates/kuttl/smoke/51-assert.yaml.j2 create mode 100644 tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml create mode 100755 tests/templates/kuttl/smoke/test_metrics.py diff --git a/tests/templates/kuttl/smoke/40-assert.yaml b/tests/templates/kuttl/smoke/40-assert.yaml index 6237bcac..64d967e4 100644 --- a/tests/templates/kuttl/smoke/40-assert.yaml +++ b/tests/templates/kuttl/smoke/40-assert.yaml @@ -6,7 +6,7 @@ timeout: 300 apiVersion: apps/v1 kind: StatefulSet metadata: - name: webhdfs + name: test-runner status: readyReplicas: 1 replicas: 1 diff --git a/tests/templates/kuttl/smoke/40-webhdfs.yaml b/tests/templates/kuttl/smoke/40-install-test-runner.yaml similarity index 82% rename from tests/templates/kuttl/smoke/40-webhdfs.yaml rename to tests/templates/kuttl/smoke/40-install-test-runner.yaml index e929d756..cc1f296b 100644 --- a/tests/templates/kuttl/smoke/40-webhdfs.yaml +++ b/tests/templates/kuttl/smoke/40-install-test-runner.yaml @@ -2,22 +2,22 @@ apiVersion: apps/v1 kind: 
StatefulSet metadata: - name: webhdfs + name: test-runner labels: - app: webhdfs + app: test-runner spec: replicas: 1 selector: matchLabels: - app: webhdfs + app: test-runner template: metadata: labels: - app: webhdfs + app: test-runner spec: shareProcessNamespace: true containers: - - name: webhdfs + - name: test-runner image: docker.stackable.tech/stackable/testing-tools:0.2.0-stackable0.0.0-dev args: [sleep, infinity] stdin: true diff --git a/tests/templates/kuttl/smoke/50-assert.yaml b/tests/templates/kuttl/smoke/50-assert.yaml index 1c4860b9..166e0e21 100644 --- a/tests/templates/kuttl/smoke/50-assert.yaml +++ b/tests/templates/kuttl/smoke/50-assert.yaml @@ -2,4 +2,4 @@ apiVersion: kuttl.dev/v1beta1 kind: TestAssert commands: - - script: kubectl exec -n $NAMESPACE webhdfs-0 -- python /tmp/webhdfs.py $NAMESPACE ls + - script: kubectl exec -n $NAMESPACE test-runner-0 -- python /tmp/webhdfs.py $NAMESPACE ls diff --git a/tests/templates/kuttl/smoke/50-create-file.yaml b/tests/templates/kuttl/smoke/50-create-file.yaml index d72fb348..80a710a7 100644 --- a/tests/templates/kuttl/smoke/50-create-file.yaml +++ b/tests/templates/kuttl/smoke/50-create-file.yaml @@ -2,6 +2,6 @@ apiVersion: kuttl.dev/v1beta1 kind: TestStep commands: - - script: kubectl cp -n $NAMESPACE ./webhdfs.py webhdfs-0:/tmp - - script: kubectl cp -n $NAMESPACE ./testdata.txt webhdfs-0:/tmp - - script: kubectl exec -n $NAMESPACE webhdfs-0 -- python /tmp/webhdfs.py $NAMESPACE create + - script: kubectl cp -n $NAMESPACE ./webhdfs.py test-runner-0:/tmp + - script: kubectl cp -n $NAMESPACE ./testdata.txt test-runner-0:/tmp + - script: kubectl exec -n $NAMESPACE test-runner-0 -- python /tmp/webhdfs.py $NAMESPACE create diff --git a/tests/templates/kuttl/smoke/51-assert.yaml.j2 b/tests/templates/kuttl/smoke/51-assert.yaml.j2 new file mode 100644 index 00000000..4a20065d --- /dev/null +++ b/tests/templates/kuttl/smoke/51-assert.yaml.j2 @@ -0,0 +1,12 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert 
+commands: + - script: | +{% if test_scenario['values']['hadoop'].find(",") > 0 %} + PRODUCT_VERSION={{ test_scenario['values']['hadoop'].split(',')[0] }} +{% else %} + PRODUCT_VERSION={{ test_scenario['values']['hadoop'] }} +{% endif %} + kubectl exec --namespace=$NAMESPACE test-runner-0 -- \ + python /tmp/test_metrics.py $NAMESPACE $PRODUCT_VERSION diff --git a/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml new file mode 100644 index 00000000..fa17cd19 --- /dev/null +++ b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - script: kubectl cp -n $NAMESPACE ./test_metrics.py test-runner-0:/tmp diff --git a/tests/templates/kuttl/smoke/test_metrics.py b/tests/templates/kuttl/smoke/test_metrics.py new file mode 100755 index 00000000..5129e2dd --- /dev/null +++ b/tests/templates/kuttl/smoke/test_metrics.py @@ -0,0 +1,124 @@ +# Every rule in the JMX configuration is covered by one expected metric. 
import logging
import re
import sys

import requests

# Hadoop versions whose JVM metrics still contain the log counters.
# Log counters were removed in 3.4.0 (HADOOP-17524).
_VERSIONS_WITH_LOG_COUNTERS = ("3.3.4", "3.3.6")


def _has_log_counters(product_version: str) -> bool:
    """Return whether *product_version* is expected to expose log counters."""
    return product_version in _VERSIONS_WITH_LOG_COUNTERS


def check_metrics(
    namespace: str, role: str, port: int, expected_metrics: list[str]
) -> None:
    """Fetch the metrics of one HDFS role and verify the expected entries.

    The metrics are requested from
    ``hdfs-<role>-default-0.hdfs-<role>-default.<namespace>.svc.cluster.local``
    on the given port, with a timeout of 10 seconds.

    Args:
        namespace: Kubernetes namespace used to build the host name.
        role: HDFS role name as it appears in the host name, e.g. "namenode".
        port: Port on which the metrics endpoint is queried.
        expected_metrics: Patterns that must each match at the start of a
            line of the response. Every entry is interpreted as a regular
            expression, so it may contain wildcards such as ``.+``.

    Raises:
        AssertionError: If the request is not successful or if one of the
            expected metrics is not found in the response.
    """
    url = (
        f"http://hdfs-{role}-default-0.hdfs-{role}-default."
        f"{namespace}.svc.cluster.local:{port}/metrics"
    )
    response: requests.Response = requests.get(url, timeout=10)

    # Raise explicitly instead of using `assert` statements: assertions are
    # removed when Python runs with -O, which would let the test succeed
    # without checking anything.
    if not response.ok:
        raise AssertionError(
            f"Requesting metrics failed: HTTP {response.status_code} from {url}"
        )

    for metric in expected_metrics:
        # MULTILINE lets "^" anchor the pattern at the start of any line.
        if re.search(f"^{metric}", response.text, re.MULTILINE) is None:
            raise AssertionError(f"Metric '{metric}' not found for {role}")


def check_namenode_metrics(
    namespace: str,
    product_version: str,
) -> None:
    """Verify the expected NameNode metrics on port 8183.

    Args:
        namespace: Kubernetes namespace of the HDFS cluster.
        product_version: Hadoop version; decides whether the log counter
            metric is expected.

    Raises:
        AssertionError: If the metrics cannot be fetched or one is missing.
    """
    expected_metrics: list[str] = [
        # Kind "MetricsSystem"
        'hadoop_namenode_num_active_sources{kind="MetricsSystem",role="NameNode",service="HDFS",sub="Stats"}',
        # Attribute "Total"
        'hadoop_namenode_total{kind="NameNodeInfo",role="NameNode",service="HDFS"}',
        # Counter suffixed with "_total"
        'hadoop_namenode_files_total{kind="FSNamesystem",role="NameNode",service="HDFS"}',
        # Metric suffixed with "_created"
        'hadoop_namenode_files_created_{kind="NameNodeActivity",role="NameNode",service="HDFS"}',
        # Non-special metric
        'hadoop_namenode_files_deleted{kind="NameNodeActivity",role="NameNode",service="HDFS"}',
    ]

    if _has_log_counters(product_version):
        # Metric suffixed with "_info"
        expected_metrics.append(
            'hadoop_namenode_log_info_{kind="JvmMetrics",role="NameNode",service="HDFS"}'
        )

    check_metrics(namespace, "namenode", 8183, expected_metrics)


def check_datanode_metrics(
    namespace: str,
    product_version: str,
) -> None:
    """Verify the expected DataNode metrics on port 8082.

    Args:
        namespace: Kubernetes namespace of the HDFS cluster.
        product_version: Hadoop version; decides whether the log counter
            metric is expected.

    Raises:
        AssertionError: If the metrics cannot be fetched or one is missing.
    """
    expected_metrics: list[str] = [
        # Kind "MetricsSystem"
        'hadoop_datanode_num_active_sources{kind="MetricsSystem",role="DataNode",service="HDFS",sub="Stats"}',
        # Kind "FSDatasetState" suffixed with "_total"
        'hadoop_datanode_estimated_capacity_lost_total{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}',
        # Kind "FSDatasetState"
        'hadoop_datanode_capacity{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}',
        # Kind "DataNodeActivity" suffixed with "_info"
        'hadoop_datanode_blocks_get_local_path_info_{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}',
        # Kind "DataNodeActivity"
        'hadoop_datanode_blocks_read{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}',
        # Counter suffixed with "_total"
        'hadoop_datanode_estimated_capacity_lost_total{kind="FSDatasetState",role="DataNode",service="HDFS"}',
        # Non-special metric
        'hadoop_datanode_gc_count{kind="JvmMetrics",role="DataNode",service="HDFS"}',
    ]

    if _has_log_counters(product_version):
        # Metric suffixed with "_info"
        expected_metrics.append(
            'hadoop_datanode_log_info_{kind="JvmMetrics",role="DataNode",service="HDFS"}'
        )

    check_metrics(namespace, "datanode", 8082, expected_metrics)


def check_journalnode_metrics(
    namespace: str,
    product_version: str,
) -> None:
    """Verify the expected JournalNode metrics on port 8081.

    Args:
        namespace: Kubernetes namespace of the HDFS cluster.
        product_version: Hadoop version; decides whether the log counter
            metric is expected.

    Raises:
        AssertionError: If the metrics cannot be fetched or one is missing.
    """
    expected_metrics: list[str] = [
        # Kind "MetricsSystem"
        'hadoop_journalnode_num_active_sources{kind="MetricsSystem",role="JournalNode",service="HDFS",sub="Stats"}',
        # Non-special metric
        'hadoop_journalnode_bytes_written{kind="Journal-hdfs",role="JournalNode",service="HDFS"}',
    ]

    if _has_log_counters(product_version):
        # Metric suffixed with "_info"
        expected_metrics.append(
            'hadoop_journalnode_log_info_{kind="JvmMetrics",role="JournalNode",service="HDFS"}'
        )

    check_metrics(namespace, "journalnode", 8081, expected_metrics)


def main() -> None:
    """Check the metrics of all HDFS roles.

    Reads the namespace and the product version from the first two
    command-line arguments and exits with a usage message if they are missing.
    """
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} NAMESPACE PRODUCT_VERSION")

    namespace_arg: str = sys.argv[1]
    product_version_arg: str = sys.argv[2]

    logging.basicConfig(
        level="DEBUG",
        format="%(asctime)s %(levelname)s: %(message)s",
        stream=sys.stdout,
    )

    check_namenode_metrics(namespace_arg, product_version_arg)
    check_datanode_metrics(namespace_arg, product_version_arg)
    check_journalnode_metrics(namespace_arg, product_version_arg)

    print("All expected metrics found")


if __name__ == "__main__":
    main()