From b2758955030a185a86c21a527ac9da4a569afe66 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 19 Jan 2018 13:07:54 +0100 Subject: [PATCH 01/14] Current config from the metrics-improve-scrape-times branch, see https://github.com/Yolean/kubernetes-kafka/pull/49 --- jmx/10-metrics-config.yml | 44 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 jmx/10-metrics-config.yml diff --git a/jmx/10-metrics-config.yml b/jmx/10-metrics-config.yml new file mode 100644 index 00000000..4416cceb --- /dev/null +++ b/jmx/10-metrics-config.yml @@ -0,0 +1,44 @@ +kind: ConfigMap +metadata: + name: jmx-config + namespace: kafka +apiVersion: v1 +data: + + jmx-kafka-prometheus.yml: |+ + lowercaseOutputName: true + jmxUrl: service:jmx:rmi:///jndi/rmi://127.0.0.1:5555/jmxrmi + ssl: false + whitelistObjectNames: ["kafka.server:*","java.lang:*"] + rules: + - pattern : kafka.server<>Value + - pattern : kafka.server<>OneMinuteRate + - pattern : kafka.server<>OneMinuteRate + - pattern : kafka.server<>queue-size + - pattern : kafka.server<>(Value|OneMinuteRate) + - pattern : kafka.server<>(.*) + - pattern : kafka.server<>(.*) + - pattern : kafka.server<>queue-size + - pattern : kafka.server<>OneMinuteRate + - pattern : java.lang<>SystemCpuLoad + - pattern : java.langused + - pattern : java.lang<>FreePhysicalMemorySize + + jmx-zookeeper-prometheus.yaml: |+ + rules: + - pattern: "org.apache.ZooKeeperService<>(\\w+)" + name: "zookeeper_$2" + - pattern: "org.apache.ZooKeeperService<>(\\w+)" + name: "zookeeper_$3" + labels: + replicaId: "$2" + - pattern: "org.apache.ZooKeeperService<>(\\w+)" + name: "zookeeper_$4" + labels: + replicaId: "$2" + memberType: "$3" + - pattern: "org.apache.ZooKeeperService<>(\\w+)" + name: "zookeeper_$4_$5" + labels: + replicaId: "$2" + memberType: "$3" \ No newline at end of file From d82b419d3ea798c23c3a85ff793d03ef8e483314 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 19 Jan 2018 13:13:06 +0100 Subject: [PATCH 
02/14] The metrics part of #49 --- jmx/50kafka.yml | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 jmx/50kafka.yml diff --git a/jmx/50kafka.yml b/jmx/50kafka.yml new file mode 100644 index 00000000..9dea8bb4 --- /dev/null +++ b/jmx/50kafka.yml @@ -0,0 +1,38 @@ +apiVersion: apps/v1beta2 +kind: StatefulSet +metadata: + name: kafka + namespace: kafka +spec: + template: + metadata: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "5556" + spec: + containers: + - name: metrics + image: solsson/kafka-prometheus-jmx-exporter@sha256:40a6ab24ccac0ed5acb8c02dccfbb1f5924fd97f46c0450e0245686c24138b53 + command: + - java + - -Xmx64M + - -XX:MaxMetaspaceSize=32m + - -jar + - jmx_prometheus_httpserver.jar + - "5556" + - /etc/jmx-kafka/jmx-kafka-prometheus.yml + ports: + - containerPort: 5556 + resources: + requests: + cpu: 0m + memory: 100Mi + limits: + memory: 150Mi + volumeMounts: + - name: jmx-config + mountPath: /etc/jmx-kafka + volumes: + - name: jmx-config + configMap: + name: jmx-config From 162902ca01e5245448fa9c9c8ecf69e138bd66da Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 19 Jan 2018 13:13:36 +0100 Subject: [PATCH 03/14] Upgrades jmx exporter to 0.2.0 --- jmx/50kafka.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jmx/50kafka.yml b/jmx/50kafka.yml index 9dea8bb4..3e19b6fe 100644 --- a/jmx/50kafka.yml +++ b/jmx/50kafka.yml @@ -12,7 +12,7 @@ spec: spec: containers: - name: metrics - image: solsson/kafka-prometheus-jmx-exporter@sha256:40a6ab24ccac0ed5acb8c02dccfbb1f5924fd97f46c0450e0245686c24138b53 + image: solsson/kafka-prometheus-jmx-exporter@sha256:a23062396cd5af1acdf76512632c20ea6be76885dfc20cd9ff40fb23846557e8 command: - java - -Xmx64M From 74a5177270301f3cbe0342657d8bcf70d1ae76ed Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 19 Jan 2018 15:23:33 +0100 Subject: [PATCH 04/14] Adds liveness probe from the metrics-improve-scrape-times branch --- 
jmx/50kafka.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/jmx/50kafka.yml b/jmx/50kafka.yml index 3e19b6fe..0e475c21 100644 --- a/jmx/50kafka.yml +++ b/jmx/50kafka.yml @@ -23,6 +23,11 @@ spec: - /etc/jmx-kafka/jmx-kafka-prometheus.yml ports: - containerPort: 5556 + livenessProbe: + httpGet: + path: /liveness + port: 5556 + periodSeconds: 60 resources: requests: cpu: 0m From da113d29ecdfb9c1d6308743b0c29dd68252b177 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 19 Jan 2018 15:31:08 +0100 Subject: [PATCH 05/14] This is an optional feature, but belongs to the broker pods, so let's evaluate kubectl patch to add it --- jmx/50kafka.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/jmx/50kafka.yml b/jmx/50kafka.yml index 0e475c21..91e1765f 100644 --- a/jmx/50kafka.yml +++ b/jmx/50kafka.yml @@ -1,3 +1,5 @@ +# meant to be applied using +# kubectl --namespace kafka patch statefulset kafka --patch "$(cat jmx/50kafka.yml)" apiVersion: apps/v1beta2 kind: StatefulSet metadata: From eaf9ebd4527a0ce2706c14c92638e8cd00484280 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 19 Jan 2018 15:32:59 +0100 Subject: [PATCH 06/14] Evaluates JVM memory limit awareness Interesting input for #112, for use with broker and zk pods in addition to KAFKA_HEAP_OPTS. 
--- jmx/50kafka.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/jmx/50kafka.yml b/jmx/50kafka.yml index 91e1765f..30a390b1 100644 --- a/jmx/50kafka.yml +++ b/jmx/50kafka.yml @@ -17,8 +17,10 @@ spec: image: solsson/kafka-prometheus-jmx-exporter@sha256:a23062396cd5af1acdf76512632c20ea6be76885dfc20cd9ff40fb23846557e8 command: - java - - -Xmx64M - - -XX:MaxMetaspaceSize=32m + - -XX:+UnlockExperimentalVMOptions + - -XX:+UseCGroupMemoryLimitForHeap + - -XX:MaxRAMFraction=1 + - -XshowSettings:vm - -jar - jmx_prometheus_httpserver.jar - "5556" From 66255af447373f9b14e338a4b819dddad483a257 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 19 Jan 2018 15:33:45 +0100 Subject: [PATCH 07/14] On start the metrics pod guesses that it has ~45% of the memory limit as "Max. Heap Size (Estimated)" Reducing limits as experiment for #112. --- jmx/50kafka.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jmx/50kafka.yml b/jmx/50kafka.yml index 30a390b1..9da1d41b 100644 --- a/jmx/50kafka.yml +++ b/jmx/50kafka.yml @@ -35,9 +35,9 @@ spec: resources: requests: cpu: 0m - memory: 100Mi + memory: 60Mi limits: - memory: 150Mi + memory: 120Mi volumeMounts: - name: jmx-config mountPath: /etc/jmx-kafka From e05b7900335bdcc35fb91d2df1c8257ae21360f5 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 19 Jan 2018 15:43:19 +0100 Subject: [PATCH 08/14] With kubectl patch, this folder can be scoped as Prometheus support --- {jmx => prometheus}/10-metrics-config.yml | 0 jmx/50kafka.yml => prometheus/50-kafka-jmx-exporter-patch.yml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename {jmx => prometheus}/10-metrics-config.yml (100%) rename jmx/50kafka.yml => prometheus/50-kafka-jmx-exporter-patch.yml (96%) diff --git a/jmx/10-metrics-config.yml b/prometheus/10-metrics-config.yml similarity index 100% rename from jmx/10-metrics-config.yml rename to prometheus/10-metrics-config.yml diff --git a/jmx/50kafka.yml
b/prometheus/50-kafka-jmx-exporter-patch.yml similarity index 96% rename from jmx/50kafka.yml rename to prometheus/50-kafka-jmx-exporter-patch.yml index 9da1d41b..c17e7919 100644 --- a/jmx/50kafka.yml +++ b/prometheus/50-kafka-jmx-exporter-patch.yml @@ -1,5 +1,5 @@ # meant to be applied using -# kubectl --namespace kafka patch statefulset kafka --patch "$(cat jmx/50kafka.yml)" +# kubectl --namespace kafka patch statefulset kafka --patch "$(cat prometheus/50-kafka-jmx-exporter-patch.yml )" apiVersion: apps/v1beta2 kind: StatefulSet metadata: From f1e6e96231465775ffa0ded5b745774182fec5ff Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 19 Jan 2018 16:02:39 +0100 Subject: [PATCH 09/14] With 120M you get OOMKilled even on brokers with modest metrics volumes This reverts commit 66255af447373f9b14e338a4b819dddad483a257. --- prometheus/50-kafka-jmx-exporter-patch.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prometheus/50-kafka-jmx-exporter-patch.yml b/prometheus/50-kafka-jmx-exporter-patch.yml index c17e7919..f257dbe9 100644 --- a/prometheus/50-kafka-jmx-exporter-patch.yml +++ b/prometheus/50-kafka-jmx-exporter-patch.yml @@ -35,9 +35,9 @@ spec: resources: requests: cpu: 0m - memory: 60Mi + memory: 100Mi limits: - memory: 120Mi + memory: 150Mi volumeMounts: - name: jmx-config mountPath: /etc/jmx-kafka From 0d78e08f6f929997f741ff79b6ec7a63c4c7cda9 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 19 Jan 2018 16:11:47 +0100 Subject: [PATCH 10/14] Removes liveness probes, to focus on memory limits, and rely on metric staleness alerts instead for exporter liveness. This reverts commit 74a5177270301f3cbe0342657d8bcf70d1ae76ed. 
--- prometheus/50-kafka-jmx-exporter-patch.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/prometheus/50-kafka-jmx-exporter-patch.yml b/prometheus/50-kafka-jmx-exporter-patch.yml index f257dbe9..f5a9cf1f 100644 --- a/prometheus/50-kafka-jmx-exporter-patch.yml +++ b/prometheus/50-kafka-jmx-exporter-patch.yml @@ -27,11 +27,6 @@ spec: - /etc/jmx-kafka/jmx-kafka-prometheus.yml ports: - containerPort: 5556 - livenessProbe: - httpGet: - path: /liveness - port: 5556 - periodSeconds: 60 resources: requests: cpu: 0m From cfe434c55bc44d53ff6a569fa480ef4d92bd1144 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Sat, 20 Jan 2018 07:55:09 +0100 Subject: [PATCH 11/14] It was the liveness probe that killed the metrics container, and we might not need liveness if we have alerts for stale metrics. This reverts commit f1e6e96231465775ffa0ded5b745774182fec5ff. --- prometheus/50-kafka-jmx-exporter-patch.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prometheus/50-kafka-jmx-exporter-patch.yml b/prometheus/50-kafka-jmx-exporter-patch.yml index f5a9cf1f..7876ef47 100644 --- a/prometheus/50-kafka-jmx-exporter-patch.yml +++ b/prometheus/50-kafka-jmx-exporter-patch.yml @@ -30,9 +30,9 @@ spec: resources: requests: cpu: 0m - memory: 100Mi + memory: 60Mi limits: - memory: 150Mi + memory: 120Mi volumeMounts: - name: jmx-config mountPath: /etc/jmx-kafka From 152bb19a7cdd448941e07c3a7c23e5a16f9c6b56 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 2 Feb 2018 12:33:27 +0100 Subject: [PATCH 12/14] =?UTF-8?q?=E2=80=9DIn=20a=20production=20Kafka=20cl?= =?UTF-8?q?uster,=20an=20offline=20partition?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit may be impacting the producer clients, losing messages or causing back-pressure in the application. This is most often a “site down” type of problem and will need to be addressed immediately.” Excerpt from: Neha Narkhede, Gwen Shapira, and Todd Palino. 
”Kafka: The Definitive Guide”. We now export kafka_controller_kafkacontroller_value{name="OfflinePartitionsCount",} and friends. See #140 for why. --- prometheus/10-metrics-config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/prometheus/10-metrics-config.yml b/prometheus/10-metrics-config.yml index 4416cceb..33bed507 100644 --- a/prometheus/10-metrics-config.yml +++ b/prometheus/10-metrics-config.yml @@ -9,8 +9,9 @@ data: lowercaseOutputName: true jmxUrl: service:jmx:rmi:///jndi/rmi://127.0.0.1:5555/jmxrmi ssl: false - whitelistObjectNames: ["kafka.server:*","java.lang:*"] + whitelistObjectNames: ["kafka.controller:*","kafka.server:*","java.lang:*"] rules: + - pattern : kafka.controller<>(.*) - pattern : kafka.server<>Value - pattern : kafka.server<>OneMinuteRate - pattern : kafka.server<>OneMinuteRate From f02213898558c4363e8e294f4a38e7849446ec27 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Sat, 3 Feb 2018 14:13:44 +0100 Subject: [PATCH 13/14] At first I suspected that order matters, but this also works --- prometheus/10-metrics-config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prometheus/10-metrics-config.yml b/prometheus/10-metrics-config.yml index 33bed507..7d77b163 100644 --- a/prometheus/10-metrics-config.yml +++ b/prometheus/10-metrics-config.yml @@ -9,9 +9,8 @@ data: lowercaseOutputName: true jmxUrl: service:jmx:rmi:///jndi/rmi://127.0.0.1:5555/jmxrmi ssl: false - whitelistObjectNames: ["kafka.controller:*","kafka.server:*","java.lang:*"] + whitelistObjectNames: ["kafka.server:*","kafka.controller:*","java.lang:*"] rules: - - pattern : kafka.controller<>(.*) - pattern : kafka.server<>Value - pattern : kafka.server<>OneMinuteRate - pattern : kafka.server<>OneMinuteRate @@ -21,6 +20,7 @@ data: - pattern : kafka.server<>(.*) - pattern : kafka.server<>queue-size - pattern : kafka.server<>OneMinuteRate + - pattern : kafka.controller<>(.*) - pattern : java.lang<>SystemCpuLoad - pattern : 
java.langused - pattern : java.lang<>FreePhysicalMemorySize From a564ed271d95058f23d3230042865b918b03214d Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Sat, 3 Feb 2018 14:25:07 +0100 Subject: [PATCH 14/14] There's only values in this metric type --- prometheus/10-metrics-config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prometheus/10-metrics-config.yml b/prometheus/10-metrics-config.yml index 7d77b163..345e1929 100644 --- a/prometheus/10-metrics-config.yml +++ b/prometheus/10-metrics-config.yml @@ -20,7 +20,7 @@ data: - pattern : kafka.server<>(.*) - pattern : kafka.server<>queue-size - pattern : kafka.server<>OneMinuteRate - - pattern : kafka.controller<>(.*) + - pattern : kafka.controller<>Value - pattern : java.lang<>SystemCpuLoad - pattern : java.langused - pattern : java.lang<>FreePhysicalMemorySize