From ec434f7aa43ec6730a44d71e7f2c6b338464ac0f Mon Sep 17 00:00:00 2001 From: Evans Mungai Date: Wed, 19 Apr 2023 16:34:13 +0300 Subject: [PATCH] Default host collector --- host/cri.yaml | 70 ----- host/{cluster-down.yaml => default.yaml} | 352 ++++++++++++++++++++++- host/kubeadm-bootstrap.yaml | 30 -- host/networking-issues.yaml | 167 ----------- host/resource-contention.yaml | 103 ------- 5 files changed, 350 insertions(+), 372 deletions(-) delete mode 100644 host/cri.yaml rename host/{cluster-down.yaml => default.yaml} (63%) delete mode 100644 host/kubeadm-bootstrap.yaml delete mode 100644 host/networking-issues.yaml delete mode 100644 host/resource-contention.yaml diff --git a/host/cri.yaml b/host/cri.yaml deleted file mode 100644 index e4fda11..0000000 --- a/host/cri.yaml +++ /dev/null @@ -1,70 +0,0 @@ -apiVersion: troubleshoot.sh/v1beta2 -kind: SupportBundle -metadata: - name: networking-issues-non-airgap -spec: - hostCollectors: - - diskUsage: - collectorName: root - path: / - - diskUsage: - collectorName: tmp - path: /tmp - - diskUsage: - collectorName: var-lib-kubelet - path: /var/lib/kubelet - - diskUsage: - collectorName: var-lib-docker - path: /var/lib/docker - - diskUsage: - collectorName: var-lib-containerd - path: /var/lib/containerd - - run: - collectorName: "docker-info" - command: "docker" - args: ["info"] - - run: - collectorName: "crictl-info" - command: "crictl" - args: ["info"] - - run: - collectorName: "crictl-ps" - command: "crictl" - args: ["ps", "-a"] - - run: - collectorName: "docker-ps" - command: "docker" - args: ["ps", "-a"] - - run: - collectorName: "docker-system-df" - command: "docker" - args: ["system", "df", "-v"] - - run: - collectorName: "systemctl-docker-status" - command: "systemctl" - args: ["status", "docker"] - - run: - collectorName: "systemctl-kubelet-status" - command: "systemctl" - args: ["status", "kubelet"] - - run: - collectorName: "systemctl-containerd-status" - command: "systemctl" - args: ["status", "containerd"] - # Logs for CRI, Kubelet, Kernel - - run: - collectorName: "journalctl-containerd" - command: "journalctl" - args: ["-u", "containerd", "--no-pager", "-S", "7 days ago"] - - run: - collectorName: "journalctl-kubelet" - command: "journalctl" - args: ["-u", "kubelet", "--no-pager", "-S", "7 days ago"] - - run: - collectorName: "journalctl-docker" - command: "journalctl" - args: ["-u", "docker", "--no-pager", "-S", "7 days ago"] - - run: - collectorName: "journalctl-dmesg" - command: "journalctl" - args: ["--dmesg", "--no-pager", "-S", "7 days ago"] diff --git a/host/cluster-down.yaml b/host/default.yaml similarity index 63% rename from host/cluster-down.yaml rename to host/default.yaml index 2b5c10b..54391c9 100644 --- a/host/cluster-down.yaml +++ b/host/default.yaml @@ -2,9 +2,9 @@ apiVersion: troubleshoot.sh/v1beta2 kind: SupportBundle metadata: - name: cluster-down + name: default spec: - uri: https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/host/cluster-down.yaml + uri: https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/host/default.yaml hostCollectors: # System Info Collectors - blockDevices: {} @@ -282,6 +282,225 @@ spec: - copy: collectorName: "kurl-logs" path: /var/log/kurl/* + #### cri.yaml #### + - diskUsage: + collectorName: root + path: / + - diskUsage: + collectorName: tmp + path: /tmp + - diskUsage: + collectorName: var-lib-kubelet + path: /var/lib/kubelet + - diskUsage: + collectorName: var-lib-docker + path: /var/lib/docker + - diskUsage: + collectorName: var-lib-containerd + path: /var/lib/containerd + - run: + collectorName: "docker-info" + command: "docker" + args: ["info"] + - run: + collectorName: "crictl-info" + command: "crictl" + args: ["info"] + - run: + collectorName: "crictl-ps" + command: "crictl" + args: ["ps", "-a"] + - run: + collectorName: "docker-ps" + command: "docker" + args: ["ps", "-a"] + - run: + collectorName: "docker-system-df" + command: "docker" + args: ["system", "df", "-v"] + - run: + collectorName: "systemctl-docker-status" + command: "systemctl" + args: ["status", "docker"] + - run: + collectorName: "systemctl-kubelet-status" + command: "systemctl" + args: ["status", "kubelet"] + - run: + collectorName: "systemctl-containerd-status" + command: "systemctl" + args: ["status", "containerd"] + # Logs for CRI, Kubelet, Kernel + - run: + collectorName: "journalctl-containerd" + command: "journalctl" + args: ["-u", "containerd", "--no-pager", "-S", "7 days ago"] + - run: + collectorName: "journalctl-kubelet" + command: "journalctl" + args: ["-u", "kubelet", "--no-pager", "-S", "7 days ago"] + - run: + collectorName: "journalctl-docker" + command: "journalctl" + args: ["-u", "docker", "--no-pager", "-S", "7 days ago"] + - run: + collectorName: "journalctl-dmesg" + command: "journalctl" + args: ["--dmesg", "--no-pager", "-S", "7 days ago"] + ### kubeadm-bootstrap.yaml ### + - run: + collectorName: "kubeadm.conf" + command: "cat" + args: ["/opt/replicated/kubeadm.conf"] + - run: + collectorName: "kubeadm-init-raw.yaml" + command: "cat" + args: ["/opt/replicated/kubeadm-init-raw.yaml"] + - run: + collectorName: "kubeadm-flags.env" + command: "cat" + args: ["/var/lib/kubelet/kubeadm-flags.env"] + - run: + collectorName: "kurl-host-preflights" + command: "tail" + args: ["-n", "+1", "/var/lib/kurl/host-preflights/*"] + - run: + collectorName: "kubeadm-kustomize-patches" + command: "sh" + args: ["-c", "find /var/lib/kurl/kustomize -type f -exec tail -n +1 {} +;"] + - run: + collectorName: "tmp-kubeadm.conf" + command: "cat" + args: ["/var/lib/kubelet/tmp-kubeadm.conf"] + ### networking-issues.yaml ### + - ipv4Interfaces: {} + - certificate: + collectorName: k8s-api-keypair + certificatePath: /etc/kubernetes/pki/apiserver.crt + keyPath: /etc/kubernetes/pki/apiserver.key + - certificate: + collectorName: etcd-keypair + certificatePath: /etc/kubernetes/pki/etcd/server.crt + keyPath: /etc/kubernetes/pki/etcd/server.key + - http: + collectorName: curl-api-replicated-com + get: + url: https://api.replicated.com/healthz + - http: + collectorName: curl-get-replicated-com + get: + url: https://get.replicated.com/healthz + - http: + collectorName: curl-registry-replicated-com + get: + url: https://registry.replicated.com/healthz + - http: + collectorName: curl-proxy-replicated-com + get: + url: https://proxy.replicated.com/healthz + - http: + collectorName: curl-k8s-kurl-sh + get: + url: https://k8s.kurl.sh/healthz + - http: + collectorName: curl-replicated-app + get: + url: https://replicated.app/healthz + - run: + collectorName: "sysctl" + command: "sysctl" + args: ["-a"] + - run: + collectorName: "iptables" + command: "iptables" + args: ["-L", "-v"] + - run: + collectorName: "netstat-route-table" + command: "netstat" + args: ["-r", "-n"] + - run: + collectorName: "netstat-ports" + command: "netstat" + args: ["-t", "-u", "-l", "-p", "-n"] + - run: + collectorName: "systemctl-firewalld-status" + command: "systemctl" + args: ["status", "firewalld"] + - run: + collectorName: "journalctl-dmesg" + command: "journalctl" + args: ["--dmesg", "--no-pager", "-S", "7 days ago"] + ### resource-contention.yaml ### + # System Info Collectors + - blockDevices: {} + - cpu: {} + - hostOS: {} + - hostServices: {} + - ipv4Interfaces: {} + - memory: {} + - time: {} + - run: + collectorName: "uptime" + command: "uptime" + args: [] + - run: + collectorName: "free" + command: "free" + args: ["-m"] + - run: + collectorName: "top" + command: "top" + args: ["-b", "-n", "1"] + - run: + collectorName: "uname" + command: "uname" + args: ["-a"] + - run: + collectorName: "df" + command: "df" + args: ["-h"] + - run: + collectorName: "du-root" + command: "sh" + args: ["-c", "du -Shax / --exclude /proc | sort -rh | head -20"] + - run: + collectorName: "mount" + command: "mount" + args: ["-l"] + - run: + collectorName: "iostat" + command: "iostat" + args: ["-x"] + - run: + collectorName: "vmstat" + command: "vmstat" + args: ["-w"] + - run: + collectorName: "iostat" + command: "iostat" + args: ["-x"] + - run: + collectorName: "ps-high-load" + command: "sh" + args: ["-c", "ps -eo s,user,cmd | grep ^[RD] | sort | uniq -c | sort -nbr | head -20"] + - run: + collectorName: "journalctl-dmesg" + command: "journalctl" + args: ["--dmesg", "--no-pager", "-S", "7 days ago"] + - filesystemPerformance: + collectorName: filesystem-latency-two-minute-benchmark + timeout: 2m + directory: /var/lib/etcd + fileSize: 22Mi + operationSizeBytes: 2300 + datasync: true + enableBackgroundIOPS: true + backgroundIOPSWarmupSeconds: 10 + backgroundWriteIOPS: 300 + backgroundWriteIOPSJobs: 6 + backgroundReadIOPS: 50 + backgroundReadIOPSJobs: 1 + exclude: true hostAnalyzers: - certificate: collectorName: k8s-api-keypair @@ -473,6 +692,135 @@ spec: message: curl -k https://localhost:6443/healthz returned HTTP CODE response 200. - warn: message: "Unexpected response. HTTP CODE response is not 200. Please, run `curl -ki https://localhost:6443/healthz` to check further information." + ### networking-issues.yaml ### + - certificate: + collectorName: k8s-api-keypair + outcomes: + - fail: + when: "key-pair-missing" + message: Certificate key pair not found in /etc/kubernetes/pki/apiserver.* + - fail: + when: "key-pair-switched" + message: Cert and key pair are switched + - fail: + when: "key-pair-encrypted" + message: Private key is encrypted + - fail: + when: "key-pair-mismatch" + message: Cert and key do not match + - fail: + when: "key-pair-invalid" + message: Certificate key pair is invalid + - pass: + when: "key-pair-valid" + message: Certificate key pair is valid + - certificate: + collectorName: etcd-keypair + outcomes: + - fail: + when: "key-pair-missing" + message: Certificate key pair not found in /etc/kubernetes/pki/etcd/server.* + - fail: + when: "key-pair-switched" + message: Cert and key pair are switched + - fail: + when: "key-pair-encrypted" + message: Private key is encrypted + - fail: + when: "key-pair-mismatch" + message: Cert and key do not match + - fail: + when: "key-pair-invalid" + message: Certificate key pair is invalid + - pass: + when: "key-pair-valid" + message: Certificate key pair is valid + - http: + checkName: curl-api-replicated-com + collectorName: curl-api-replicated-com + outcomes: + - warn: + when: "error" + message: Error connecting to https://api.replicated.com/healthz + - pass: + when: "statusCode == 200" + message: Connected to https://api.replicated.com/healthz + - warn: + message: "Unexpected response" + - http: + checkName: curl-registry-replicated-com + collectorName: curl-registry-replicated-com + outcomes: + - warn: + when: "error" + message: Error connecting to https://registry.replicated.com/healthz + - pass: + when: "statusCode == 200" + message: Connected to https://registry.replicated.com/healthz + - warn: + message: "Unexpected response" + - http: + checkName: curl-proxy-replicated-com + collectorName: curl-proxy-replicated-com + outcomes: + - warn: + when: "error" + message: Error connecting to https://proxy.replicated.com/healthz + - pass: + when: "statusCode == 200" + message: Connected to https://proxy.replicated.com/healthz + - warn: + message: "Unexpected response" + - http: + checkName: curl-k8s-kurl-sh + collectorName: curl-k8s-kurl-sh + outcomes: + - warn: + when: "error" + message: Error connecting to https://k8s.kurl.sh/healthz + - pass: + when: "statusCode == 200" + message: Connected to https://k8s.kurl.sh/healthz + - warn: + message: "Unexpected response" + - http: + checkName: curl-replicated-app + collectorName: curl-replicated-app + outcomes: + - warn: + when: "error" + message: Error connecting to https://replicated.app/healthz + - pass: + when: "statusCode == 200" + message: Connected to https://replicated.app/healthz + - warn: + message: "Unexpected response" + ### resource-contention.yaml ### + - cpu: + checkName: "Number of CPUs" + outcomes: + - warn: + when: "count < 4" + message: At least 4 CPU cores are recommended for kURL https://kurl.sh/docs/install-with-kurl/system-requirements + - pass: + message: This server has at least 4 CPU cores + - memory: + checkName: "Amount of Memory" + outcomes: + - warn: + when: "< 8G" + message: At least 8G of memory is recommended for kURL https://kurl.sh/docs/install-with-kurl/system-requirements + - pass: + message: The system has at least 8G of memory + - filesystemPerformance: + collectorName: filesystem-latency-two-minute-benchmark + outcomes: + - pass: + when: "p99 < 10ms" + message: "Write latency is ok (p99 target < 10ms)" + - warn: + message: "Write latency is high. p99 target >= 10ms)" + exclude: true analyzers: - textAnalyze: checkName: Hostname Mismatch diff --git a/host/kubeadm-bootstrap.yaml b/host/kubeadm-bootstrap.yaml deleted file mode 100644 index 0a08dfa..0000000 --- a/host/kubeadm-bootstrap.yaml +++ /dev/null @@ -1,30 +0,0 @@ -apiVersion: troubleshoot.sh/v1beta2 -kind: SupportBundle -metadata: - name: kubeadm-bootstrap -spec: - hostCollectors: - - run: - collectorName: "kubeadm.conf" - command: "cat" - args: ["/opt/replicated/kubeadm.conf"] - - run: - collectorName: "kubeadm-init-raw.yaml" - command: "cat" - args: ["/opt/replicated/kubeadm-init-raw.yaml"] - - run: - collectorName: "kubeadm-flags.env" - command: "cat" - args: ["/var/lib/kubelet/kubeadm-flags.env"] - - run: - collectorName: "kurl-host-preflights" - command: "tail" - args: ["-n", "+1", "/var/lib/kurl/host-preflights/*"] - - run: - collectorName: "kubeadm-kustomize-patches" - command: "sh" - args: ["-c", "find /var/lib/kurl/kustomize -type f -exec tail -n +1 {} +;"] - - run: - collectorName: "tmp-kubeadm.conf" - command: "cat" - args: ["/var/lib/kubelet/tmp-kubeadm.conf"] diff --git a/host/networking-issues.yaml b/host/networking-issues.yaml deleted file mode 100644 index 6b7559f..0000000 --- a/host/networking-issues.yaml +++ /dev/null @@ -1,167 +0,0 @@ -# Spec to identify issues with networking on the host in a non-airgapped deployment -apiVersion: troubleshoot.sh/v1beta2 -kind: SupportBundle -metadata: - name: networking-issues-non-airgap -spec: - hostCollectors: - - ipv4Interfaces: {} - - certificate: - collectorName: k8s-api-keypair - certificatePath: /etc/kubernetes/pki/apiserver.crt - keyPath: /etc/kubernetes/pki/apiserver.key - - certificate: - collectorName: etcd-keypair - certificatePath: /etc/kubernetes/pki/etcd/server.crt - keyPath: /etc/kubernetes/pki/etcd/server.key - - http: - collectorName: curl-api-replicated-com - get: - url: https://api.replicated.com/healthz - - http: - collectorName: curl-get-replicated-com - get: - url: https://get.replicated.com/healthz - - http: - collectorName: curl-registry-replicated-com - get: - url: https://registry.replicated.com/healthz - - http: - collectorName: curl-proxy-replicated-com - get: - url: https://proxy.replicated.com/healthz - - http: - collectorName: curl-k8s-kurl-sh - get: - url: https://k8s.kurl.sh/healthz - - http: - collectorName: curl-replicated-app - get: - url: https://replicated.app/healthz - - run: - collectorName: "sysctl" - command: "sysctl" - args: ["-a"] - - run: - collectorName: "iptables" - command: "iptables" - args: ["-L", "-v"] - - run: - collectorName: "netstat-route-table" - command: "netstat" - args: ["-r", "-n"] - - run: - collectorName: "netstat-ports" - command: "netstat" - args: ["-t", "-u", "-l", "-p", "-n"] - - run: - collectorName: "systemctl-firewalld-status" - command: "systemctl" - args: ["status", "firewalld"] - - run: - collectorName: "journalctl-dmesg" - command: "journalctl" - args: ["--dmesg", "--no-pager", "-S", "7 days ago"] - hostAnalyzers: - - certificate: - collectorName: k8s-api-keypair - outcomes: - - fail: - when: "key-pair-missing" - message: Certificate key pair not found in /etc/kubernetes/pki/apiserver.* - - fail: - when: "key-pair-switched" - message: Cert and key pair are switched - - fail: - when: "key-pair-encrypted" - message: Private key is encrypted - - fail: - when: "key-pair-mismatch" - message: Cert and key do not match - - fail: - when: "key-pair-invalid" - message: Certificate key pair is invalid - - pass: - when: "key-pair-valid" - message: Certificate key pair is valid - - certificate: - collectorName: etcd-keypair - outcomes: - - fail: - when: "key-pair-missing" - message: Certificate key pair not found in /etc/kubernetes/pki/etcd/server.* - - fail: - when: "key-pair-switched" - message: Cert and key pair are switched - - fail: - when: "key-pair-encrypted" - message: Private key is encrypted - - fail: - when: "key-pair-mismatch" - message: Cert and key do not match - - fail: - when: "key-pair-invalid" - message: Certificate key pair is invalid - - pass: - when: "key-pair-valid" - message: Certificate key pair is valid - - http: - checkName: curl-api-replicated-com - collectorName: curl-api-replicated-com - outcomes: - - warn: - when: "error" - message: Error connecting to https://api.replicated.com/healthz - - pass: - when: "statusCode == 200" - message: Connected to https://api.replicated.com/healthz - - warn: - message: "Unexpected response" - - http: - checkName: curl-registry-replicated-com - collectorName: curl-registry-replicated-com - outcomes: - - warn: - when: "error" - message: Error connecting to https://registry.replicated.com/healthz - - pass: - when: "statusCode == 200" - message: Connected to https://registry.replicated.com/healthz - - warn: - message: "Unexpected response" - - http: - checkName: curl-proxy-replicated-com - collectorName: curl-proxy-replicated-com - outcomes: - - warn: - when: "error" - message: Error connecting to https://proxy.replicated.com/healthz - - pass: - when: "statusCode == 200" - message: Connected to https://proxy.replicated.com/healthz - - warn: - message: "Unexpected response" - - http: - checkName: curl-k8s-kurl-sh - collectorName: curl-k8s-kurl-sh - outcomes: - - warn: - when: "error" - message: Error connecting to https://k8s.kurl.sh/healthz - - pass: - when: "statusCode == 200" - message: Connected to https://k8s.kurl.sh/healthz - - warn: - message: "Unexpected response" - - http: - checkName: curl-replicated-app - collectorName: curl-replicated-app - outcomes: - - warn: - when: "error" - message: Error connecting to https://replicated.app/healthz - - pass: - when: "statusCode == 200" - message: Connected to https://replicated.app/healthz - - warn: - message: "Unexpected response" diff --git a/host/resource-contention.yaml b/host/resource-contention.yaml deleted file mode 100644 index 10d9bf4..0000000 --- a/host/resource-contention.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# Spec to gather additional information about cpu, memory, and disk on the system to identify potential resource contention and performance issues -apiVersion: troubleshoot.sh/v1beta2 -kind: SupportBundle -metadata: - name: resource-contention -spec: - hostCollectors: - # System Info Collectors - - blockDevices: {} - - cpu: {} - - hostOS: {} - - hostServices: {} - - ipv4Interfaces: {} - - memory: {} - - time: {} - - run: - collectorName: "uptime" - command: "uptime" - args: [] - - run: - collectorName: "free" - command: "free" - args: ["-m"] - - run: - collectorName: "top" - command: "top" - args: ["-b", "-n", "1"] - - run: - collectorName: "uname" - command: "uname" - args: ["-a"] - - run: - collectorName: "df" - command: "df" - args: ["-h"] - - run: - collectorName: "du-root" - command: "sh" - args: ["-c", "du -Shax / --exclude /proc | sort -rh | head -20"] - - run: - collectorName: "mount" - command: "mount" - args: ["-l"] - - run: - collectorName: "iostat" - command: "iostat" - args: ["-x"] - - run: - collectorName: "vmstat" - command: "vmstat" - args: ["-w"] - - run: - collectorName: "iostat" - command: "iostat" - args: ["-x"] - - run: - collectorName: "ps-high-load" - command: "sh" - args: ["-c", "ps -eo s,user,cmd | grep ^[RD] | sort | uniq -c | sort -nbr | head -20"] - - run: - collectorName: "journalctl-dmesg" - command: "journalctl" - args: ["--dmesg", "--no-pager", "-S", "7 days ago"] - - filesystemPerformance: - collectorName: filesystem-latency-two-minute-benchmark - timeout: 2m - directory: /var/lib/etcd - fileSize: 22Mi - operationSizeBytes: 2300 - datasync: true - enableBackgroundIOPS: true - backgroundIOPSWarmupSeconds: 10 - backgroundWriteIOPS: 300 - backgroundWriteIOPSJobs: 6 - backgroundReadIOPS: 50 - backgroundReadIOPSJobs: 1 - exclude: true - hostAnalyzers: - - cpu: - checkName: "Number of CPUs" - outcomes: - - warn: - when: "count < 4" - message: At least 4 CPU cores are recommended for kURL https://kurl.sh/docs/install-with-kurl/system-requirements - - pass: - message: This server has at least 4 CPU cores - - memory: - checkName: "Amount of Memory" - outcomes: - - warn: - when: "< 8G" - message: At least 8G of memory is recommended for kURL https://kurl.sh/docs/install-with-kurl/system-requirements - - pass: - message: The system has at least 8G of memory - - filesystemPerformance: - collectorName: filesystem-latency-two-minute-benchmark - outcomes: - - pass: - when: "p99 < 10ms" - message: "Write latency is ok (p99 target < 10ms)" - - warn: - message: "Write latency is high. p99 target >= 10ms)" - exclude: true