diff --git a/README.md b/README.md index 8ad34988..f0733122 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,8 @@ Read chapter 4 of the full [thesis](https://github.com/timebertt/thesis-controll - [samples-generator](webhosting-operator/cmd/samples-generator): a tool for generating a given amount of random `Website` objects - [monitoring setup](hack/config/monitoring): a setup for monitoring and measuring load test experiments for the sample operator - includes [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) - - [webhosting-exporter](webhosting-operator/config/monitoring/webhosting-exporter) (based on the [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) [custom resource metrics feature](https://github.com/kubernetes/kube-state-metrics/blob/main/docs/customresourcestate-metrics.md)) for metrics on the state of the webhosting-operator's API objects + - [sharding-exporter](config/monitoring/sharding-exporter) (based on the [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) [custom resource metrics feature](https://github.com/kubernetes/kube-state-metrics/blob/main/docs/customresourcestate-metrics.md)) for metrics on the state of shards + - [webhosting-exporter](webhosting-operator/config/monitoring/webhosting-exporter) for metrics on the state of the webhosting-operator's API objects (similar to above) - [grafana](https://github.com/grafana/grafana) along with some dashboards for [controller-runtime](hack/config/monitoring/default/dashboards) and [webhosting-operator and sharding](webhosting-operator/config/monitoring/default/dashboards) - [experiment](webhosting-operator/cmd/experiment): a tool (based on controller-runtime) for executing load test scenarios for the webhosting-operator - [measure](webhosting-operator/cmd/measure): a tool for retrieving configurable measurements from prometheus and storing them in csv-formatted files for further analysis (with `numpy`) and visualization (with 
`matplotlib`) diff --git a/config/monitoring/default/kustomization.yaml b/config/monitoring/default/kustomization.yaml new file mode 100644 index 00000000..1c6dda75 --- /dev/null +++ b/config/monitoring/default/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../sharder +- ../sharding-exporter diff --git a/config/monitoring/kustomization.yaml b/config/monitoring/sharder/kustomization.yaml similarity index 100% rename from config/monitoring/kustomization.yaml rename to config/monitoring/sharder/kustomization.yaml diff --git a/config/monitoring/prometheus_rbac.yaml b/config/monitoring/sharder/prometheus_rbac.yaml similarity index 100% rename from config/monitoring/prometheus_rbac.yaml rename to config/monitoring/sharder/prometheus_rbac.yaml diff --git a/config/monitoring/sharder_servicemonitor.yaml b/config/monitoring/sharder/sharder_servicemonitor.yaml similarity index 100% rename from config/monitoring/sharder_servicemonitor.yaml rename to config/monitoring/sharder/sharder_servicemonitor.yaml diff --git a/config/monitoring/sharding-exporter/clusterrole.yaml b/config/monitoring/sharding-exporter/clusterrole.yaml new file mode 100644 index 00000000..de830883 --- /dev/null +++ b/config/monitoring/sharding-exporter/clusterrole.yaml @@ -0,0 +1,29 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: sharding:exporter +rules: +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch +- apiGroups: + - sharding.timebertt.dev + resources: + - clusterrings + verbs: + - get + - list + - watch diff --git a/config/monitoring/sharding-exporter/clusterrolebinding.yaml b/config/monitoring/sharding-exporter/clusterrolebinding.yaml new file mode 100644 index 00000000..1d74f927 --- /dev/null +++ 
b/config/monitoring/sharding-exporter/clusterrolebinding.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: sharding:exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: sharding:exporter +subjects: +- kind: ServiceAccount + name: sharding-exporter diff --git a/config/monitoring/sharding-exporter/config.yaml b/config/monitoring/sharding-exporter/config.yaml new file mode 100644 index 00000000..df447da0 --- /dev/null +++ b/config/monitoring/sharding-exporter/config.yaml @@ -0,0 +1,71 @@ +kind: CustomResourceStateMetrics +spec: + resources: + # shard metrics + - metricNamePrefix: kube_shard + groupVersionKind: + group: coordination.k8s.io + version: v1 + kind: Lease + labelsFromPath: + namespace: [metadata, namespace] + shard: [metadata, name] + clusterring: [metadata, labels, sharding.alpha.kubernetes.io/clusterring] + metrics: + - name: info + help: "Information about a Shard" + each: + type: Info + info: + labelsFromPath: + uid: [metadata, uid] + - name: state + help: "The Shard's current state" + each: + type: StateSet + stateSet: + labelName: state + path: [metadata, labels, sharding.alpha.kubernetes.io/state] + list: [orphaned, dead, uncertain, expired, ready, unknown] + # The usual leader election leases don't have the state label making the generator log errors. + # Hence, decrease verbosity of such errors to reduce distraction. 
+ errorLogV: 4 + # clusterring metrics + - metricNamePrefix: kube_clusterring + groupVersionKind: + group: sharding.timebertt.dev + version: v1alpha1 + kind: ClusterRing + labelsFromPath: + clusterring: [metadata, name] + uid: [metadata, uid] + metrics: + - name: info + help: "Information about a ClusterRing" + each: + type: Info + info: {} + - name: metadata_generation + help: "The generation of a ClusterRing" + each: + type: Gauge + gauge: + path: [metadata, generation] + - name: observed_generation + help: "The latest generation observed by the ClusterRing controller" + each: + type: Gauge + gauge: + path: [status, observedGeneration] + - name: status_shards + help: "The ClusterRing's total number of shards" + each: + type: Gauge + gauge: + path: [status, shards] + - name: status_available_shards + help: "The ClusterRing's total number of available shards" + each: + type: Gauge + gauge: + path: [status, availableShards] diff --git a/config/monitoring/sharding-exporter/deployment.yaml b/config/monitoring/sharding-exporter/deployment.yaml new file mode 100644 index 00000000..9cde50b0 --- /dev/null +++ b/config/monitoring/sharding-exporter/deployment.yaml @@ -0,0 +1,98 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: sharding-exporter +spec: + replicas: 1 + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: sharding-exporter + spec: + automountServiceAccountToken: true + containers: + - args: + - --host=127.0.0.1 + - --port=8081 + - --telemetry-host=127.0.0.1 + - --telemetry-port=8082 + - --custom-resource-state-only + - --custom-resource-state-config-file=/etc/kube-state-metrics/config/config.yaml + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.9.2 + name: sharding-exporter + resources: + limits: + cpu: 250m + memory: 400Mi + requests: + cpu: 100m + memory: 200Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsUser: 65534 + volumeMounts: + - 
name: config + mountPath: /etc/kube-state-metrics/config + - args: + - --logtostderr + - --secure-listen-address=:8443 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:8081/ + image: quay.io/brancz/kube-rbac-proxy:v0.14.0 + name: kube-rbac-proxy-main + ports: + - containerPort: 8443 + name: https-main + resources: + limits: + cpu: 40m + memory: 40Mi + requests: + cpu: 20m + memory: 20Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + - args: + - --logtostderr + - --secure-listen-address=:9443 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:8082/ + image: quay.io/brancz/kube-rbac-proxy:v0.14.0 + name: kube-rbac-proxy-self + ports: + - containerPort: 9443 + name: https-self + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + volumes: + - name: config + configMap: + name: sharding-exporter-config + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: sharding-exporter diff --git a/config/monitoring/sharding-exporter/kustomization.yaml b/config/monitoring/sharding-exporter/kustomization.yaml new file mode 100644 index 00000000..7bd8da6d --- /dev/null +++ b/config/monitoring/sharding-exporter/kustomization.yaml @@ -0,0 +1,28 @@ +apiVersion: 
kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: sharding-system + +generatorOptions: + disableNameSuffixHash: true + +labels: +- includeSelectors: true + pairs: + app.kubernetes.io/name: controller-sharding + app.kubernetes.io/component: sharding-exporter + +resources: +- clusterrole.yaml +- clusterrolebinding.yaml +- rbac-proxy_clusterrole.yaml +- rbac-proxy_clusterrolebinding.yaml +- serviceaccount.yaml +- service.yaml +- deployment.yaml +- servicemonitor.yaml + +configMapGenerator: +- name: sharding-exporter-config + files: + - config.yaml diff --git a/config/monitoring/sharding-exporter/rbac-proxy_clusterrole.yaml b/config/monitoring/sharding-exporter/rbac-proxy_clusterrole.yaml new file mode 100644 index 00000000..29a0e038 --- /dev/null +++ b/config/monitoring/sharding-exporter/rbac-proxy_clusterrole.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: sharding:exporter:rbac-proxy +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create diff --git a/config/monitoring/sharding-exporter/rbac-proxy_clusterrolebinding.yaml b/config/monitoring/sharding-exporter/rbac-proxy_clusterrolebinding.yaml new file mode 100644 index 00000000..52e780a2 --- /dev/null +++ b/config/monitoring/sharding-exporter/rbac-proxy_clusterrolebinding.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: sharding:exporter:rbac-proxy +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: sharding:exporter:rbac-proxy +subjects: +- kind: ServiceAccount + name: sharding-exporter diff --git a/config/monitoring/sharding-exporter/service.yaml b/config/monitoring/sharding-exporter/service.yaml new file mode 100644 index 00000000..92d018f2 --- /dev/null +++ b/config/monitoring/sharding-exporter/service.yaml @@ -0,0 +1,13 @@ 
+apiVersion: v1 +kind: Service +metadata: + name: sharding-exporter +spec: + clusterIP: None + ports: + - name: https-main + port: 8443 + targetPort: https-main + - name: https-self + port: 9443 + targetPort: https-self diff --git a/config/monitoring/sharding-exporter/serviceaccount.yaml b/config/monitoring/sharding-exporter/serviceaccount.yaml new file mode 100644 index 00000000..7a2c3306 --- /dev/null +++ b/config/monitoring/sharding-exporter/serviceaccount.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: sharding-exporter +automountServiceAccountToken: false diff --git a/config/monitoring/sharding-exporter/servicemonitor.yaml b/config/monitoring/sharding-exporter/servicemonitor.yaml new file mode 100644 index 00000000..ed1d471b --- /dev/null +++ b/config/monitoring/sharding-exporter/servicemonitor.yaml @@ -0,0 +1,28 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: sharding-exporter +spec: + jobLabel: app.kubernetes.io/component + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 10s + port: https-main + relabelings: + - action: labeldrop + regex: (pod|service|endpoint|namespace) + scheme: https + scrapeTimeout: 10s + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + port: https-self + scheme: https + tlsConfig: + insecureSkipVerify: true + selector: + matchLabels: + app.kubernetes.io/name: controller-sharding + app.kubernetes.io/component: sharding-exporter diff --git a/hack/config/skaffold.yaml b/hack/config/skaffold.yaml index 7220303e..997c3d2c 100644 --- a/hack/config/skaffold.yaml +++ b/hack/config/skaffold.yaml @@ -212,7 +212,7 @@ manifests: kustomize: paths: - config/default - - config/monitoring + - config/monitoring/default deploy: kubectl: flags: diff --git a/webhosting-operator/config/monitoring/default/dashboards/sharding.json 
b/webhosting-operator/config/monitoring/default/dashboards/sharding.json index 598e70ca..228fdb59 100644 --- a/webhosting-operator/config/monitoring/default/dashboards/sharding.json +++ b/webhosting-operator/config/monitoring/default/dashboards/sharding.json @@ -96,7 +96,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(kube_shard_info{app=\"$app\"}) or vector(0)", + "expr": "sum(kube_shard_info{clusterring=\"$clusterring\"}) or vector(0)", "refId": "A" } ], @@ -285,7 +285,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(kube_shard_state{app=\"$app\"}) by (state)", + "expr": "sum(kube_shard_state{clusterring=\"$clusterring\"}) by (state)", "hide": false, "instant": false, "interval": "", @@ -363,7 +363,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "(sum(kube_shard_state{app=\"$app\",state=\"ready\"}) or vector(0)) / (sum(kube_shard_info{app=\"$app\"}) or vector(0)) * 100", + "expr": "(sum(kube_shard_state{clusterring=\"$clusterring\",state=\"ready\"}) or vector(0)) / (sum(kube_shard_info{clusterring=\"$clusterring\"}) or vector(0)) * 100", "instant": true, "range": false, "refId": "A" @@ -432,7 +432,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(kube_website_shard{namespace=~\"$project\",shard!=\"\"} or 0*sum(kube_shard_info{app=\"$app\"}) by (shard)) by (shard) / ignoring(shard) group_left sum(kube_website_shard{namespace=~\"$project\"}) or vector(0)", + "expr": "sum(kube_website_shard{namespace=~\"$project\",shard!=\"\"} or 0*sum(kube_shard_info{clusterring=\"$clusterring\"}) by (shard)) by (shard) / ignoring(shard) group_left sum(kube_website_shard{namespace=~\"$project\"}) or vector(0)", "instant": true, "legendFormat": "__auto", "range": false, @@ -544,7 +544,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(kube_website_shard{namespace=~\"$project\",shard!=\"\"} or 0*sum(kube_shard_info{app=\"$app\"}) by (shard)) by (shard)", + "expr": 
"sum(kube_website_shard{namespace=~\"$project\",shard!=\"\"} or 0*sum(kube_shard_info{clusterring=\"$clusterring\"}) by (shard)) by (shard)", "instant": false, "interval": "", "legendFormat": "{{shard}}", @@ -646,7 +646,7 @@ }, "editorMode": "code", "exemplar": true, - "expr": "sum(kube_website_shard{namespace=~\"$project\",shard!=\"\"} or 0*sum(kube_shard_info{app=\"$app\"}) by (shard)) by (shard) / ignoring(shard) group_left sum(kube_website_shard{namespace=~\"$project\"})", + "expr": "sum(kube_website_shard{namespace=~\"$project\",shard!=\"\"} or 0*sum(kube_shard_info{clusterring=\"$clusterring\"}) by (shard)) by (shard) / ignoring(shard) group_left sum(kube_website_shard{namespace=~\"$project\"})", "format": "time_series", "instant": false, "interval": "", @@ -810,7 +810,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(kube_website_shard{namespace=~\"$project\",shard!=\"\",drain=\"\"} * on(shard) group_left kube_shard_state{app=\"$app\",state=\"ready\"}) or vector(0)", + "expr": "sum(kube_website_shard{namespace=~\"$project\",shard!=\"\",drain=\"\"} * on(shard) group_left kube_shard_state{clusterring=\"$clusterring\",state=\"ready\"}) or vector(0)", "instant": false, "interval": "", "legendFormat": "Assigned", @@ -824,7 +824,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(kube_website_shard{namespace=~\"$project\",shard!=\"\",drain=\"\"} * on(shard) group_left max(kube_shard_state{app=\"$app\",state!=\"ready\"}) by (shard)) or vector(0)", + "expr": "sum(kube_website_shard{namespace=~\"$project\",shard!=\"\",drain=\"\"} * on(shard) group_left max(kube_shard_state{clusterring=\"$clusterring\",state!=\"ready\"}) by (shard)) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -1211,14 +1211,14 @@ "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(kube_shard_info{job=\"webhosting-exporter\"}, app)", + "definition": "label_values(kube_shard_info, clusterring)", "hide": 0, "includeAll": true, 
"multi": true, - "name": "app", + "name": "clusterring", "options": [], "query": { - "query": "label_values(kube_shard_info{job=\"webhosting-exporter\"}, app)", + "query": "label_values(kube_shard_info, clusterring)", "refId": "StandardVariableQuery" }, "refresh": 2, @@ -1242,14 +1242,14 @@ "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(controller_runtime_sharding_assignments_total{job=\"webhosting-operator\"}, kind)", + "definition": "label_values(controller_runtime_sharding_assignments_total, kind)", "hide": 0, "includeAll": true, "multi": true, "name": "kind", "options": [], "query": { - "query": "label_values(controller_runtime_sharding_assignments_total{job=\"webhosting-operator\"}, kind)", + "query": "label_values(controller_runtime_sharding_assignments_total, kind)", "refId": "StandardVariableQuery" }, "refresh": 2,