Skip to content

Commit

Permalink
Merge pull request #2442 from rexagod/readyz
Browse files Browse the repository at this point in the history
fix: add `readyz` endpoint
  • Loading branch information
k8s-ci-robot authored Jul 15, 2024
2 parents a1fb0ce + dbb0276 commit f7618df
Show file tree
Hide file tree
Showing 10 changed files with 76 additions and 39 deletions.
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -346,11 +346,13 @@ After running the above, if you see `Clusterrolebinding "cluster-admin-binding"

#### Healthcheck Endpoints

The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:
The following healthcheck endpoints are available (`self` refers to the telemetry port, while `main` refers to the exposition port):

* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend to use this as a liveness probe.
* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available for both ports, we recommend to use the telemetry metrics endpoint as a readiness probe.
* `/healthz`: Returns a 200 status code if the application is running. We recommend to use this as a startup probe.
* `/healthz` (exposed on `main`): Returns a 200 status code if the application is running. We recommend using this for the startup probe.
* `/livez` (exposed on `main`): Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend using this for the liveness probe.
* `/readyz` (exposed on `self`): Returns a 200 status code if the application is ready to accept requests and expose metrics. We recommend using this for the readiness probe.

Note that it is discouraged to use the telemetry metrics endpoint for any probe when proxying the exposition data.

#### Limited privileges environment

Expand Down
10 changes: 6 additions & 4 deletions README.md.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -347,11 +347,13 @@ After running the above, if you see `Clusterrolebinding "cluster-admin-binding"

#### Healthcheck Endpoints

The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:
The following healthcheck endpoints are available (`self` refers to the telemetry port, while `main` refers to the exposition port):

* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend to use this as a liveness probe.
* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available for both ports, we recommend to use the telemetry metrics endpoint as a readiness probe.
* `/healthz`: Returns a 200 status code if the application is running. We recommend to use this as a startup probe.
* `/healthz` (exposed on `main`): Returns a 200 status code if the application is running. We recommend using this for the startup probe.
* `/livez` (exposed on `main`): Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend using this for the liveness probe.
* `/readyz` (exposed on `self`): Returns a 200 status code if the application is ready to accept requests and expose metrics. We recommend using this for the readiness probe.

Note that it is discouraged to use the telemetry metrics endpoint for any probe when proxying the exposition data.

#### Limited privileges environment

Expand Down
6 changes: 3 additions & 3 deletions examples/autosharding/statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ spec:
livenessProbe:
httpGet:
path: /livez
port: 8080
port: http-metrics
initialDelaySeconds: 5
timeoutSeconds: 5
name: kube-state-metrics
Expand All @@ -49,8 +49,8 @@ spec:
name: telemetry
readinessProbe:
httpGet:
path: /metrics
port: 8081
path: /readyz
port: telemetry
initialDelaySeconds: 5
timeoutSeconds: 5
securityContext:
Expand Down
6 changes: 3 additions & 3 deletions examples/daemonsetsharding/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ spec:
livenessProbe:
httpGet:
path: /livez
port: 8080
port: http-metrics
initialDelaySeconds: 5
timeoutSeconds: 5
name: kube-state-metrics-shard
Expand All @@ -44,8 +44,8 @@ spec:
name: telemetry
readinessProbe:
httpGet:
path: /metrics
port: 8081
path: /readyz
port: telemetry
initialDelaySeconds: 5
timeoutSeconds: 5
securityContext:
Expand Down
6 changes: 3 additions & 3 deletions examples/daemonsetsharding/deployment-no-node-pods.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ spec:
livenessProbe:
httpGet:
path: /livez
port: 8080
port: http-metrics
initialDelaySeconds: 5
timeoutSeconds: 5
name: kube-state-metrics
Expand All @@ -39,8 +39,8 @@ spec:
name: telemetry
readinessProbe:
httpGet:
path: /metrics
port: 8081
path: /readyz
port: telemetry
initialDelaySeconds: 5
timeoutSeconds: 5
securityContext:
Expand Down
6 changes: 3 additions & 3 deletions examples/daemonsetsharding/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ spec:
livenessProbe:
httpGet:
path: /livez
port: 8080
port: http-metrics
initialDelaySeconds: 5
timeoutSeconds: 5
name: kube-state-metrics
Expand All @@ -38,8 +38,8 @@ spec:
name: telemetry
readinessProbe:
httpGet:
path: /metrics
port: 8081
path: /readyz
port: telemetry
initialDelaySeconds: 5
timeoutSeconds: 5
securityContext:
Expand Down
6 changes: 3 additions & 3 deletions examples/standard/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ spec:
livenessProbe:
httpGet:
path: /livez
port: 8080
port: http-metrics
initialDelaySeconds: 5
timeoutSeconds: 5
name: kube-state-metrics
Expand All @@ -36,8 +36,8 @@ spec:
name: telemetry
readinessProbe:
httpGet:
path: /metrics
port: 8081
path: /readyz
port: telemetry
initialDelaySeconds: 5
timeoutSeconds: 5
securityContext:
Expand Down
6 changes: 3 additions & 3 deletions jsonnet/kube-state-metrics/kube-state-metrics.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -192,12 +192,12 @@
seccompProfile: { type: 'RuntimeDefault' },
},
livenessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
port: 8080,
port: "http-metrics",
path: '/livez',
} },
readinessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
port: 8081,
path: '/metrics',
port: "telemetry",
path: '/readyz',
} },
};

Expand Down
39 changes: 27 additions & 12 deletions pkg/app/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ const (
metricsPath = "/metrics"
healthzPath = "/healthz"
livezPath = "/livez"
readyzPath = "/readyz"
)

// promLogger implements promhttp.Logger
Expand Down Expand Up @@ -376,6 +377,18 @@ func buildTelemetryServer(registry prometheus.Gatherer) *http.ServeMux {
// Add metricsPath
mux.Handle(metricsPath, promhttp.HandlerFor(registry, promhttp.HandlerOpts{ErrorLog: promLogger{}}))

// Add readyzPath
mux.Handle(readyzPath, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
count, err := util.GatherAndCount(registry)
if err != nil || count == 0 {
w.WriteHeader(http.StatusServiceUnavailable)
w.Write([]byte(http.StatusText(http.StatusServiceUnavailable)))
return
}
w.WriteHeader(http.StatusOK)
w.Write([]byte(http.StatusText(http.StatusOK)))
}))

// Add index
landingConfig := web.LandingConfig{
Name: "kube-state-metrics",
Expand All @@ -396,6 +409,19 @@ func buildTelemetryServer(registry prometheus.Gatherer) *http.ServeMux {
return mux
}

// handleClusterDelegationForProber returns a handler that answers a probe by
// delegating it to the Kubernetes API server. It issues a GET against the
// absolute path probeType (for example "/livez") using client's REST client
// and replies with 200 OK when that request succeeds, or with 503 Service
// Unavailable when it reports an error. The response body is the standard
// status text for the returned code.
//
// The upstream request is bound to the incoming request's context, so it is
// abandoned as soon as the prober gives up (for example when it times out and
// closes the connection) instead of continuing to run in the background.
func handleClusterDelegationForProber(client kubernetes.Interface, probeType string) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		status := http.StatusOK
		got := client.CoreV1().RESTClient().Get().AbsPath(probeType).Do(r.Context())
		if got.Error() != nil {
			status = http.StatusServiceUnavailable
		}

		w.WriteHeader(status)
		// A failed write means the prober already hung up; there is nothing
		// useful left to do with the error.
		_, _ = w.Write([]byte(http.StatusText(status)))
	}
}

func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prometheus.ObserverVec, client kubernetes.Interface) *http.ServeMux {
mux := http.NewServeMux()

Expand All @@ -410,18 +436,7 @@ func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prome
mux.Handle(metricsPath, promhttp.InstrumentHandlerDuration(durationObserver, m))

// Add livezPath
mux.Handle(livezPath, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {

// Query the Kube API to make sure we are not affected by a network outage.
got := client.CoreV1().RESTClient().Get().AbsPath("/livez").Do(context.Background())
if got.Error() != nil {
w.WriteHeader(http.StatusServiceUnavailable)
w.Write([]byte(http.StatusText(http.StatusServiceUnavailable)))
return
}
w.WriteHeader(http.StatusOK)
w.Write([]byte(http.StatusText(http.StatusOK)))
}))
mux.Handle(livezPath, handleClusterDelegationForProber(client, livezPath))

// Add healthzPath
mux.HandleFunc(healthzPath, func(w http.ResponseWriter, _ *http.Request) {
Expand Down
20 changes: 19 additions & 1 deletion pkg/util/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
"runtime"
"strings"

"github.com/prometheus/common/version"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/discovery"
Expand All @@ -32,6 +31,9 @@ import (
"k8s.io/klog/v2"
testUnstructuredMock "k8s.io/sample-controller/pkg/apis/samplecontroller/v1alpha1"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/version"

"k8s.io/kube-state-metrics/v2/pkg/customresource"
)

Expand Down Expand Up @@ -154,3 +156,19 @@ func GVRFromType(resourceName string, expectedType interface{}) *schema.GroupVer
Resource: r,
}
}

// GatherAndCount collects every metric family exposed by g and reports how
// many individual metrics (children) those families contain in total. When
// gathering fails it returns 0 together with the wrapped error.
func GatherAndCount(g prometheus.Gatherer) (int, error) {
	families, gatherErr := g.Gather()
	if gatherErr != nil {
		return 0, fmt.Errorf("gathering metrics failed: %w", gatherErr)
	}

	var total int
	for i := range families {
		total += len(families[i].GetMetric())
	}
	return total, nil
}

0 comments on commit f7618df

Please sign in to comment.