From fca032adc3a26d7ab8f2dc50be5925168a390a7a Mon Sep 17 00:00:00 2001 From: Lucas Kent Date: Tue, 24 Sep 2024 08:53:12 +1000 Subject: [PATCH] fix `shotover_chain_messages_per_batch_count` metric (#1633) --- docs/src/user-guide/observability.md | 3 +- .../tests/runner/observability_int_tests.rs | 22 +++++++++++ shotover/src/transforms/chain.rs | 39 +++++++++++++------ 3 files changed, 52 insertions(+), 12 deletions(-) diff --git a/docs/src/user-guide/observability.md b/docs/src/user-guide/observability.md index 357808ad3..a823024d0 100644 --- a/docs/src/user-guide/observability.md +++ b/docs/src/user-guide/observability.md @@ -10,7 +10,8 @@ This optional interface will serve Prometheus metrics from `/metrics`. It will b | `shotover_chain_total_count` | `chain` | [counter](#counter) | Counts the amount of times `chain` is used | | `shotover_chain_failures_count` | `chain` | [counter](#counter) | Counts the amount of times `chain` fails | | `shotover_chain_latency_seconds` | `chain` | [histogram](#histogram) | The latency for running `chain` | -| `shotover_chain_messages_per_batch_count` | `chain` | [histogram](#histogram) | The number of messages in each batch passing through `chain`. | +| `shotover_chain_requests_batch_size` | `chain` | [histogram](#histogram) | The number of requests in each request batch passing through `chain`. | +| `shotover_chain_responses_batch_size` | `chain` | [histogram](#histogram) | The number of responses in each response batch passing through `chain`. | | `shotover_available_connections_count` | `source` | [gauge](#gauge) | How many more connections can be opened to `source` before new connections will be rejected. | | `connections_opened` | `source` | [counter](#counter) | Counts the total number of connections that clients have opened against this source. | | `shotover_source_to_sink_latency_seconds` | `sink` | [histogram](#histogram) | The milliseconds between reading a request from a source TCP connection and writing it to a sink TCP connection | diff --git a/shotover-proxy/tests/runner/observability_int_tests.rs b/shotover-proxy/tests/runner/observability_int_tests.rs index 17dc7e68a..dab8f7591 100644 --- a/shotover-proxy/tests/runner/observability_int_tests.rs +++ b/shotover-proxy/tests/runner/observability_int_tests.rs @@ -15,6 +15,8 @@ async fn test_metrics() { # TYPE shotover_available_connections_count gauge # TYPE shotover_chain_failures_count counter # TYPE shotover_chain_messages_per_batch_count summary +# TYPE shotover_chain_requests_batch_size summary +# TYPE shotover_chain_responses_batch_size summary # TYPE shotover_chain_total_count counter # TYPE shotover_query_count counter # TYPE shotover_sink_to_source_latency_seconds summary @@ -34,6 +36,26 @@ shotover_chain_messages_per_batch_count{chain="redis",quantile="0.95"} shotover_chain_messages_per_batch_count{chain="redis",quantile="0.99"} shotover_chain_messages_per_batch_count{chain="redis",quantile="0.999"} shotover_chain_messages_per_batch_count{chain="redis",quantile="1"} +shotover_chain_requests_batch_size_count{chain="redis"} +shotover_chain_requests_batch_size_sum{chain="redis"} +shotover_chain_requests_batch_size{chain="redis",quantile="0"} +shotover_chain_requests_batch_size{chain="redis",quantile="0.1"} +shotover_chain_requests_batch_size{chain="redis",quantile="0.5"} +shotover_chain_requests_batch_size{chain="redis",quantile="0.9"} +shotover_chain_requests_batch_size{chain="redis",quantile="0.95"} +shotover_chain_requests_batch_size{chain="redis",quantile="0.99"} +shotover_chain_requests_batch_size{chain="redis",quantile="0.999"} +shotover_chain_requests_batch_size{chain="redis",quantile="1"} +shotover_chain_responses_batch_size_count{chain="redis"} +shotover_chain_responses_batch_size_sum{chain="redis"} +shotover_chain_responses_batch_size{chain="redis",quantile="0"} +shotover_chain_responses_batch_size{chain="redis",quantile="0.1"} +shotover_chain_responses_batch_size{chain="redis",quantile="0.5"} +shotover_chain_responses_batch_size{chain="redis",quantile="0.9"} +shotover_chain_responses_batch_size{chain="redis",quantile="0.95"} +shotover_chain_responses_batch_size{chain="redis",quantile="0.99"} +shotover_chain_responses_batch_size{chain="redis",quantile="0.999"} +shotover_chain_responses_batch_size{chain="redis",quantile="1"} shotover_chain_total_count{chain="redis"} shotover_query_count{name="redis-chain"} shotover_sink_to_source_latency_seconds_count{source="redis"} diff --git a/shotover/src/transforms/chain.rs b/shotover/src/transforms/chain.rs index f27325568..35a9b2e91 100644 --- a/shotover/src/transforms/chain.rs +++ b/shotover/src/transforms/chain.rs @@ -58,7 +58,8 @@ pub struct TransformChain { chain_total: Counter, chain_failures: Counter, - chain_batch_size: Histogram, + chain_requests_batch_size: Histogram, + chain_responses_batch_size: Histogram, chain_latency_seconds: Histogram, } @@ -165,12 +166,21 @@ impl TransformChain { let start = Instant::now(); chain_state.reset(&mut self.chain); - self.chain_batch_size - .record(chain_state.requests.len() as f64); + if !chain_state.requests.is_empty() { + self.chain_requests_batch_size + .record(chain_state.requests.len() as f64); + } + let result = chain_state.call_next_transform().await; self.chain_total.increment(1); - if result.is_err() { - self.chain_failures.increment(1); + match &result { + Ok(responses) => { + if !responses.is_empty() { + self.chain_responses_batch_size + .record(responses.len() as f64); + } + } + Err(_) => self.chain_failures.increment(1), } self.chain_latency_seconds.record(start.elapsed()); @@ -221,7 +231,8 @@ pub struct TransformChainBuilder { chain_total: Counter, chain_failures: Counter, - chain_batch_size: Histogram, + chain_responses_batch_size: Histogram, + chain_requests_batch_size: Histogram, } impl TransformChainBuilder { @@ -235,18 +246,23 @@ impl TransformChainBuilder { } ).collect(); - let chain_batch_size = - histogram!("shotover_chain_messages_per_batch_count", "chain" => name); + // This is deprecated but give users some time to migrate to the requests/responses versions that have replaced this metric + histogram!("shotover_chain_messages_per_batch_count", "chain" => name).record(0); + + let chain_requests_batch_size = + histogram!("shotover_chain_requests_batch_size", "chain" => name); + let chain_responses_batch_size = + histogram!("shotover_chain_responses_batch_size", "chain" => name); let chain_total = counter!("shotover_chain_total_count", "chain" => name); let chain_failures = counter!("shotover_chain_failures_count", "chain" => name); - // Cant register shotover_chain_latency_seconds because a unique one is created for each client ip address TransformChainBuilder { name, chain, chain_total, chain_failures, - chain_batch_size, + chain_requests_batch_size, + chain_responses_batch_size, } } @@ -376,7 +392,8 @@ impl TransformChainBuilder { chain, chain_total: self.chain_total.clone(), chain_failures: self.chain_failures.clone(), - chain_batch_size: self.chain_batch_size.clone(), + chain_requests_batch_size: self.chain_requests_batch_size.clone(), + chain_responses_batch_size: self.chain_responses_batch_size.clone(), chain_latency_seconds: histogram!( "shotover_chain_latency_seconds", "chain" => self.name,