From b1e6d3ad8c0783b3675397a3c86ce6e497ab04e5 Mon Sep 17 00:00:00 2001 From: Lucas Kent Date: Wed, 29 Nov 2023 13:55:43 +1100 Subject: [PATCH] Add shotover metrics profiler --- Cargo.lock | 14 ++ shotover-proxy/Cargo.toml | 2 + shotover-proxy/benches/windsock/cassandra.rs | 3 +- shotover-proxy/benches/windsock/kafka.rs | 6 +- shotover-proxy/benches/windsock/profilers.rs | 39 ++++- .../windsock/profilers/shotover_metrics.rs | 149 ++++++++++++++++++ shotover-proxy/benches/windsock/redis.rs | 6 +- windsock/src/report.rs | 7 + 8 files changed, 219 insertions(+), 7 deletions(-) create mode 100644 shotover-proxy/benches/windsock/profilers/shotover_metrics.rs diff --git a/Cargo.lock b/Cargo.lock index 10962e41b..025b77ea6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3753,6 +3753,18 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prometheus-parse" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2aa5feb83bf4b2c8919eaf563f51dbab41183de73ba2353c0e03cd7b6bd892" +dependencies = [ + "chrono", + "itertools 0.10.5", + "once_cell", + "regex", +] + [[package]] name = "quanta" version = "0.11.1" @@ -4792,11 +4804,13 @@ dependencies = [ "itertools 0.12.0", "nix 0.27.1", "opensearch", + "prometheus-parse", "rand 0.8.5", "rand_distr", "redis", "redis-protocol", "regex", + "reqwest", "rstest", "rstest_reuse", "rustls-pemfile", diff --git a/shotover-proxy/Cargo.toml b/shotover-proxy/Cargo.toml index 583886d28..f1b834330 100644 --- a/shotover-proxy/Cargo.toml +++ b/shotover-proxy/Cargo.toml @@ -8,9 +8,11 @@ license = "Apache-2.0" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +prometheus-parse = "0.2.4" shotover = { path = "../shotover" } [dev-dependencies] +reqwest.workspace = true scylla.workspace = true anyhow.workspace = true tokio.workspace = true diff --git a/shotover-proxy/benches/windsock/cassandra.rs b/shotover-proxy/benches/windsock/cassandra.rs index eceb452ac..785aad11f 100644 --- a/shotover-proxy/benches/windsock/cassandra.rs +++ b/shotover-proxy/benches/windsock/cassandra.rs @@ -546,7 +546,8 @@ impl Bench for CassandraBench { } } let mut profiler = - CloudProfilerRunner::new(self.name(), profiling, profiler_instances).await; + CloudProfilerRunner::new(self.name(), profiling, profiler_instances, &shotover_ip) + .await; let cassandra_nodes = vec![ AwsNodeInfo { diff --git a/shotover-proxy/benches/windsock/kafka.rs b/shotover-proxy/benches/windsock/kafka.rs index 1e46c5420..c5141d41e 100644 --- a/shotover-proxy/benches/windsock/kafka.rs +++ b/shotover-proxy/benches/windsock/kafka.rs @@ -130,13 +130,13 @@ impl Bench for KafkaBench { // TODO: enable when testing KafkaSinkCluster //profiler_instances.insert("shotover".to_owned(), &shotover_instance.instance); - let mut profiler = - CloudProfilerRunner::new(self.name(), profiling, profiler_instances).await; - let kafka_ip = kafka_instance.instance.private_ip().to_string(); // TODO: make use of this when we start benching KafkaSinkCluster let _shotover_ip = shotover_instance.instance.private_ip().to_string(); + let mut profiler = + CloudProfilerRunner::new(self.name(), profiling, profiler_instances, &kafka_ip).await; + let (_, running_shotover) = futures::join!( run_aws_kafka(kafka_instance.clone(), 9192), self.run_aws_shotover(kafka_instance) diff --git a/shotover-proxy/benches/windsock/profilers.rs b/shotover-proxy/benches/windsock/profilers.rs index c84a031fd..f5eb02bf8 100644 --- a/shotover-proxy/benches/windsock/profilers.rs +++ b/shotover-proxy/benches/windsock/profilers.rs @@ -1,4 +1,4 @@ -use self::samply::Samply; +use self::{samply::Samply, shotover_metrics::ShotoverMetrics}; use crate::common::Shotover; use anyhow::Result; use aws_throwaway::Ec2Instance; @@ -11,14 +11,17 @@ use windsock::Profiling; mod perf_flamegraph; mod samply; mod sar; +mod shotover_metrics; pub struct ProfilerRunner { bench_name: String, run_flamegraph: bool, run_samply: bool, + run_shotover_metrics: bool, run_sys_monitor: bool, results_path: PathBuf, perf: Option, + shotover_metrics: Option, samply: Option, sys_monitor: Option>>, } @@ -32,14 +35,19 @@ impl ProfilerRunner { let run_sys_monitor = profiling .profilers_to_use .contains(&"sys_monitor".to_owned()); + let run_shotover_metrics = profiling + .profilers_to_use + .contains(&"shotover_metrics".to_owned()); ProfilerRunner { bench_name, run_flamegraph, run_sys_monitor, run_samply, + run_shotover_metrics, results_path: profiling.results_path, perf: None, + shotover_metrics: None, samply: None, sys_monitor: None, } @@ -58,6 +66,15 @@ impl ProfilerRunner { } else { None }; + self.shotover_metrics = if self.run_shotover_metrics { + if shotover.is_some() { + Some(ShotoverMetrics::new(self.bench_name.clone(), "localhost")) + } else { + panic!("shotover_metrics not supported when benching without shotover") + } + } else { + None + }; self.samply = if self.run_samply { if let Some(shotover) = &shotover { Some(Samply::run(self.results_path.clone(), shotover.child().id().unwrap()).await) @@ -91,6 +108,9 @@ impl Drop for ProfilerRunner { if let Some(samply) = self.samply.take() { samply.wait(); } + if let Some(shotover_metrics) = self.shotover_metrics.take() { + shotover_metrics.insert_results_to_bench_archive(); + } if let Some(mut rx) = self.sys_monitor.take() { sar::insert_sar_results_to_bench_archive(&self.bench_name, "", sar::parse_sar(&mut rx)); } @@ -100,6 +120,7 @@ impl Drop for ProfilerRunner { pub struct CloudProfilerRunner { bench_name: String, monitor_instances: HashMap>>, + shotover_metrics: Option, } impl CloudProfilerRunner { @@ -107,11 +128,16 @@ impl CloudProfilerRunner { bench_name: String, profiling: Profiling, instances: HashMap, + shotover_ip: &str, ) -> Self { let run_sys_monitor = profiling .profilers_to_use .contains(&"sys_monitor".to_owned()); + let run_shotover_metrics = profiling + .profilers_to_use + .contains(&"shotover_metrics".to_owned()); + let mut monitor_instances = HashMap::new(); if run_sys_monitor { for (name, instance) in instances { @@ -119,9 +145,16 @@ impl CloudProfilerRunner { } } + let shotover_metrics = if run_shotover_metrics { + Some(ShotoverMetrics::new(bench_name.clone(), shotover_ip)) + } else { + None + }; + CloudProfilerRunner { bench_name, monitor_instances, + shotover_metrics, } } @@ -133,6 +166,9 @@ impl CloudProfilerRunner { sar::parse_sar(instance_rx), ); } + if let Some(shotover_metrics) = self.shotover_metrics.take() { + shotover_metrics.insert_results_to_bench_archive(); + } } } @@ -144,6 +180,7 @@ pub fn supported_profilers(shotover: Shotover) -> Vec { "flamegraph".to_owned(), "samply".to_owned(), "sys_monitor".to_owned(), + "shotover_metrics".to_owned(), ] } } diff --git a/shotover-proxy/benches/windsock/profilers/shotover_metrics.rs b/shotover-proxy/benches/windsock/profilers/shotover_metrics.rs new file mode 100644 index 000000000..22a2deb31 --- /dev/null +++ b/shotover-proxy/benches/windsock/profilers/shotover_metrics.rs @@ -0,0 +1,149 @@ +use prometheus_parse::Value; +use std::{collections::HashMap, time::Duration}; +use tokio::sync::mpsc; +use tokio::task::JoinHandle; +use windsock::{Goal, Metric, ReportArchive}; + +pub struct ShotoverMetrics { + shutdown_tx: mpsc::Sender<()>, + task: JoinHandle<()>, +} +type ParsedMetrics = HashMap>; + +impl ShotoverMetrics { + pub fn new(bench_name: String, shotover_ip: &str) -> Self { + let url = format!("http://{shotover_ip}:9001/metrics"); + let (shutdown_tx, mut shutdown_rx) = mpsc::channel(1); + let task = tokio::spawn(async move { + let mut results: ParsedMetrics = HashMap::new(); + loop { + tokio::select! { + _ = shutdown_rx.recv() => { + break; + } + _ = Self::get_metrics(&mut results, &url) => {} + } + } + + // The bench will start after sar has started so we need to throw away all sar metrics that were recorded before the bench started. + // let time_diff = report.bench_started_at - sar.started_at; + // let inital_values_to_discard = time_diff.as_seconds_f32().round() as usize; + // for values in sar.named_values.values_mut() { + // values.drain(0..inital_values_to_discard); + // } + + let mut new_metrics = vec![]; + for (name, value) in results { + match value[0] { + Value::Gauge(_) => { + new_metrics.push(Metric::EachSecond { + name, + values: value + .iter() + .map(|x| { + let Value::Gauge(x) = x else { + panic!("metric type changed during bench run") + }; + (*x, x.to_string(), Goal::SmallerIsBetter) // TODO: Goal::None + }) + .collect(), + }); + } + Value::Counter(_) => { + let mut prev = 0.0; + new_metrics.push(Metric::EachSecond { + name, + values: value + .iter() + .map(|x| { + let Value::Counter(x) = x else { + panic!("metric type changed during bench run") + }; + let diff = x - prev; + prev = *x; + (diff, diff.to_string(), Goal::SmallerIsBetter) + // TODO: Goal::None + }) + .collect(), + }); + } + Value::Summary(_) => { + let last = value.last().unwrap(); + let Value::Summary(summary) = last else { + panic!("metric type changed during bench run") + }; + let values = summary + .iter() + .map(|x| { + ( + x.count, + format!("{} - {:.4}ms", x.quantile, x.count * 1000.0), + Goal::SmallerIsBetter, + ) + }) + .collect(); + // TODO: add a Metric::QuantileLatency and use instead + new_metrics.push(Metric::EachSecond { name, values }); + } + _ => { + tracing::warn!("Unused shotover metric: {name}") + } + } + } + new_metrics.sort_by_key(|x| { + let name = x.name(); + // move latency metrics to the top + if name.starts_with("chain_latency") || name.starts_with("transform_latency") { + format!("aaa_{name}") + } + // move failure metrics to the bottom. + else if name.starts_with("transform_failures") + || name.starts_with("failed_requests") + || name.starts_with("chain_failures") + { + format!("zzz_{name}") + } else { + name.to_owned() + } + }); + + let mut report = ReportArchive::load(&bench_name).unwrap(); + report.metrics.extend(new_metrics); + report.save(); + }); + ShotoverMetrics { task, shutdown_tx } + } + + async fn get_metrics(results: &mut ParsedMetrics, url: &str) { + let metrics = tokio::time::timeout(Duration::from_secs(3), reqwest::get(url)) + .await + .unwrap() + .unwrap() + .text() + .await + .unwrap(); + + let metrics = + prometheus_parse::Scrape::parse(metrics.lines().map(|x| Ok(x.to_owned()))).unwrap(); + for sample in metrics.samples { + let key = format!( + "{}{{{}}}", + sample + .metric + // TODO: maybe just remove this prefix upstream, seems redundant? + .strip_prefix("shotover_") + .unwrap_or(&sample.metric), + sample.labels + ); + + results.entry(key).or_default().push(sample.value); + } + tokio::time::sleep(Duration::from_secs(1)).await + } + + pub fn insert_results_to_bench_archive(self) { + std::mem::drop(self.shutdown_tx); + // TODO: asyncify it all or something + futures::executor::block_on(async { self.task.await.unwrap() }) + } +} diff --git a/shotover-proxy/benches/windsock/redis.rs b/shotover-proxy/benches/windsock/redis.rs index bb13653ee..6e7b51485 100644 --- a/shotover-proxy/benches/windsock/redis.rs +++ b/shotover-proxy/benches/windsock/redis.rs @@ -222,12 +222,14 @@ impl Bench for RedisBench { profiler_instances.insert("redis".to_owned(), &instance.instance); } } - let mut profiler = - CloudProfilerRunner::new(self.name(), profiling, profiler_instances).await; let redis_ip = redis_instances.private_ips()[0].to_string(); let shotover_ip = shotover_instance.instance.private_ip().to_string(); + let mut profiler = + CloudProfilerRunner::new(self.name(), profiling, profiler_instances, &shotover_ip) + .await; + let (_, running_shotover) = futures::join!( redis_instances.run(self.encryption), self.run_aws_shotover(shotover_instance.clone(), redis_ip.clone()) diff --git a/windsock/src/report.rs b/windsock/src/report.rs index 8a69893e8..a9b6db51f 100644 --- a/windsock/src/report.rs +++ b/windsock/src/report.rs @@ -161,6 +161,13 @@ pub enum Metric { } impl Metric { + pub fn name(&self) -> &str { + match self { + Metric::Total { name, .. } => name, + Metric::EachSecond { name, .. } => name, + } + } + pub(crate) fn identifier(&self) -> MetricIdentifier { match self { Metric::Total { name, .. } => MetricIdentifier::Total {