diff --git a/Cargo.lock b/Cargo.lock index 160c8aacf1..4548d0a3d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5822,6 +5822,7 @@ dependencies = [ "num-traits", "once_cell", "openapiv3", + "peg-runtime", "pem-rfc7468", "petgraph", "postgres-types", @@ -6189,22 +6190,26 @@ name = "oximeter-db" version = "0.1.0" dependencies = [ "anyhow", + "async-recursion", "async-trait", "bcs", "bytes", "camino", "chrono", "clap 4.5.1", + "crossterm", "dropshot", "expectorate", "futures", "highway", "indexmap 2.2.5", "itertools 0.12.1", + "num", "omicron-common", "omicron-test-utils", "omicron-workspace-hack", "oximeter", + "peg", "reedline", "regex", "reqwest", @@ -6511,6 +6516,33 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" +[[package]] +name = "peg" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "400bcab7d219c38abf8bd7cc2054eb9bbbd4312d66f6a5557d572a203f646f61" +dependencies = [ + "peg-macros", + "peg-runtime", +] + +[[package]] +name = "peg-macros" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46e61cce859b76d19090f62da50a9fe92bab7c2a5f09e183763559a2ac392c90" +dependencies = [ + "peg-runtime", + "proc-macro2", + "quote", +] + +[[package]] +name = "peg-runtime" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36bae92c60fa2398ce4678b98b2c4b5a7c61099961ca1fa305aec04a9ad28922" + [[package]] name = "pem" version = "3.0.2" diff --git a/Cargo.toml b/Cargo.toml index 3237cc79bd..0d66583a82 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -257,7 +257,6 @@ ipcc = { path = "ipcc" } ipnet = "2.9" itertools = "0.12.1" internet-checksum = "0.2" -ipcc-key-value = { path = "ipcc-key-value" } ipnetwork = { version = "0.20", features = ["schemars"] } ispf = { git = "https://github.com/oxidecomputer/ispf" } key-manager = { path = "key-manager" } @@ -313,7 +312,6 @@ openapiv3 = "2.0.0" # must match samael's crate! 
openssl = "0.10" openssl-sys = "0.9" -openssl-probe = "0.1.5" opte-ioctl = { git = "https://github.com/oxidecomputer/opte", rev = "7ee353a470ea59529ee1b34729681da887aa88ce" } oso = "0.27" owo-colors = "4.0.0" @@ -330,6 +328,7 @@ partial-io = { version = "0.5.4", features = ["proptest1", "tokio1"] } parse-size = "1.0.0" paste = "1.0.14" percent-encoding = "2.3.1" +peg = "0.8.2" pem = "3.0" petgraph = "0.6.4" postgres-protocol = "0.6.6" @@ -368,7 +367,6 @@ schemars = "0.8.16" secrecy = "0.8.0" semver = { version = "1.0.22", features = ["std", "serde"] } serde = { version = "1.0", default-features = false, features = [ "derive", "rc" ] } -serde_derive = "1.0" serde_human_bytes = { git = "http://github.com/oxidecomputer/serde_human_bytes", branch = "main" } serde_json = "1.0.114" serde_path_to_error = "0.1.16" @@ -394,12 +392,12 @@ slog-envlogger = "2.2" slog-error-chain = { git = "https://github.com/oxidecomputer/slog-error-chain", branch = "main", features = ["derive"] } slog-term = "2.9" smf = "0.2" -snafu = "0.7" socket2 = { version = "0.5", features = ["all"] } sp-sim = { path = "sp-sim" } sprockets-common = { git = "http://github.com/oxidecomputer/sprockets", rev = "77df31efa5619d0767ffc837ef7468101608aee9" } sprockets-host = { git = "http://github.com/oxidecomputer/sprockets", rev = "77df31efa5619d0767ffc837ef7468101608aee9" } sprockets-rot = { git = "http://github.com/oxidecomputer/sprockets", rev = "77df31efa5619d0767ffc837ef7468101608aee9" } +sqlformat = "0.2.3" sqlparser = { version = "0.43.1", features = [ "visitor" ] } static_assertions = "1.1.0" # Please do not change the Steno version to a Git dependency. It makes it diff --git a/nexus/src/app/metrics.rs b/nexus/src/app/metrics.rs index 94fb232892..3728a3bdc1 100644 --- a/nexus/src/app/metrics.rs +++ b/nexus/src/app/metrics.rs @@ -13,7 +13,9 @@ use nexus_db_queries::{ db::{fixed_data::FLEET_ID, lookup}, }; use omicron_common::api::external::{Error, InternalContext}; -use oximeter_db::Measurement; +use oximeter_db::{ + oxql, Measurement, TimeseriesSchema, TimeseriesSchemaPaginationParams, +}; use std::num::NonZeroU32; impl super::Nexus { @@ -96,4 +98,85 @@ impl super::Nexus { ) .await } + + /// List available timeseries schema. + pub(crate) async fn timeseries_schema_list( + &self, + opctx: &OpContext, + pagination: &TimeseriesSchemaPaginationParams, + limit: NonZeroU32, + ) -> Result, Error> { + // Must be a fleet user to list timeseries schema. + // + // TODO-security: We need to figure out how to implement proper security + // checks here, letting less-privileged users fetch data for the + // resources they have access to. + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + self.timeseries_client + .get() + .await + .map_err(|e| { + Error::internal_error(&format!( + "Cannot access timeseries DB: {}", + e + )) + })? + .timeseries_schema_list(&pagination.page, limit) + .await + .map_err(|e| match e { + oximeter_db::Error::DatabaseUnavailable(_) => { + Error::ServiceUnavailable { + internal_message: e.to_string(), + } + } + _ => Error::InternalError { internal_message: e.to_string() }, + }) + } + + /// Run an OxQL query against the timeseries database. + pub(crate) async fn timeseries_query( + &self, + opctx: &OpContext, + query: impl AsRef, + ) -> Result, Error> { + // Must be a fleet user to list timeseries schema. + // + // TODO-security: We need to figure out how to implement proper security + // checks here, letting less-privileged users fetch data for the + // resources they have access to. 
+ opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + self.timeseries_client + .get() + .await + .map_err(|e| { + Error::internal_error(&format!( + "Cannot access timeseries DB: {}", + e + )) + })? + .oxql_query(query) + .await + .map(|result| { + // TODO-observability: The query method returns information + // about the duration of the OxQL query and the database + // resource usage for each contained SQL query. We should + // publish this as a timeseries itself, so that we can track + // improvements to query processing. + // + // For now, simply return the tables alone. + result.tables + }) + .map_err(|e| match e { + oximeter_db::Error::DatabaseUnavailable(_) => { + Error::ServiceUnavailable { + internal_message: e.to_string(), + } + } + oximeter_db::Error::Oxql(_) + | oximeter_db::Error::TimeseriesNotFound(_) => { + Error::invalid_request(e.to_string()) + } + _ => Error::InternalError { internal_message: e.to_string() }, + }) + } } diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 6fa530b49d..a570cd60c4 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -321,6 +321,8 @@ pub(crate) fn external_api() -> NexusApiDescription { api.register(system_metric)?; api.register(silo_metric)?; + api.register(timeseries_schema_list)?; + api.register(timeseries_query)?; api.register(system_update_put_repository)?; api.register(system_update_get_repository)?; @@ -5626,6 +5628,56 @@ async fn silo_metric( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } +/// List available timeseries schema. +#[endpoint { + method = GET, + path = "/v1/timeseries/schema", + tags = ["metrics"], +}] +async fn timeseries_schema_list( + rqctx: RequestContext>, + pag_params: Query, +) -> Result>, HttpError> +{ + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.nexus; + let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + let pagination = pag_params.into_inner(); + let limit = rqctx.page_limit(&pagination)?; + nexus + .timeseries_schema_list(&opctx, &pagination, limit) + .await + .map(HttpResponseOk) + .map_err(HttpError::from) + }; + apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + +/// Run a timeseries query, written OxQL. 
+#[endpoint { + method = POST, + path = "/v1/timeseries/query", + tags = ["metrics"], +}] +async fn timeseries_query( + rqctx: RequestContext>, + body: TypedBody, +) -> Result>, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.nexus; + let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + let query = body.into_inner().query; + nexus + .timeseries_query(&opctx, &query) + .await + .map(HttpResponseOk) + .map_err(HttpError::from) + }; + apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + // Updates /// Upload TUF repository diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index 1003722723..02ab1385e3 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -848,6 +848,17 @@ pub static DEMO_SILO_METRICS_URL: Lazy = Lazy::new(|| { ) }); +pub static TIMESERIES_LIST_URL: Lazy = + Lazy::new(|| String::from("/v1/timeseries/schema")); + +pub static TIMESERIES_QUERY_URL: Lazy = + Lazy::new(|| String::from("/v1/timeseries/query")); + +pub static DEMO_TIMESERIES_QUERY: Lazy = + Lazy::new(|| params::TimeseriesQuery { + query: String::from("get http_service:request_latency_histogram"), + }); + // Users pub static DEMO_USER_CREATE: Lazy = Lazy::new(|| params::UserCreate { @@ -2023,6 +2034,26 @@ pub static VERIFY_ENDPOINTS: Lazy> = Lazy::new(|| { ], }, + VerifyEndpoint { + url: &TIMESERIES_LIST_URL, + visibility: Visibility::Public, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![ + AllowedMethod::Get, + ], + }, + + VerifyEndpoint { + url: &TIMESERIES_QUERY_URL, + visibility: Visibility::Public, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![ + AllowedMethod::Post( + serde_json::to_value(&*DEMO_TIMESERIES_QUERY).unwrap() + ), + ], + }, + /* Silo identity providers */ VerifyEndpoint { diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 73f11ce49a..c96cf9b0fb 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -16,6 +16,7 @@ use nexus_test_utils::ControlPlaneTestContext; use nexus_test_utils_macros::nexus_test; use oximeter::types::Datum; use oximeter::types::Measurement; +use oximeter::TimeseriesSchema; use uuid::Uuid; pub async fn query_for_metrics( @@ -238,3 +239,27 @@ async fn test_metrics( // project 1 unaffected by project 2's resources assert_silo_metrics(&cptestctx, Some(project1_id), GIB, 4, GIB).await; } + +/// Test that we can correctly list some timeseries schema. +#[nexus_test] +async fn test_timeseries_schema_list( + cptestctx: &ControlPlaneTestContext, +) { + // We should be able to fetch the list of timeseries, and it should include + // Nexus's HTTP latency distribution. This is defined in Nexus itself, and + // should always exist after we've registered as a producer and start + // producing data. Force a collection to ensure that happens. 
+ cptestctx.server.register_as_producer().await; + cptestctx.oximeter.force_collect().await; + let client = &cptestctx.external_client; + let url = "/v1/timeseries/schema"; + let schema = + objects_list_page_authz::(client, &url).await; + schema + .items + .iter() + .find(|sc| { + sc.timeseries_name == "http_service:request_latency_histogram" + }) + .expect("Failed to find HTTP request latency histogram schema"); +} diff --git a/nexus/tests/output/nexus_tags.txt b/nexus/tests/output/nexus_tags.txt index 91d2504a57..3e40e8293d 100644 --- a/nexus/tests/output/nexus_tags.txt +++ b/nexus/tests/output/nexus_tags.txt @@ -73,6 +73,8 @@ login_saml POST /login/{silo_name}/saml/{provi API operations found with tag "metrics" OPERATION ID METHOD URL PATH silo_metric GET /v1/metrics/{metric_name} +timeseries_query POST /v1/timeseries/query +timeseries_schema_list GET /v1/timeseries/schema API operations found with tag "policy" OPERATION ID METHOD URL PATH diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index 1ba373ff56..3829484a27 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -2055,3 +2055,10 @@ pub struct ProbeListSelector { /// A name or id to use when selecting a probe. pub name_or_id: Option, } + +/// A timeseries query string, written in the Oximeter query language. +#[derive(Deserialize, JsonSchema, Serialize)] +pub struct TimeseriesQuery { + /// A timeseries query string, written in the Oximeter query language. + pub query: String, +} diff --git a/openapi/nexus.json b/openapi/nexus.json index 3cc991126d..e7e4c1d31c 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -7929,6 +7929,99 @@ } } }, + "/v1/timeseries/query": { + "post": { + "tags": [ + "metrics" + ], + "summary": "Run a timeseries query, written OxQL.", + "operationId": "timeseries_query", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/TimeseriesQuery" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_Table", + "type": "array", + "items": { + "$ref": "#/components/schemas/Table" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/v1/timeseries/schema": { + "get": { + "tags": [ + "metrics" + ], + "summary": "List available timeseries schema.", + "operationId": "timeseries_schema_list", + "parameters": [ + { + "in": "query", + "name": "limit", + "description": "Maximum number of items returned by a single call", + "schema": { + "nullable": true, + "type": "integer", + "format": "uint32", + "minimum": 1 + } + }, + { + "in": "query", + "name": "page_token", + "description": "Token returned by previous call to retrieve the subsequent page", + "schema": { + "nullable": true, + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/TimeseriesSchemaResultsPage" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + }, + "x-dropshot-pagination": { + "required": [] + } + } + }, "/v1/users": { "get": { "tags": [ @@ -11917,6 +12010,56 @@ } ] }, + "Distributiondouble": { + "description": "A distribution is a sequence of bins and counts in those bins.", + 
"type": "object", + "properties": { + "bins": { + "type": "array", + "items": { + "type": "number", + "format": "double" + } + }, + "counts": { + "type": "array", + "items": { + "type": "integer", + "format": "uint64", + "minimum": 0 + } + } + }, + "required": [ + "bins", + "counts" + ] + }, + "Distributionint64": { + "description": "A distribution is a sequence of bins and counts in those bins.", + "type": "object", + "properties": { + "bins": { + "type": "array", + "items": { + "type": "integer", + "format": "int64" + } + }, + "counts": { + "type": "array", + "items": { + "type": "integer", + "format": "uint64", + "minimum": 0 + } + } + }, + "required": [ + "bins", + "counts" + ] + }, "EphemeralIpCreate": { "description": "Parameters for creating an ephemeral IP address for an instance.", "type": "object", @@ -12080,33 +12223,314 @@ } }, "required": [ - "floating_ip", - "type" + "floating_ip", + "type" + ] + } + ] + }, + "ExternalIpResultsPage": { + "description": "A single page of results", + "type": "object", + "properties": { + "items": { + "description": "list of items on this page of results", + "type": "array", + "items": { + "$ref": "#/components/schemas/ExternalIp" + } + }, + "next_page": { + "nullable": true, + "description": "token used to fetch the next page of results (if any)", + "type": "string" + } + }, + "required": [ + "items" + ] + }, + "FieldSchema": { + "description": "The name and type information for a field of a timeseries schema.", + "type": "object", + "properties": { + "field_type": { + "$ref": "#/components/schemas/FieldType" + }, + "name": { + "type": "string" + }, + "source": { + "$ref": "#/components/schemas/FieldSource" + } + }, + "required": [ + "field_type", + "name", + "source" + ] + }, + "FieldSource": { + "description": "The source from which a field is derived, the target or metric.", + "type": "string", + "enum": [ + "target", + "metric" + ] + }, + "FieldType": { + "description": "The `FieldType` identifies the data type of a target or metric field.", + "type": "string", + "enum": [ + "string", + "i8", + "u8", + "i16", + "u16", + "i32", + "u32", + "i64", + "u64", + "ip_addr", + "uuid", + "bool" + ] + }, + "FieldValue": { + "description": "The `FieldValue` contains the value of a target or metric field.", + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "string" + ] + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "i8" + ] + }, + "value": { + "type": "integer", + "format": "int8" + } + }, + "required": [ + "type", + "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "u8" + ] + }, + "value": { + "type": "integer", + "format": "uint8", + "minimum": 0 + } + }, + "required": [ + "type", + "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "i16" + ] + }, + "value": { + "type": "integer", + "format": "int16" + } + }, + "required": [ + "type", + "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "u16" + ] + }, + "value": { + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + "required": [ + "type", + "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "i32" + ] + }, + "value": { + "type": "integer", + "format": "int32" + } + }, + "required": [ + "type", 
+ "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "u32" + ] + }, + "value": { + "type": "integer", + "format": "uint32", + "minimum": 0 + } + }, + "required": [ + "type", + "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "i64" + ] + }, + "value": { + "type": "integer", + "format": "int64" + } + }, + "required": [ + "type", + "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "u64" + ] + }, + "value": { + "type": "integer", + "format": "uint64", + "minimum": 0 + } + }, + "required": [ + "type", + "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "ip_addr" + ] + }, + "value": { + "type": "string", + "format": "ip" + } + }, + "required": [ + "type", + "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "uuid" + ] + }, + "value": { + "type": "string", + "format": "uuid" + } + }, + "required": [ + "type", + "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "bool" + ] + }, + "value": { + "type": "boolean" + } + }, + "required": [ + "type", + "value" ] } ] }, - "ExternalIpResultsPage": { - "description": "A single page of results", - "type": "object", - "properties": { - "items": { - "description": "list of items on this page of results", - "type": "array", - "items": { - "$ref": "#/components/schemas/ExternalIp" - } - }, - "next_page": { - "nullable": true, - "description": "token used to fetch the next page of results (if any)", - "type": "string" - } - }, - "required": [ - "items" - ] - }, "FinalizeDisk": { "description": "Parameters for finalizing a disk", "type": "object", @@ -14279,6 +14703,32 @@ "items" ] }, + "MetricType": { + "description": "The type of the metric itself, indicating what its values represent.", + "oneOf": [ + { + "description": "The value represents an instantaneous measurement in time.", + "type": "string", + "enum": [ + "gauge" + ] + }, + { + "description": "The value represents a difference between two points in time.", + "type": "string", + "enum": [ + "delta" + ] + }, + { + "description": "The value represents an accumulation between two points in time.", + "type": "string", + "enum": [ + "cumulative" + ] + } + ] + }, "MissingDatum": { "type": "object", "properties": { @@ -14614,6 +15064,37 @@ "ok" ] }, + "Points": { + "description": "Timepoints and values for one timeseries.", + "type": "object", + "properties": { + "start_times": { + "nullable": true, + "type": "array", + "items": { + "type": "string", + "format": "date-time" + } + }, + "timestamps": { + "type": "array", + "items": { + "type": "string", + "format": "date-time" + } + }, + "values": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Values" + } + } + }, + "required": [ + "timestamps", + "values" + ] + }, "Probe": { "description": "Identity-related metadata that's included in nearly all public API objects", "type": "object", @@ -16965,6 +17446,113 @@ "vlan_id" ] }, + "Table": { + "description": "A table represents one or more timeseries with the same schema.\n\nA table is the result of an OxQL query. 
It contains a name, usually the name of the timeseries schema from which the data is derived, and any number of timeseries, which contain the actual data.", + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "timeseries": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/Timeseries" + } + } + }, + "required": [ + "name", + "timeseries" + ] + }, + "Timeseries": { + "description": "A timeseries contains a timestamped set of values from one source.\n\nThis includes the typed key-value pairs that uniquely identify it, and the set of timestamps and data values from it.", + "type": "object", + "properties": { + "fields": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/FieldValue" + } + }, + "points": { + "$ref": "#/components/schemas/Points" + } + }, + "required": [ + "fields", + "points" + ] + }, + "TimeseriesName": { + "title": "The name of a timeseries", + "description": "Names are constructed by concatenating the target and metric names with ':'. Target and metric names must be lowercase alphanumeric characters with '_' separating words.", + "type": "string", + "pattern": "^(([a-z]+[a-z0-9]*)(_([a-z0-9]+))*):(([a-z]+[a-z0-9]*)(_([a-z0-9]+))*)$" + }, + "TimeseriesQuery": { + "description": "A timeseries query string, written in the Oximeter query language.", + "type": "object", + "properties": { + "query": { + "description": "A timeseries query string, written in the Oximeter query language.", + "type": "string" + } + }, + "required": [ + "query" + ] + }, + "TimeseriesSchema": { + "description": "The schema for a timeseries.\n\nThis includes the name of the timeseries, as well as the datum type of its metric and the schema for each field.", + "type": "object", + "properties": { + "created": { + "type": "string", + "format": "date-time" + }, + "datum_type": { + "$ref": "#/components/schemas/DatumType" + }, + "field_schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/FieldSchema" + }, + "uniqueItems": true + }, + "timeseries_name": { + "$ref": "#/components/schemas/TimeseriesName" + } + }, + "required": [ + "created", + "datum_type", + "field_schema", + "timeseries_name" + ] + }, + "TimeseriesSchemaResultsPage": { + "description": "A single page of results", + "type": "object", + "properties": { + "items": { + "description": "list of items on this page of results", + "type": "array", + "items": { + "$ref": "#/components/schemas/TimeseriesSchema" + } + }, + "next_page": { + "nullable": true, + "description": "token used to fetch the next page of results (if any)", + "type": "string" + } + }, + "required": [ + "items" + ] + }, "UninitializedSled": { "description": "A sled that has not been added to an initialized rack yet", "type": "object", @@ -17246,6 +17834,169 @@ "provisioned" ] }, + "ValueArray": { + "description": "List of data values for one timeseries.\n\nEach element is an option, where `None` represents a missing sample.", + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "integer" + ] + }, + "values": { + "type": "array", + "items": { + "nullable": true, + "type": "integer", + "format": "int64" + } + } + }, + "required": [ + "type", + "values" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "double" + ] + }, + "values": { + "type": "array", + "items": { + "nullable": true, + "type": "number", + "format": "double" + } + } + }, + "required": [ + "type", + "values" + ] + }, + { 
+ "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "boolean" + ] + }, + "values": { + "type": "array", + "items": { + "nullable": true, + "type": "boolean" + } + } + }, + "required": [ + "type", + "values" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "string" + ] + }, + "values": { + "type": "array", + "items": { + "nullable": true, + "type": "string" + } + } + }, + "required": [ + "type", + "values" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "integer_distribution" + ] + }, + "values": { + "type": "array", + "items": { + "nullable": true, + "allOf": [ + { + "$ref": "#/components/schemas/Distributionint64" + } + ] + } + } + }, + "required": [ + "type", + "values" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "double_distribution" + ] + }, + "values": { + "type": "array", + "items": { + "nullable": true, + "allOf": [ + { + "$ref": "#/components/schemas/Distributiondouble" + } + ] + } + } + }, + "required": [ + "type", + "values" + ] + } + ] + }, + "Values": { + "description": "A single list of values, for one dimension of a timeseries.", + "type": "object", + "properties": { + "metric_type": { + "$ref": "#/components/schemas/MetricType" + }, + "values": { + "$ref": "#/components/schemas/ValueArray" + } + }, + "required": [ + "metric_type", + "values" + ] + }, "VirtualResourceCounts": { "description": "A collection of resource counts used to describe capacity and utilization", "type": "object", diff --git a/oximeter/db/Cargo.toml b/oximeter/db/Cargo.toml index c4ee44acb6..88a2ab8a89 100644 --- a/oximeter/db/Cargo.toml +++ b/oximeter/db/Cargo.toml @@ -7,6 +7,7 @@ license = "MPL-2.0" [dependencies] anyhow.workspace = true +async-recursion = "1.0.5" async-trait.workspace = true bcs.workspace = true camino.workspace = true @@ -15,21 +16,16 @@ clap.workspace = true dropshot.workspace = true futures.workspace = true highway.workspace = true -indexmap.workspace = true omicron-common.workspace = true omicron-workspace-hack.workspace = true oximeter.workspace = true -reedline.workspace = true regex.workspace = true -rustyline.workspace = true serde.workspace = true serde_json.workspace = true slog.workspace = true slog-async.workspace = true +slog-dtrace.workspace = true slog-term.workspace = true -sqlparser.workspace = true -sqlformat = "0.2.3" -tabled.workspace = true thiserror.workspace = true usdt.workspace = true uuid.workspace = true @@ -38,26 +34,82 @@ uuid.workspace = true workspace = true features = [ "serde" ] +[dependencies.crossterm] +workspace = true +optional = true + +[dependencies.indexmap] +workspace = true +optional = true + +[dependencies.num] +workspace = true +optional = true + +[dependencies.peg] +workspace = true +optional = true + +[dependencies.reedline] +workspace = true +optional = true + [dependencies.reqwest] workspace = true features = [ "json" ] +[dependencies.rustyline] +workspace = true +optional = true + [dependencies.schemars] workspace = true features = [ "uuid1", "bytes", "chrono" ] +[dependencies.sqlformat] +workspace = true +optional = true + +[dependencies.sqlparser] +workspace = true +optional = true + [dependencies.tokio] workspace = true features = [ "rt-multi-thread", "macros" ] +[dependencies.tabled] +workspace = true +optional = true + [dev-dependencies] expectorate.workspace = true +indexmap.workspace = true itertools.workspace = true omicron-test-utils.workspace = 
true slog-dtrace.workspace = true +sqlparser.workspace = true strum.workspace = true tempfile.workspace = true +[features] +default = [ "oxql", "sql" ] +sql = [ + "dep:indexmap", + "dep:reedline", + "dep:rustyline", + "dep:sqlformat", + "dep:sqlparser", + "dep:tabled" +] +oxql = [ + "dep:crossterm", + "dep:num", + "dep:peg", + "dep:reedline", + "dep:tabled", +] + [[bin]] name = "oxdb" doc = false diff --git a/oximeter/db/src/bin/oxdb.rs b/oximeter/db/src/bin/oxdb/main.rs similarity index 50% rename from oximeter/db/src/bin/oxdb.rs rename to oximeter/db/src/bin/oxdb/main.rs index 02a8054da0..ca11dd18a3 100644 --- a/oximeter/db/src/bin/oxdb.rs +++ b/oximeter/db/src/bin/oxdb/main.rs @@ -4,31 +4,27 @@ //! Tool for developing against the Oximeter timeseries database, populating data and querying. -// Copyright 2023 Oxide Computer Company +// Copyright 2024 Oxide Computer Company use anyhow::{bail, Context}; use chrono::{DateTime, Utc}; use clap::{Args, Parser}; -use dropshot::EmptyScanParams; -use dropshot::WhichPage; use oximeter::{ types::{Cumulative, Sample}, Metric, Target, }; -use oximeter_db::sql::function_allow_list; -use oximeter_db::QueryMetadata; -use oximeter_db::QueryResult; -use oximeter_db::Table; use oximeter_db::{query, Client, DbWrite}; -use reedline::DefaultPrompt; -use reedline::DefaultPromptSegment; -use reedline::Reedline; -use reedline::Signal; use slog::{debug, info, o, Drain, Level, Logger}; use std::net::IpAddr; use std::net::SocketAddr; use uuid::Uuid; +#[cfg(feature = "sql")] +mod sql; + +#[cfg(feature = "oxql")] +mod oxql; + // Samples are inserted in chunks of this size, to avoid large allocations when inserting huge // numbers of timeseries. const INSERT_CHUNK_SIZE: usize = 100_000; @@ -151,9 +147,17 @@ enum Subcommand { }, /// Enter a SQL shell for interactive querying. + #[cfg(feature = "sql")] Sql { #[clap(flatten)] - opts: ShellOptions, + opts: crate::sql::ShellOptions, + }, + + /// Enter the Oximeter Query Language shell for interactive querying. + #[cfg(feature = "oxql")] + Oxql { + #[clap(flatten)] + opts: crate::oxql::ShellOptions, }, } @@ -312,281 +316,6 @@ async fn query( Ok(()) } -fn print_basic_commands() { - println!("Basic commands:"); - println!(" \\?, \\h, help - Print this help"); - println!(" \\q, quit, exit, ^D - Exit the shell"); - println!(" \\l - List tables"); - println!(" \\d - Describe a table"); - println!( - " \\f - List or describe ClickHouse SQL functions" - ); - println!(); - println!("Or try entering a SQL `SELECT` statement"); -} - -async fn list_virtual_tables(client: &Client) -> anyhow::Result<()> { - let mut page = WhichPage::First(EmptyScanParams {}); - let limit = 100.try_into().unwrap(); - loop { - let results = client.timeseries_schema_list(&page, limit).await?; - for schema in results.items.iter() { - println!("{}", schema.timeseries_name); - } - if results.next_page.is_some() { - if let Some(last) = results.items.last() { - page = WhichPage::Next(last.timeseries_name.clone()); - } else { - return Ok(()); - } - } else { - return Ok(()); - } - } -} - -async fn describe_virtual_table( - client: &Client, - table: &str, -) -> anyhow::Result<()> { - match table.parse() { - Err(_) => println!("Invalid timeseries name: {table}"), - Ok(name) => { - if let Some(schema) = client.schema_for_timeseries(&name).await? 
{ - let mut cols = - Vec::with_capacity(schema.field_schema.len() + 2); - let mut types = cols.clone(); - for field in schema.field_schema.iter() { - cols.push(field.name.clone()); - types.push(field.field_type.to_string()); - } - cols.push("timestamp".into()); - types.push("DateTime64".into()); - - if schema.datum_type.is_histogram() { - cols.push("start_time".into()); - types.push("DateTime64".into()); - - cols.push("bins".into()); - types.push(format!( - "Array[{}]", - schema - .datum_type - .to_string() - .strip_prefix("Histogram") - .unwrap() - .to_lowercase(), - )); - - cols.push("counts".into()); - types.push("Array[u64]".into()); - } else if schema.datum_type.is_cumulative() { - cols.push("start_time".into()); - types.push("DateTime64".into()); - cols.push("datum".into()); - types.push(schema.datum_type.to_string()); - } else { - cols.push("datum".into()); - types.push(schema.datum_type.to_string()); - } - - let mut builder = tabled::builder::Builder::default(); - builder.push_record(cols); // first record is the header - builder.push_record(types); - println!( - "{}", - builder.build().with(tabled::settings::Style::psql()) - ); - } else { - println!("No such timeseries: {table}"); - } - } - } - Ok(()) -} - -#[derive(Clone, Debug, Args)] -struct ShellOptions { - /// Print query metadata. - #[clap(long = "metadata")] - print_metadata: bool, - /// Print the original SQL query. - #[clap(long = "original")] - print_original_query: bool, - /// Print the rewritten SQL query that is actually run on the DB. - #[clap(long = "rewritten")] - print_rewritten_query: bool, - /// Print the transformed query, but do not run it. - #[clap(long)] - transform: Option, -} - -impl Default for ShellOptions { - fn default() -> Self { - Self { - print_metadata: true, - print_original_query: false, - print_rewritten_query: false, - transform: None, - } - } -} - -fn list_supported_functions() { - println!("Subset of ClickHouse SQL functions currently supported"); - println!( - "See https://clickhouse.com/docs/en/sql-reference/functions for more" - ); - println!(); - for func in function_allow_list().iter() { - println!(" {func}"); - } -} - -fn show_supported_function(name: &str) { - if let Some(func) = function_allow_list().iter().find(|f| f.name == name) { - println!("{}", func.name); - println!(" {}", func.usage); - println!(" {}", func.description); - } else { - println!("No supported function '{name}'"); - } -} - -fn print_sql_query(query: &str) { - println!( - "{}", - sqlformat::format( - &query, - &sqlformat::QueryParams::None, - sqlformat::FormatOptions { uppercase: true, ..Default::default() } - ) - ); - println!(); -} - -fn print_query_metadata(table: &Table, metadata: &QueryMetadata) { - println!("Metadata"); - println!(" Query ID: {}", metadata.id); - println!(" Result rows: {}", table.rows.len()); - println!(" Time: {:?}", metadata.elapsed); - println!(" Read: {}\n", metadata.summary.read); -} - -async fn sql_shell( - address: IpAddr, - port: u16, - log: Logger, - opts: ShellOptions, -) -> anyhow::Result<()> { - let client = make_client(address, port, &log).await?; - - // A workaround to ensure the client has all available timeseries when the - // shell starts. - let dummy = "foo:bar".parse().unwrap(); - let _ = client.schema_for_timeseries(&dummy).await; - - // Possibly just transform the query, but do not execute it. 
- if let Some(query) = &opts.transform { - let transformed = client.transform_query(query).await?; - println!( - "{}", - sqlformat::format( - &transformed, - &sqlformat::QueryParams::None, - sqlformat::FormatOptions { - uppercase: true, - ..Default::default() - } - ) - ); - return Ok(()); - } - - let mut ed = Reedline::create(); - let prompt = DefaultPrompt::new( - DefaultPromptSegment::Basic("0x".to_string()), - DefaultPromptSegment::Empty, - ); - println!("Oximeter SQL shell"); - println!(); - print_basic_commands(); - loop { - let sig = ed.read_line(&prompt); - match sig { - Ok(Signal::Success(buf)) => { - let cmd = buf.as_str().trim(); - match cmd { - "" => continue, - "\\?" | "\\h" | "help" => print_basic_commands(), - "\\q" | "quit" | "exit" => return Ok(()), - "\\l" | "\\d" => list_virtual_tables(&client).await?, - _ => { - if let Some(table_name) = cmd.strip_prefix("\\d") { - if table_name.is_empty() { - list_virtual_tables(&client).await?; - } else { - describe_virtual_table( - &client, - table_name.trim().trim_end_matches(';'), - ) - .await?; - } - } else if let Some(func_name) = cmd.strip_prefix("\\f") - { - if func_name.is_empty() { - list_supported_functions(); - } else { - show_supported_function( - func_name.trim().trim_end_matches(';'), - ); - } - } else { - match client.query(&buf).await { - Err(e) => println!("Query failed: {e:#?}"), - Ok(QueryResult { - original_query, - rewritten_query, - metadata, - table, - }) => { - println!(); - let mut builder = - tabled::builder::Builder::default(); - builder.push_record(&table.column_names); // first record is the header - for row in table.rows.iter() { - builder.push_record( - row.iter().map(ToString::to_string), - ); - } - if opts.print_original_query { - print_sql_query(&original_query); - } - if opts.print_rewritten_query { - print_sql_query(&rewritten_query); - } - println!( - "{}\n", - builder.build().with( - tabled::settings::Style::psql() - ) - ); - if opts.print_metadata { - print_query_metadata(&table, &metadata); - } - } - } - } - } - } - } - Ok(Signal::CtrlD) => return Ok(()), - Ok(Signal::CtrlC) => continue, - err => println!("err: {err:?}"), - } - } -} - #[tokio::main] async fn main() -> anyhow::Result<()> { usdt::register_probes().context("Failed to register USDT probes")?; @@ -598,6 +327,7 @@ async fn main() -> anyhow::Result<()> { .filter_level(args.log_level) .fuse(); let drain = slog_async::Async::new(drain).build().fuse(); + let drain = slog_dtrace::with_drain(drain).0.fuse(); let log = Logger::root(drain, o!("component" => "oxdb")); match args.cmd { Subcommand::Describe => describe_data(), @@ -636,8 +366,13 @@ async fn main() -> anyhow::Result<()> { ) .await?; } + #[cfg(feature = "sql")] Subcommand::Sql { opts } => { - sql_shell(args.address, args.port, log, opts).await? + crate::sql::sql_shell(args.address, args.port, log, opts).await? + } + #[cfg(feature = "oxql")] + Subcommand::Oxql { opts } => { + crate::oxql::oxql_shell(args.address, args.port, log, opts).await? } } Ok(()) diff --git a/oximeter/db/src/bin/oxdb/oxql.rs b/oximeter/db/src/bin/oxdb/oxql.rs new file mode 100644 index 0000000000..54e40afa15 --- /dev/null +++ b/oximeter/db/src/bin/oxdb/oxql.rs @@ -0,0 +1,333 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! OxQL shell. 
+
+// Copyright 2024 Oxide Computer
+
+use crate::make_client;
+use clap::Args;
+use crossterm::style::Stylize;
+use dropshot::EmptyScanParams;
+use dropshot::WhichPage;
+use oximeter_db::oxql::query::special_idents;
+use oximeter_db::oxql::Table;
+use oximeter_db::Client;
+use oximeter_db::OxqlResult;
+use reedline::DefaultPrompt;
+use reedline::DefaultPromptSegment;
+use reedline::Reedline;
+use reedline::Signal;
+use slog::Logger;
+use std::net::IpAddr;
+
+#[derive(Clone, Debug, Args)]
+pub struct ShellOptions {
+    /// Print summaries of each SQL query run against the database.
+    #[clap(long = "summaries")]
+    print_summaries: bool,
+    /// Print the total elapsed query duration.
+    #[clap(long = "elapsed")]
+    print_elapsed: bool,
+}
+
+// Print help for the basic OxQL commands.
+fn print_basic_commands() {
+    println!("Basic commands:");
+    println!("  \\?, \\h, help       - Print this help");
+    println!("  \\q, quit, exit, ^D - Exit the shell");
+    println!("  \\l                 - List timeseries");
+    println!("  \\d <timeseries>    - Describe a timeseries");
+    println!("  \\ql [<operation>]  - Get OxQL help about an operation");
+    println!();
+    println!("Or try entering an OxQL `get` query");
+}
+
+// Print high-level information about OxQL.
+fn print_general_oxql_help() {
+    const HELP: &str = r#"Oximeter Query Language
+
+The Oximeter Query Language (OxQL) implements queries as
+a sequence of operations. Each of these takes zero or more
+timeseries as inputs, and produces zero or more timeseries
+as outputs. Operations are chained together with the pipe
+operator, "|".
+
+All queries start with a `get` operation, which selects a
+timeseries from the database, by name. For example:
+
+`get physical_data_link:bytes_received`
+
+The supported timeseries operations are:
+
+- get: Select a timeseries by name
+- filter: Filter timeseries by field or sample values
+- group_by: Group timeseries by fields, applying a reducer.
+- join: Join two or more timeseries together
+
+Run `\ql <operation>` to get specific help about that operation.
+    "#;
+    println!("{HELP}");
+}
+
+// Print help for a specific OxQL operation.
+fn print_oxql_operation_help(op: &str) {
+    match op {
+        "get" => {
+            const HELP: &str = r#"get <timeseries name>
+
+Get instances of a timeseries by name"#;
+            println!("{HELP}");
+        }
+        "filter" => {
+            const HELP: &str = r#"filter <expr>
+
+Filter timeseries based on their attributes.
+<expr> can be a logical combination of filtering
+\"atoms\", such as `field_foo > 0`. Expressions
+may use any of the usual comparison operators, and
+can be nested and combined with && or ||.
+
+Expressions must refer to the name of a field
+for a timeseries at this time, and must compare
+against literals. For example, `some_field > 0`
+is supported, but `some_field > other_field` is not."#;
+            println!("{HELP}");
+        }
+        "group_by" => {
+            const HELP: &str = r#"group_by [<field name>, ... ]
+group_by [<field name>, ... ], <reducer>
+
+Group timeseries by the named fields, optionally
+specifying a reducer to use when aggregating the
+timeseries within each group. If no reducer is
+specified, `mean` is used, averaging the values
+within each group.
+
+Current supported reducers:
+ - mean
+ - sum"#;
+            println!("{HELP}");
+        }
+        "join" => {
+            const HELP: &str = r#"join
+
+Combine 2 or more tables by performing a natural
+inner join, matching up those with fields of the
+same value. Currently, joining does not take into
+account the timestamps, and does not align the outputs
+directly."#;
+            println!("{HELP}");
+        }
+        _ => eprintln!("unrecognized OxQL operation: '{op}'"),
+    }
+}
+
+// List the known timeseries.
+async fn list_timeseries(client: &Client) -> anyhow::Result<()> { + let mut page = WhichPage::First(EmptyScanParams {}); + let limit = 100.try_into().unwrap(); + loop { + let results = client.timeseries_schema_list(&page, limit).await?; + for schema in results.items.iter() { + println!("{}", schema.timeseries_name); + } + if results.next_page.is_some() { + if let Some(last) = results.items.last() { + page = WhichPage::Next(last.timeseries_name.clone()); + } else { + return Ok(()); + } + } else { + return Ok(()); + } + } +} + +// Describe a single timeseries. +async fn describe_timeseries( + client: &Client, + timeseries: &str, +) -> anyhow::Result<()> { + match timeseries.parse() { + Err(_) => eprintln!( + "Invalid timeseries name '{timeseries}, \ + use \\l to list available timeseries by name + " + ), + Ok(name) => { + if let Some(schema) = client.schema_for_timeseries(&name).await? { + let mut cols = + Vec::with_capacity(schema.field_schema.len() + 2); + let mut types = cols.clone(); + for field in schema.field_schema.iter() { + cols.push(field.name.clone()); + types.push(field.field_type.to_string()); + } + cols.push(special_idents::TIMESTAMP.into()); + types.push(special_idents::DATETIME64.into()); + + if schema.datum_type.is_histogram() { + cols.push(special_idents::START_TIME.into()); + types.push(special_idents::DATETIME64.into()); + + cols.push(special_idents::BINS.into()); + types.push( + special_idents::array_type_name_from_histogram_type( + schema.datum_type, + ) + .unwrap(), + ); + + cols.push(special_idents::COUNTS.into()); + types.push(special_idents::ARRAYU64.into()); + } else if schema.datum_type.is_cumulative() { + cols.push(special_idents::START_TIME.into()); + types.push(special_idents::DATETIME64.into()); + cols.push(special_idents::DATUM.into()); + types.push(schema.datum_type.to_string()); + } else { + cols.push(special_idents::DATUM.into()); + types.push(schema.datum_type.to_string()); + } + + let mut builder = tabled::builder::Builder::default(); + builder.push_record(cols); // first record is the header + builder.push_record(types); + println!( + "{}", + builder.build().with(tabled::settings::Style::psql()) + ); + } else { + eprintln!("No such timeseries: {timeseries}"); + } + } + } + Ok(()) +} + +/// Run the OxQL shell. +pub async fn oxql_shell( + address: IpAddr, + port: u16, + log: Logger, + opts: ShellOptions, +) -> anyhow::Result<()> { + let client = make_client(address, port, &log).await?; + + // A workaround to ensure the client has all available timeseries when the + // shell starts. + let dummy = "foo:bar".parse().unwrap(); + let _ = client.schema_for_timeseries(&dummy).await; + + // Create the line-editor. + let mut ed = Reedline::create(); + let prompt = DefaultPrompt::new( + DefaultPromptSegment::Basic("0x".to_string()), + DefaultPromptSegment::Empty, + ); + println!("Oximeter Query Language shell"); + println!(); + print_basic_commands(); + loop { + let sig = ed.read_line(&prompt); + match sig { + Ok(Signal::Success(buf)) => { + let cmd = buf.as_str().trim(); + match cmd { + "" => continue, + "\\?" 
| "\\h" | "help" => print_basic_commands(), + "\\q" | "quit" | "exit" => return Ok(()), + "\\l" | "\\d" => list_timeseries(&client).await?, + _ => { + if let Some(timeseries_name) = cmd.strip_prefix("\\d") { + if timeseries_name.is_empty() { + list_timeseries(&client).await?; + } else { + describe_timeseries( + &client, + timeseries_name + .trim() + .trim_end_matches(';'), + ) + .await?; + } + } else if let Some(stmt) = cmd.strip_prefix("\\ql") { + let stmt = stmt.trim(); + if stmt.is_empty() { + print_general_oxql_help(); + } else { + print_oxql_operation_help(stmt); + } + } else { + match client + .oxql_query(cmd.trim().trim_end_matches(';')) + .await + { + Ok(result) => { + print_query_summary( + &result, + opts.print_elapsed, + opts.print_summaries, + ); + print_tables(&result.tables); + } + Err(e) => { + eprintln!("{}", "Error".underlined().red()); + eprintln!("{e}"); + } + } + } + } + } + } + Ok(Signal::CtrlD) => return Ok(()), + Ok(Signal::CtrlC) => continue, + err => eprintln!("err: {err:?}"), + } + } +} + +fn print_query_summary( + result: &OxqlResult, + print_elapsed: bool, + print_summaries: bool, +) { + if !print_elapsed && !print_summaries { + return; + } + println!("{}", "Query summary".underlined().bold()); + println!(" {}: {}", "ID".bold(), result.query_id); + if print_elapsed { + println!(" {}: {:?}\n", "Total duration".bold(), result.total_duration); + } + if print_summaries { + println!(" {}:", "SQL queries".bold()); + for summary in result.query_summaries.iter() { + println!(" {}: {}", "ID".bold(), summary.id); + println!(" {}: {:?}", "Duration".bold(), summary.elapsed); + println!(" {}: {}", "Read".bold(), summary.io_summary.read); + println!(); + } + } +} + +fn print_tables(tables: &[Table]) { + for table in tables.iter() { + println!(); + println!("{}", table.name().underlined().bold()); + for timeseries in table.iter() { + if timeseries.points.is_empty() { + continue; + } + println!(); + for (name, value) in timeseries.fields.iter() { + println!(" {}: {}", name.as_str().bold(), value); + } + for point in timeseries.points.iter_points() { + println!(" {point}"); + } + } + } +} diff --git a/oximeter/db/src/bin/oxdb/sql.rs b/oximeter/db/src/bin/oxdb/sql.rs new file mode 100644 index 0000000000..d50a60f4d7 --- /dev/null +++ b/oximeter/db/src/bin/oxdb/sql.rs @@ -0,0 +1,298 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! SQL shell subcommand for `oxdb`. + +// Copyright 2024 Oxide Computer Company + +use crate::make_client; +use clap::Args; +use dropshot::EmptyScanParams; +use dropshot::WhichPage; +use oximeter_db::sql::function_allow_list; +use oximeter_db::sql::QueryResult; +use oximeter_db::sql::Table; +use oximeter_db::Client; +use oximeter_db::QuerySummary; +use reedline::DefaultPrompt; +use reedline::DefaultPromptSegment; +use reedline::Reedline; +use reedline::Signal; +use slog::Logger; +use std::net::IpAddr; + +fn print_basic_commands() { + println!("Basic commands:"); + println!(" \\?, \\h, help - Print this help"); + println!(" \\q, quit, exit, ^D - Exit the shell"); + println!(" \\l - List tables"); + println!(" \\d
- Describe a table"); + println!( + " \\f - List or describe ClickHouse SQL functions" + ); + println!(); + println!("Or try entering a SQL `SELECT` statement"); +} + +async fn list_virtual_tables(client: &Client) -> anyhow::Result<()> { + let mut page = WhichPage::First(EmptyScanParams {}); + let limit = 100.try_into().unwrap(); + loop { + let results = client.timeseries_schema_list(&page, limit).await?; + for schema in results.items.iter() { + println!("{}", schema.timeseries_name); + } + if results.next_page.is_some() { + if let Some(last) = results.items.last() { + page = WhichPage::Next(last.timeseries_name.clone()); + } else { + return Ok(()); + } + } else { + return Ok(()); + } + } +} + +async fn describe_virtual_table( + client: &Client, + table: &str, +) -> anyhow::Result<()> { + match table.parse() { + Err(_) => println!("Invalid timeseries name: {table}"), + Ok(name) => { + if let Some(schema) = client.schema_for_timeseries(&name).await? { + let mut cols = + Vec::with_capacity(schema.field_schema.len() + 2); + let mut types = cols.clone(); + for field in schema.field_schema.iter() { + cols.push(field.name.clone()); + types.push(field.field_type.to_string()); + } + cols.push("timestamp".into()); + types.push("DateTime64".into()); + + if schema.datum_type.is_histogram() { + cols.push("start_time".into()); + types.push("DateTime64".into()); + + cols.push("bins".into()); + types.push(format!( + "Array[{}]", + schema + .datum_type + .to_string() + .strip_prefix("Histogram") + .unwrap() + .to_lowercase(), + )); + + cols.push("counts".into()); + types.push("Array[u64]".into()); + } else if schema.datum_type.is_cumulative() { + cols.push("start_time".into()); + types.push("DateTime64".into()); + cols.push("datum".into()); + types.push(schema.datum_type.to_string()); + } else { + cols.push("datum".into()); + types.push(schema.datum_type.to_string()); + } + + let mut builder = tabled::builder::Builder::default(); + builder.push_record(cols); // first record is the header + builder.push_record(types); + println!( + "{}", + builder.build().with(tabled::settings::Style::psql()) + ); + } else { + println!("No such timeseries: {table}"); + } + } + } + Ok(()) +} + +#[derive(Clone, Debug, Args)] +pub struct ShellOptions { + /// Print query metadata. + #[clap(long = "metadata")] + print_metadata: bool, + /// Print the original SQL query. + #[clap(long = "original")] + print_original_query: bool, + /// Print the rewritten SQL query that is actually run on the DB. + #[clap(long = "rewritten")] + print_rewritten_query: bool, + /// Print the transformed query, but do not run it. 
+ #[clap(long)] + transform: Option, +} + +impl Default for ShellOptions { + fn default() -> Self { + Self { + print_metadata: true, + print_original_query: false, + print_rewritten_query: false, + transform: None, + } + } +} + +fn list_supported_functions() { + println!("Subset of ClickHouse SQL functions currently supported"); + println!( + "See https://clickhouse.com/docs/en/sql-reference/functions for more" + ); + println!(); + for func in function_allow_list().iter() { + println!(" {func}"); + } +} + +fn show_supported_function(name: &str) { + if let Some(func) = function_allow_list().iter().find(|f| f.name == name) { + println!("{}", func.name); + println!(" {}", func.usage); + println!(" {}", func.description); + } else { + println!("No supported function '{name}'"); + } +} + +fn print_sql_query(query: &str) { + println!( + "{}", + sqlformat::format( + &query, + &sqlformat::QueryParams::None, + sqlformat::FormatOptions { uppercase: true, ..Default::default() } + ) + ); + println!(); +} + +fn print_query_summary(table: &Table, summary: &QuerySummary) { + println!("Summary"); + println!(" Query ID: {}", summary.id); + println!(" Result rows: {}", table.rows.len()); + println!(" Time: {:?}", summary.elapsed); + println!(" Read: {}\n", summary.io_summary.read); +} + +pub async fn sql_shell( + address: IpAddr, + port: u16, + log: Logger, + opts: ShellOptions, +) -> anyhow::Result<()> { + let client = make_client(address, port, &log).await?; + + // A workaround to ensure the client has all available timeseries when the + // shell starts. + let dummy = "foo:bar".parse().unwrap(); + let _ = client.schema_for_timeseries(&dummy).await; + + // Possibly just transform the query, but do not execute it. + if let Some(query) = &opts.transform { + let transformed = client.transform_query(query).await?; + println!( + "{}", + sqlformat::format( + &transformed, + &sqlformat::QueryParams::None, + sqlformat::FormatOptions { + uppercase: true, + ..Default::default() + } + ) + ); + return Ok(()); + } + + let mut ed = Reedline::create(); + let prompt = DefaultPrompt::new( + DefaultPromptSegment::Basic("0x".to_string()), + DefaultPromptSegment::Empty, + ); + println!("Oximeter SQL shell"); + println!(); + print_basic_commands(); + loop { + let sig = ed.read_line(&prompt); + match sig { + Ok(Signal::Success(buf)) => { + let cmd = buf.as_str().trim(); + match cmd { + "" => continue, + "\\?" 
| "\\h" | "help" => print_basic_commands(), + "\\q" | "quit" | "exit" => return Ok(()), + "\\l" | "\\d" => list_virtual_tables(&client).await?, + _ => { + if let Some(table_name) = cmd.strip_prefix("\\d") { + if table_name.is_empty() { + list_virtual_tables(&client).await?; + } else { + describe_virtual_table( + &client, + table_name.trim().trim_end_matches(';'), + ) + .await?; + } + } else if let Some(func_name) = cmd.strip_prefix("\\f") + { + if func_name.is_empty() { + list_supported_functions(); + } else { + show_supported_function( + func_name.trim().trim_end_matches(';'), + ); + } + } else { + match client.query(&buf).await { + Err(e) => println!("Query failed: {e:#?}"), + Ok(QueryResult { + original_query, + rewritten_query, + summary, + table, + }) => { + println!(); + let mut builder = + tabled::builder::Builder::default(); + builder.push_record(&table.column_names); // first record is the header + for row in table.rows.iter() { + builder.push_record( + row.iter().map(ToString::to_string), + ); + } + if opts.print_original_query { + print_sql_query(&original_query); + } + if opts.print_rewritten_query { + print_sql_query(&rewritten_query); + } + println!( + "{}\n", + builder.build().with( + tabled::settings::Style::psql() + ) + ); + if opts.print_metadata { + print_query_summary(&table, &summary); + } + } + } + } + } + } + } + Ok(Signal::CtrlD) => return Ok(()), + Ok(Signal::CtrlC) => continue, + err => eprintln!("err: {err:?}"), + } + } +} diff --git a/oximeter/db/src/client/dbwrite.rs b/oximeter/db/src/client/dbwrite.rs new file mode 100644 index 0000000000..f21880f314 --- /dev/null +++ b/oximeter/db/src/client/dbwrite.rs @@ -0,0 +1,266 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Implementation of client methods that write to the ClickHouse database. + +// Copyright 2024 Oxide Computer Company + +use crate::client::Client; +use crate::model; +use crate::Error; +use oximeter::Sample; +use oximeter::TimeseriesName; +use slog::debug; +use std::collections::BTreeMap; +use std::collections::BTreeSet; + +#[derive(Debug)] +pub(super) struct UnrolledSampleRows { + /// The timeseries schema rows, keyed by timeseries name. + pub new_schema: BTreeMap, + /// The rows to insert in all the other tables, keyed by the table name. + pub rows: BTreeMap>, +} + +/// A trait allowing a [`Client`] to write data into the timeseries database. +/// +/// The vanilla [`Client`] object allows users to query the timeseries database, returning +/// timeseries samples corresponding to various filtering criteria. This trait segregates the +/// methods required for _writing_ new data into the database, and is intended only for use by the +/// `oximeter-collector` crate. +#[async_trait::async_trait] +pub trait DbWrite { + /// Insert the given samples into the database. + async fn insert_samples(&self, samples: &[Sample]) -> Result<(), Error>; + + /// Initialize the replicated telemetry database, creating tables as needed. + async fn init_replicated_db(&self) -> Result<(), Error>; + + /// Initialize a single node telemetry database, creating tables as needed. + async fn init_single_node_db(&self) -> Result<(), Error>; + + /// Wipe the ClickHouse database entirely from a single node set up. + async fn wipe_single_node_db(&self) -> Result<(), Error>; + + /// Wipe the ClickHouse database entirely from a replicated set up. 
+ async fn wipe_replicated_db(&self) -> Result<(), Error>; +} + +#[async_trait::async_trait] +impl DbWrite for Client { + /// Insert the given samples into the database. + async fn insert_samples(&self, samples: &[Sample]) -> Result<(), Error> { + debug!(self.log, "unrolling {} total samples", samples.len()); + let UnrolledSampleRows { new_schema, rows } = + self.unroll_samples(samples).await; + self.save_new_schema_or_remove(new_schema).await?; + self.insert_unrolled_samples(rows).await + } + + /// Initialize the replicated telemetry database, creating tables as needed. + async fn init_replicated_db(&self) -> Result<(), Error> { + debug!(self.log, "initializing ClickHouse database"); + self.run_many_sql_statements(include_str!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/schema/replicated/db-init.sql" + ))) + .await + } + + /// Wipe the ClickHouse database entirely from a replicated set up. + async fn wipe_replicated_db(&self) -> Result<(), Error> { + debug!(self.log, "wiping ClickHouse database"); + self.run_many_sql_statements(include_str!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/schema/replicated/db-wipe.sql" + ))) + .await + } + + /// Initialize a single node telemetry database, creating tables as needed. + async fn init_single_node_db(&self) -> Result<(), Error> { + debug!(self.log, "initializing ClickHouse database"); + self.run_many_sql_statements(include_str!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/schema/single-node/db-init.sql" + ))) + .await + } + + /// Wipe the ClickHouse database entirely from a single node set up. + async fn wipe_single_node_db(&self) -> Result<(), Error> { + debug!(self.log, "wiping ClickHouse database"); + self.run_many_sql_statements(include_str!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/schema/single-node/db-wipe.sql" + ))) + .await + } +} + +impl Client { + // Unroll each sample into its consituent rows, after verifying the schema. + // + // Note that this also inserts the schema into the internal cache, if it + // does not already exist there. + pub(super) async fn unroll_samples( + &self, + samples: &[Sample], + ) -> UnrolledSampleRows { + let mut seen_timeseries = BTreeSet::new(); + let mut rows = BTreeMap::new(); + let mut new_schema = BTreeMap::new(); + + for sample in samples.iter() { + match self.verify_or_cache_sample_schema(sample).await { + Err(_) => { + // Skip the sample, but otherwise do nothing. The error is logged in the above + // call. + continue; + } + Ok(None) => {} + Ok(Some((name, schema))) => { + debug!( + self.log, + "new timeseries schema"; + "timeseries_name" => %name, + "schema" => %schema + ); + new_schema.insert(name, schema); + } + } + + // Key on both the timeseries name and key, as timeseries may actually share keys. + let key = ( + sample.timeseries_name.as_str(), + crate::timeseries_key(sample), + ); + if !seen_timeseries.contains(&key) { + for (table_name, table_rows) in model::unroll_field_rows(sample) + { + rows.entry(table_name) + .or_insert_with(Vec::new) + .extend(table_rows); + } + } + + let (table_name, measurement_row) = + model::unroll_measurement_row(sample); + + rows.entry(table_name) + .or_insert_with(Vec::new) + .push(measurement_row); + + seen_timeseries.insert(key); + } + + UnrolledSampleRows { new_schema, rows } + } + + // Insert unrolled sample rows into the corresponding tables. 
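    //
    // For illustration only (the table name and row contents here are
    // hypothetical, not taken from this change), each per-table body sent to
    // ClickHouse has the shape:
    //
    //   INSERT INTO <table_name> FORMAT JSONEachRow
    //   {"timeseries_name":"a:b","timeseries_key":1,...}
    //   {"timeseries_name":"a:b","timeseries_key":2,...}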
+ async fn insert_unrolled_samples( + &self, + rows: BTreeMap>, + ) -> Result<(), Error> { + for (table_name, rows) in rows { + let body = format!( + "INSERT INTO {table_name} FORMAT JSONEachRow\n{row_data}\n", + table_name = table_name, + row_data = rows.join("\n") + ); + // TODO-robustness We've verified the schema, so this is likely a transient failure. + // But we may want to check the actual error condition, and, if possible, continue + // inserting any remaining data. + self.execute(body).await?; + debug!( + self.log, + "inserted rows into table"; + "n_rows" => rows.len(), + "table_name" => table_name, + ); + } + + // TODO-correctness We'd like to return all errors to clients here, and there may be as + // many as one per sample. It's not clear how to structure this in a way that's useful. + Ok(()) + } + + // Save new schema to the database, or remove them from the cache on + // failure. + // + // This attempts to insert the provided schema into the timeseries schema + // table. If that fails, those schema are _also_ removed from the internal + // cache. + // + // TODO-robustness There's still a race possible here. If two distinct clients receive new + // but conflicting schema, they will both try to insert those at some point into the schema + // tables. It's not clear how to handle this, since ClickHouse provides no transactions. + // This is unlikely to happen at this point, because the design is such that there will be + // a single `oximeter` instance, which has one client object, connected to a single + // ClickHouse server. But once we start replicating data, the window within which the race + // can occur is much larger, since it includes the time it takes ClickHouse to replicate + // data between nodes. + // + // NOTE: This is an issue even in the case where the schema don't conflict. Two clients may + // receive a sample with a new schema, and both would then try to insert that schema. + pub(super) async fn save_new_schema_or_remove( + &self, + new_schema: BTreeMap, + ) -> Result<(), Error> { + if !new_schema.is_empty() { + debug!( + self.log, + "inserting {} new timeseries schema", + new_schema.len() + ); + const APPROX_ROW_SIZE: usize = 64; + let mut body = String::with_capacity( + APPROX_ROW_SIZE + APPROX_ROW_SIZE * new_schema.len(), + ); + body.push_str("INSERT INTO "); + body.push_str(crate::DATABASE_NAME); + body.push_str(".timeseries_schema FORMAT JSONEachRow\n"); + for row_data in new_schema.values() { + body.push_str(row_data); + body.push('\n'); + } + + // Try to insert the schema. + // + // If this fails, be sure to remove the schema we've added from the + // internal cache. Since we check the internal cache first for + // schema, if we fail here but _don't_ remove the schema, we'll + // never end up inserting the schema, but we will insert samples. + if let Err(e) = self.execute(body).await { + debug!( + self.log, + "failed to insert new schema, removing from cache"; + "error" => ?e, + ); + let mut schema = self.schema.lock().await; + for name in new_schema.keys() { + schema + .remove(name) + .expect("New schema should have been cached"); + } + return Err(e); + } + } + Ok(()) + } + + // Run one or more SQL statements. + // + // This is intended to be used for the methods which run SQL from one of the + // SQL files in the crate, e.g., the DB initialization or update files. 
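    //
    // Note that the input is split naively on ';' and each non-empty piece is
    // run as its own statement, so a statement embedding ';' inside a string
    // literal would be mis-split. As a hypothetical example (not from this
    // change), the input
    //
    //   "CREATE DATABASE IF NOT EXISTS oximeter; SHOW DATABASES;"
    //
    // is executed as two separate statements.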
+ async fn run_many_sql_statements( + &self, + sql: impl AsRef, + ) -> Result<(), Error> { + for stmt in sql.as_ref().split(';').filter(|s| !s.trim().is_empty()) { + self.execute(stmt).await?; + } + Ok(()) + } +} diff --git a/oximeter/db/src/client.rs b/oximeter/db/src/client/mod.rs similarity index 88% rename from oximeter/db/src/client.rs rename to oximeter/db/src/client/mod.rs index abea11aa06..e92518ae08 100644 --- a/oximeter/db/src/client.rs +++ b/oximeter/db/src/client/mod.rs @@ -4,11 +4,19 @@ //! Rust client to ClickHouse database -// Copyright 2023 Oxide Computer Company +// Copyright 2024 Oxide Computer Company +pub(crate) mod dbwrite; +#[cfg(any(feature = "oxql", test))] +pub(crate) mod oxql; +pub(crate) mod query_summary; +#[cfg(any(feature = "sql", test))] +mod sql; + +pub use self::dbwrite::DbWrite; +use crate::client::query_summary::QuerySummary; use crate::model; use crate::query; -use crate::sql::RestrictedQuery; use crate::Error; use crate::Metric; use crate::Target; @@ -18,16 +26,13 @@ use crate::TimeseriesName; use crate::TimeseriesPageSelector; use crate::TimeseriesScanParams; use crate::TimeseriesSchema; -use async_trait::async_trait; use dropshot::EmptyScanParams; use dropshot::PaginationOrder; use dropshot::ResultsPage; use dropshot::WhichPage; -use indexmap::IndexMap; use oximeter::types::Sample; use regex::Regex; use regex::RegexBuilder; -use reqwest::header::HeaderMap; use slog::debug; use slog::error; use slog::info; @@ -44,7 +49,6 @@ use std::ops::Bound; use std::path::Path; use std::path::PathBuf; use std::sync::OnceLock; -use std::time::Duration; use std::time::Instant; use tokio::fs; use tokio::sync::Mutex; @@ -56,139 +60,11 @@ const CLICKHOUSE_DB_VERSION_MISSING: &'static str = #[usdt::provider(provider = "clickhouse_client")] mod probes { - fn query__start(_: &usdt::UniqueId, sql: &str) {} - fn query__done(_: &usdt::UniqueId) {} -} - -/// A count of bytes / rows accessed during a query. -#[derive(Clone, Copy, Debug)] -pub struct IoCount { - pub bytes: u64, - pub rows: u64, -} - -impl std::fmt::Display for IoCount { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{} rows ({} bytes)", self.rows, self.bytes) - } -} - -/// Summary of the I/O and duration of a query. -#[derive(Clone, Copy, Debug, serde::Deserialize)] -#[serde(try_from = "serde_json::Value")] -pub struct QuerySummary { - /// The bytes and rows read by the query. - pub read: IoCount, - /// The bytes and rows written by the query. - pub written: IoCount, -} - -impl TryFrom for QuerySummary { - type Error = Error; - - fn try_from(j: serde_json::Value) -> Result { - use serde_json::Map; - use serde_json::Value; - use std::str::FromStr; - - let Value::Object(map) = j else { - return Err(Error::Database(String::from( - "Expected a JSON object for a metadata summary", - ))); - }; + /// Fires when a SQL query begins, with the query string. 
+ fn sql__query__start(_: &usdt::UniqueId, sql: &str) {} - fn unpack_summary_value( - map: &Map, - key: &str, - ) -> Result - where - T: FromStr, - ::Err: std::error::Error, - { - let value = map.get(key).ok_or_else(|| { - Error::MissingHeaderKey { key: key.to_string() } - })?; - let Value::String(v) = value else { - return Err(Error::BadMetadata { - key: key.to_string(), - msg: String::from("Expected a string value"), - }); - }; - v.parse::().map_err(|e| Error::BadMetadata { - key: key.to_string(), - msg: e.to_string(), - }) - } - let rows_read: u64 = unpack_summary_value(&map, "read_rows")?; - let bytes_read: u64 = unpack_summary_value(&map, "read_bytes")?; - let rows_written: u64 = unpack_summary_value(&map, "written_rows")?; - let bytes_written: u64 = unpack_summary_value(&map, "written_bytes")?; - Ok(Self { - read: IoCount { bytes: bytes_read, rows: rows_read }, - written: IoCount { bytes: bytes_written, rows: rows_written }, - }) - } -} - -/// Basic metadata about the resource usage of a single SQL query. -#[derive(Clone, Copy, Debug)] -pub struct QueryMetadata { - /// The database-assigned query ID. - pub id: Uuid, - /// The total duration of the query (network plus execution). - pub elapsed: Duration, - /// Summary of the data read and written. - pub summary: QuerySummary, -} - -impl QueryMetadata { - fn from_headers( - elapsed: Duration, - headers: &HeaderMap, - ) -> Result { - fn get_header<'a>( - map: &'a HeaderMap, - key: &'a str, - ) -> Result<&'a str, Error> { - let hdr = map.get(key).ok_or_else(|| Error::MissingHeaderKey { - key: key.to_string(), - })?; - std::str::from_utf8(hdr.as_bytes()) - .map_err(|err| Error::Database(err.to_string())) - } - let summary = - serde_json::from_str(get_header(headers, "X-ClickHouse-Summary")?) - .map_err(|err| Error::Database(err.to_string()))?; - let id = get_header(headers, "X-ClickHouse-Query-Id")? - .parse() - .map_err(|err: uuid::Error| Error::Database(err.to_string()))?; - Ok(Self { id, elapsed, summary }) - } -} - -/// A tabular result from a SQL query against a timeseries. -#[derive(Clone, Debug, Default, serde::Serialize)] -pub struct Table { - /// The name of each column in the result set. - pub column_names: Vec, - /// The rows of the result set, one per column. - pub rows: Vec>, -} - -/// The full result of running a SQL query against a timeseries. -#[derive(Clone, Debug)] -pub struct QueryResult { - /// The query as written by the client. - pub original_query: String, - /// The rewritten query, run against the JOINed representation of the - /// timeseries. - /// - /// This is the query that is actually run in the database itself. - pub rewritten_query: String, - /// Metadata about the resource usage of the query. - pub metadata: QueryMetadata, - /// The result of the query, with column names and rows. - pub table: Table, + /// Fires when a SQL query ends, either in success or failure. + fn sql__query__done(_: &usdt::UniqueId) {} } /// A `Client` to the ClickHouse metrics database. @@ -229,76 +105,6 @@ impl Client { Ok(()) } - /// Transform a SQL query against a timeseries, but do not execute it. - pub async fn transform_query( - &self, - query: impl AsRef, - ) -> Result { - let restricted = RestrictedQuery::new(query.as_ref())?; - restricted.to_oximeter_sql(&*self.schema.lock().await) - } - - /// Run a SQL query against a timeseries. 
- pub async fn query( - &self, - query: impl AsRef, - ) -> Result { - let original_query = query.as_ref().trim_end_matches(';'); - let ox_sql = self.transform_query(original_query).await?; - let rewritten = format!("{ox_sql} FORMAT JSONEachRow"); - debug!( - self.log, - "rewrote restricted query"; - "original_sql" => &original_query, - "rewritten_sql" => &rewritten, - ); - let request = self - .client - .post(&self.url) - .query(&[ - ("output_format_json_quote_64bit_integers", "0"), - ("database", crate::DATABASE_NAME), - ]) - .body(rewritten.clone()); - let query_start = Instant::now(); - let response = handle_db_response( - request - .send() - .await - .map_err(|err| Error::DatabaseUnavailable(err.to_string()))?, - ) - .await?; - let metadata = QueryMetadata::from_headers( - query_start.elapsed(), - response.headers(), - )?; - let text = response.text().await.unwrap(); - let mut table = Table::default(); - for line in text.lines() { - let row = - serde_json::from_str::>( - line.trim(), - ) - .unwrap(); - if table.column_names.is_empty() { - table.column_names.extend(row.keys().cloned()) - } else { - assert!(table - .column_names - .iter() - .zip(row.keys()) - .all(|(k1, k2)| k1 == k2)); - } - table.rows.push(row.into_values().collect()); - } - Ok(QueryResult { - original_query: original_query.to_string(), - rewritten_query: rewritten, - metadata, - table, - }) - } - /// Select timeseries from criteria on the fields and start/end timestamps. pub async fn select_timeseries_with( &self, @@ -348,6 +154,7 @@ impl Client { Some(field_query) => { self.select_matching_timeseries_info(&field_query, &schema) .await? + .1 } None => BTreeMap::new(), }; @@ -367,6 +174,7 @@ impl Client { } } + /// Return a page of timeseries schema from the database. pub async fn list_timeseries( &self, page: &WhichPage, @@ -401,6 +209,7 @@ impl Client { Some(field_query) => { self.select_matching_timeseries_info(&field_query, &schema) .await? 
+ .1 } None => BTreeMap::new(), }; @@ -445,6 +254,7 @@ impl Client { concat!( "SELECT * ", "FROM {}.timeseries_schema ", + "ORDER BY timeseries_name ", "LIMIT {} ", "FORMAT JSONEachRow;", ), @@ -457,6 +267,7 @@ impl Client { concat!( "SELECT * FROM {}.timeseries_schema ", "WHERE timeseries_name > '{}' ", + "ORDER BY timeseries_name ", "LIMIT {} ", "FORMAT JSONEachRow;", ), @@ -466,7 +277,7 @@ impl Client { ) } }; - let body = self.execute_with_body(sql).await?; + let body = self.execute_with_body(sql).await?.1; let schema = body .lines() .map(|line| { @@ -848,14 +659,14 @@ impl Client { ); let version = match self.execute_with_body(sql).await { - Ok(body) if body.is_empty() => { + Ok((_, body)) if body.is_empty() => { warn!( self.log, "no version in database (treated as 'version 0')" ); 0 } - Ok(body) => body.trim().parse::().map_err(|err| { + Ok((_, body)) => body.trim().parse::().map_err(|err| { Error::Database(format!("Cannot read version: {err}")) })?, Err(Error::Database(err)) @@ -895,14 +706,13 @@ impl Client { "INSERT INTO {db_name}.version (*) VALUES ({version}, now());", db_name = crate::DATABASE_NAME, ); - self.execute_with_body(sql).await?; - Ok(()) + self.execute(sql).await } /// Verifies if instance is part of oximeter_cluster pub async fn is_oximeter_cluster(&self) -> Result { let sql = "SHOW CLUSTERS FORMAT JSONEachRow;"; - let res = self.execute_with_body(sql).await?; + let res = self.execute_with_body(sql).await?.1; Ok(res.contains("oximeter_cluster")) } @@ -972,8 +782,9 @@ impl Client { &self, field_query: &str, schema: &TimeseriesSchema, - ) -> Result, Error> { - let body = self.execute_with_body(field_query).await?; + ) -> Result<(QuerySummary, BTreeMap), Error> + { + let (summary, body) = self.execute_with_body(field_query).await?; let mut results = BTreeMap::new(); for line in body.lines() { let row: model::FieldSelectRow = serde_json::from_str(line) @@ -982,7 +793,7 @@ impl Client { model::parse_field_select_row(&row, schema); results.insert(id, (target, metric)); } - Ok(results) + Ok((summary, results)) } // Given information returned from `select_matching_timeseries_info`, select the actual @@ -996,7 +807,8 @@ impl Client { let mut timeseries_by_key = BTreeMap::new(); let keys = info.keys().copied().collect::>(); let measurement_query = query.measurement_query(&keys); - for line in self.execute_with_body(&measurement_query).await?.lines() { + for line in self.execute_with_body(&measurement_query).await?.1.lines() + { let (key, measurement) = model::parse_measurement_from_row(line, schema.datum_type); let timeseries = timeseries_by_key.entry(key).or_insert_with( @@ -1032,7 +844,10 @@ impl Client { // Execute a generic SQL statement, awaiting the response as text // // TODO-robustness This currently does no validation of the statement. - async fn execute_with_body(&self, sql: S) -> Result + async fn execute_with_body( + &self, + sql: S, + ) -> Result<(QuerySummary, String), Error> where S: AsRef, { @@ -1042,24 +857,50 @@ impl Client { "executing SQL query"; "sql" => &sql, ); + + // Run the SQL query itself. + // + // This code gets a bit convoluted, so that we can fire the USDT probe + // in all situations, even when the various fallible operations + // complete. let id = usdt::UniqueId::new(); - probes::query__start!(|| (&id, &sql)); - let response = handle_db_response( - self.client - .post(&self.url) - // See regression test `test_unquoted_64bit_integers` for details. 
- .query(&[("output_format_json_quote_64bit_integers", "0")]) - .body(sql) - .send() - .await - .map_err(|err| Error::DatabaseUnavailable(err.to_string()))?, - ) - .await? - .text() - .await - .map_err(|err| Error::Database(err.to_string())); - probes::query__done!(|| (&id)); - response + probes::sql__query__start!(|| (&id, &sql)); + let start = Instant::now(); + + // Submit the SQL request itself. + let response = self + .client + .post(&self.url) + .query(&[("output_format_json_quote_64bit_integers", "0")]) + .body(sql) + .send() + .await + .map_err(|err| { + probes::sql__query__done!(|| (&id)); + Error::DatabaseUnavailable(err.to_string()) + })?; + + // Convert the HTTP response into a database response. + let response = handle_db_response(response).await.map_err(|err| { + probes::sql__query__done!(|| (&id)); + err + })?; + + // Extract the query summary, measuring resource usage and duration. + let summary = + QuerySummary::from_headers(start.elapsed(), response.headers()) + .map_err(|err| { + probes::sql__query__done!(|| (&id)); + err + })?; + + // Extract the actual text of the response. + let text = response.text().await.map_err(|err| { + probes::sql__query__done!(|| (&id)); + Error::Database(err.to_string()) + })?; + probes::sql__query__done!(|| (&id)); + Ok((summary, text)) } // Get timeseries schema from the database. @@ -1095,7 +936,7 @@ impl Client { ) } }; - let body = self.execute_with_body(sql).await?; + let body = self.execute_with_body(sql).await?.1; if body.is_empty() { trace!(self.log, "no new timeseries schema in database"); } else { @@ -1113,167 +954,6 @@ impl Client { } Ok(()) } - - // Unroll each sample into its consituent rows, after verifying the schema. - // - // Note that this also inserts the schema into the internal cache, if it - // does not already exist there. - async fn unroll_samples(&self, samples: &[Sample]) -> UnrolledSampleRows { - let mut seen_timeseries = BTreeSet::new(); - let mut rows = BTreeMap::new(); - let mut new_schema = BTreeMap::new(); - - for sample in samples.iter() { - match self.verify_or_cache_sample_schema(sample).await { - Err(_) => { - // Skip the sample, but otherwise do nothing. The error is logged in the above - // call. - continue; - } - Ok(None) => {} - Ok(Some((name, schema))) => { - debug!( - self.log, - "new timeseries schema"; - "timeseries_name" => %name, - "schema" => %schema - ); - new_schema.insert(name, schema); - } - } - - // Key on both the timeseries name and key, as timeseries may actually share keys. - let key = ( - sample.timeseries_name.as_str(), - crate::timeseries_key(&sample), - ); - if !seen_timeseries.contains(&key) { - for (table_name, table_rows) in model::unroll_field_rows(sample) - { - rows.entry(table_name) - .or_insert_with(Vec::new) - .extend(table_rows); - } - } - - let (table_name, measurement_row) = - model::unroll_measurement_row(sample); - - rows.entry(table_name) - .or_insert_with(Vec::new) - .push(measurement_row); - - seen_timeseries.insert(key); - } - - UnrolledSampleRows { new_schema, rows } - } - - // Save new schema to the database, or remove them from the cache on - // failure. - // - // This attempts to insert the provided schema into the timeseries schema - // table. If that fails, those schema are _also_ removed from the internal - // cache. - // - // TODO-robustness There's still a race possible here. If two distinct clients receive new - // but conflicting schema, they will both try to insert those at some point into the schema - // tables. 
It's not clear how to handle this, since ClickHouse provides no transactions. - // This is unlikely to happen at this point, because the design is such that there will be - // a single `oximeter` instance, which has one client object, connected to a single - // ClickHouse server. But once we start replicating data, the window within which the race - // can occur is much larger, since it includes the time it takes ClickHouse to replicate - // data between nodes. - // - // NOTE: This is an issue even in the case where the schema don't conflict. Two clients may - // receive a sample with a new schema, and both would then try to insert that schema. - async fn save_new_schema_or_remove( - &self, - new_schema: BTreeMap, - ) -> Result<(), Error> { - if !new_schema.is_empty() { - debug!( - self.log, - "inserting {} new timeseries schema", - new_schema.len() - ); - const APPROX_ROW_SIZE: usize = 64; - let mut body = String::with_capacity( - APPROX_ROW_SIZE + APPROX_ROW_SIZE * new_schema.len(), - ); - body.push_str("INSERT INTO "); - body.push_str(crate::DATABASE_NAME); - body.push_str(".timeseries_schema FORMAT JSONEachRow\n"); - for row_data in new_schema.values() { - body.push_str(row_data); - body.push_str("\n"); - } - - // Try to insert the schema. - // - // If this fails, be sure to remove the schema we've added from the - // internal cache. Since we check the internal cache first for - // schema, if we fail here but _don't_ remove the schema, we'll - // never end up inserting the schema, but we will insert samples. - if let Err(e) = self.execute(body).await { - debug!( - self.log, - "failed to insert new schema, removing from cache"; - "error" => ?e, - ); - let mut schema = self.schema.lock().await; - for name in new_schema.keys() { - schema - .remove(name) - .expect("New schema should have been cached"); - } - return Err(e); - } - } - Ok(()) - } - - // Insert unrolled sample rows into the corresponding tables. - async fn insert_unrolled_samples( - &self, - rows: BTreeMap>, - ) -> Result<(), Error> { - for (table_name, rows) in rows { - let body = format!( - "INSERT INTO {table_name} FORMAT JSONEachRow\n{row_data}\n", - table_name = table_name, - row_data = rows.join("\n") - ); - // TODO-robustness We've verified the schema, so this is likely a transient failure. - // But we may want to check the actual error condition, and, if possible, continue - // inserting any remaining data. - self.execute(body).await?; - debug!( - self.log, - "inserted rows into table"; - "n_rows" => rows.len(), - "table_name" => table_name, - ); - } - - // TODO-correctness We'd like to return all errors to clients here, and there may be as - // many as one per sample. It's not clear how to structure this in a way that's useful. - Ok(()) - } - - // Run one or more SQL statements. - // - // This is intended to be used for the methods which run SQL from one of the - // SQL files in the crate, e.g., the DB initialization or update files. - async fn run_many_sql_statements( - &self, - sql: impl AsRef, - ) -> Result<(), Error> { - for stmt in sql.as_ref().split(';').filter(|s| !s.trim().is_empty()) { - self.execute(stmt).await?; - } - Ok(()) - } } // A regex used to validate supported schema updates. @@ -1297,87 +977,6 @@ fn schema_validation_regex() -> &'static Regex { .expect("Invalid regex") }) } - -#[derive(Debug)] -struct UnrolledSampleRows { - // The timeseries schema rows, keyed by timeseries name. - new_schema: BTreeMap, - // The rows to insert in all the other tables, keyed by the table name. 
- rows: BTreeMap>, -} - -/// A trait allowing a [`Client`] to write data into the timeseries database. -/// -/// The vanilla [`Client`] object allows users to query the timeseries database, returning -/// timeseries samples corresponding to various filtering criteria. This trait segregates the -/// methods required for _writing_ new data into the database, and is intended only for use by the -/// `oximeter-collector` crate. -#[async_trait] -pub trait DbWrite { - /// Insert the given samples into the database. - async fn insert_samples(&self, samples: &[Sample]) -> Result<(), Error>; - - /// Initialize the replicated telemetry database, creating tables as needed. - async fn init_replicated_db(&self) -> Result<(), Error>; - - /// Initialize a single node telemetry database, creating tables as needed. - async fn init_single_node_db(&self) -> Result<(), Error>; - - /// Wipe the ClickHouse database entirely from a single node set up. - async fn wipe_single_node_db(&self) -> Result<(), Error>; - - /// Wipe the ClickHouse database entirely from a replicated set up. - async fn wipe_replicated_db(&self) -> Result<(), Error>; -} - -#[async_trait] -impl DbWrite for Client { - /// Insert the given samples into the database. - async fn insert_samples(&self, samples: &[Sample]) -> Result<(), Error> { - debug!(self.log, "unrolling {} total samples", samples.len()); - let UnrolledSampleRows { new_schema, rows } = - self.unroll_samples(samples).await; - self.save_new_schema_or_remove(new_schema).await?; - self.insert_unrolled_samples(rows).await - } - - /// Initialize the replicated telemetry database, creating tables as needed. - async fn init_replicated_db(&self) -> Result<(), Error> { - debug!(self.log, "initializing ClickHouse database"); - self.run_many_sql_statements(include_str!( - "../schema/replicated/db-init.sql" - )) - .await - } - - /// Wipe the ClickHouse database entirely from a replicated set up. - async fn wipe_replicated_db(&self) -> Result<(), Error> { - debug!(self.log, "wiping ClickHouse database"); - self.run_many_sql_statements(include_str!( - "../schema/replicated/db-wipe.sql" - )) - .await - } - - /// Initialize a single node telemetry database, creating tables as needed. - async fn init_single_node_db(&self) -> Result<(), Error> { - debug!(self.log, "initializing ClickHouse database"); - self.run_many_sql_statements(include_str!( - "../schema/single-node/db-init.sql" - )) - .await - } - - /// Wipe the ClickHouse database entirely from a single node set up. - async fn wipe_single_node_db(&self) -> Result<(), Error> { - debug!(self.log, "wiping ClickHouse database"); - self.run_many_sql_statements(include_str!( - "../schema/single-node/db-wipe.sql" - )) - .await - } -} - // Return Ok if the response indicates success, otherwise return either the reqwest::Error, if this // is a client-side error, or the body of the actual error retrieved from ClickHouse if the error // was generated there. 
@@ -1397,6 +996,7 @@ async fn handle_db_response( #[cfg(test)] mod tests { + use super::dbwrite::UnrolledSampleRows; use super::*; use crate::model::OXIMETER_VERSION; use crate::query; @@ -1933,7 +1533,7 @@ mod tests { let mut result = String::from(""); let tries = 5; for _ in 0..tries { - result = client_2.execute_with_body(sql.clone()).await.unwrap(); + result = client_2.execute_with_body(sql.clone()).await.unwrap().1; if !result.contains("oximeter") { sleep(Duration::from_secs(1)).await; continue; @@ -1948,21 +1548,21 @@ mod tests { let sql = String::from( "INSERT INTO oximeter.measurements_string (datum) VALUES ('hiya');", ); - let result = client_2.execute_with_body(sql.clone()).await.unwrap(); + let result = client_2.execute_with_body(sql.clone()).await.unwrap().1; info!(log, "Inserted datum to client #2"; "sql" => sql, "result" => result); // Make sure replicas are synched let sql = String::from( "SYSTEM SYNC REPLICA oximeter.measurements_string_local;", ); - let result = client_1.execute_with_body(sql.clone()).await.unwrap(); + let result = client_1.execute_with_body(sql.clone()).await.unwrap().1; info!(log, "Synced replicas via client #1"; "sql" => sql, "result" => result); // Make sure data exists in the other replica let sql = String::from( "SELECT * FROM oximeter.measurements_string FORMAT JSONEachRow;", ); - let result = client_1.execute_with_body(sql.clone()).await.unwrap(); + let result = client_1.execute_with_body(sql.clone()).await.unwrap().1; info!(log, "Retrieved values via client #1"; "sql" => sql, "result" => result.clone()); assert!(result.contains("hiya")); @@ -2124,7 +1724,7 @@ mod tests { let sql = String::from( "SELECT * FROM oximeter.timeseries_schema FORMAT JSONEachRow;", ); - let result = client.execute_with_body(sql).await.unwrap(); + let result = client.execute_with_body(sql).await.unwrap().1; let schema = result .lines() .map(|line| { @@ -2253,7 +1853,8 @@ mod tests { table )) .await - .unwrap(); + .unwrap() + .1; let actual_count = body.lines().next().unwrap().trim().parse::().expect( "Expected a count of the number of rows from ClickHouse", @@ -2301,7 +1902,8 @@ mod tests { "SELECT toUInt64(1) AS foo FORMAT JSONEachRow;".to_string(), ) .await - .unwrap(); + .unwrap() + .1; let json: Value = serde_json::from_str(&output).unwrap(); assert_eq!(json["foo"], Value::Number(1u64.into())); @@ -3167,7 +2769,8 @@ mod tests { let body = client .execute_with_body(select_sql) .await - .expect("Failed to select field row"); + .expect("Failed to select field row") + .1; let actual_row: serde_json::Value = serde_json::from_str(&body) .expect("Failed to parse field row JSON"); println!("{actual_row:?}"); @@ -3507,7 +3110,8 @@ mod tests { let body = client .execute_with_body(select_sql) .await - .expect("Failed to select measurement row"); + .expect("Failed to select measurement row") + .1; let (_, actual_row) = crate::model::parse_measurement_from_row( &body, measurement.datum_type(), @@ -3528,6 +3132,7 @@ mod tests { ) .await .expect("Failed to SELECT from database") + .1 .lines() .count() } @@ -3749,7 +3354,7 @@ mod tests { // one. let response = client.execute_with_body( "SELECT COUNT() FROM oximeter.timeseries_schema FORMAT JSONEachRow; - ").await.unwrap(); + ").await.unwrap().1; assert_eq!(response.lines().count(), 1, "Expected exactly 1 schema"); assert_eq!(client.schema.lock().await.len(), 1); @@ -3766,7 +3371,7 @@ mod tests { // only the one schema. 
let response = client.execute_with_body( "SELECT COUNT() FROM oximeter.timeseries_schema FORMAT JSONEachRow; - ").await.unwrap(); + ").await.unwrap().1; assert_eq!( response.lines().count(), 1, @@ -3804,7 +3409,7 @@ mod tests { crate::DATABASE_NAME, crate::model::DbDatumType::from(ty), ); - let res = client.execute_with_body(sql).await.unwrap(); + let res = client.execute_with_body(sql).await.unwrap().1; let count = res.trim().parse::().unwrap(); assert_eq!(count, 0); } @@ -4099,7 +3704,8 @@ mod tests { " )) .await - .unwrap(); + .unwrap() + .1; let mut lines = body.lines(); assert_eq!(lines.next().unwrap(), "\"col0\",\"UInt8\""); assert_eq!(lines.next().unwrap(), "\"col1\",\"UInt16\""); @@ -4319,7 +3925,8 @@ mod tests { " )) .await - .unwrap(); + .unwrap() + .1; let mut lines = body.lines(); assert_eq!(lines.next().unwrap(), "\"col0\",\"UInt8\""); assert_eq!(lines.next().unwrap(), "\"col1\",\"UInt16\""); @@ -4480,7 +4087,7 @@ mod tests { crate::DATABASE_NAME, crate::model::DbFieldType::from(ty), ); - let res = client.execute_with_body(sql).await.unwrap(); + let res = client.execute_with_body(sql).await.unwrap().1; let count = res.trim().parse::().unwrap(); assert_eq!(count, 0); } @@ -4488,6 +4095,7 @@ mod tests { logctx.cleanup_successful(); } + #[cfg(any(feature = "sql", test))] #[tokio::test] async fn test_sql_query_output() { let logctx = test_setup_log("test_sql_query_output"); diff --git a/oximeter/db/src/client/oxql.rs b/oximeter/db/src/client/oxql.rs new file mode 100644 index 0000000000..9da4abd007 --- /dev/null +++ b/oximeter/db/src/client/oxql.rs @@ -0,0 +1,1281 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Client methods for running OxQL queries against the timeseries database. + +// Copyright 2024 Oxide Computer Company + +use super::query_summary::QuerySummary; +use crate::client::Client; +use crate::model; +use crate::oxql; +use crate::oxql::ast::table_ops::filter; +use crate::oxql::ast::table_ops::filter::Filter; +use crate::query::field_table_name; +use crate::Error; +use crate::Metric; +use crate::Target; +use crate::TimeseriesKey; +use oximeter::TimeseriesSchema; +use slog::debug; +use slog::trace; +use slog::Logger; +use std::collections::BTreeMap; +use std::time::Duration; +use std::time::Instant; +use uuid::Uuid; + +#[usdt::provider(provider = "clickhouse_client")] +mod probes { + /// Fires when an OxQL query starts, with the query ID and string. + fn oxql__query__start(_: &usdt::UniqueId, _: &Uuid, query: &str) {} + + /// Fires when an OxQL query ends, either in success or failure. + fn oxql__query__done(_: &usdt::UniqueId, _: &Uuid) {} + + /// Fires when an OxQL table operation starts, with the query ID and details + /// of the operation itself. + fn oxql__table__op__start(_: &usdt::UniqueId, _: &Uuid, op: &str) {} + + /// Fires when an OxQL table operation ends. + fn oxql__table__op__done(_: &usdt::UniqueId, _: &Uuid) {} +} + +/// The full result of an OxQL query. +#[derive(Clone, Debug)] +pub struct OxqlResult { + /// A query ID assigned to this OxQL query. + pub query_id: Uuid, + + /// The total duration of the OxQL query. + /// + /// This includes the time to run SQL queries against the database, and the + /// internal processing for each transformation in the query pipeline. + pub total_duration: Duration, + + /// The summary for each SQL query run against the ClickHouse database. 
+ /// + /// Each OxQL query translates into many calls to ClickHouse. We fetch the + /// fields; count the number of samples; and finally fetch the samples + /// themselves. In the future, more may be needed as well. + /// + /// This returns a list of summaries, one for each SQL query that was run. + /// It includes the ClickHouse-assigned query ID for correlation and looking + /// up in the logs. + pub query_summaries: Vec, + + /// The list of OxQL tables returned from the query. + pub tables: Vec, +} + +/// The maximum number of data values fetched from the database for an OxQL +/// query. +// +// The `Client::oxql_query()` API is currently unpaginated. It's also not clear +// _how_ to paginate it. The objects contributing to the size of the returned +// value, the actual data points, are nested several layers deep, inside the +// `Timeseries` and `Table`s. A page size is supposed to refer to the top-level +// object, so we'd need to flatten this hierarchy for that to work. That's +// undesirable because it will lead to a huge amount of duplication of the table +// / timeseries-level information, once for each point. +// +// Also, since we cannot use a cursor-based pagination, we're stuck with +// limit-offset. That means we may need to run substantially all of the query, +// just to know how to retrieve the next page, sidestepping one of the main +// goals of pagination (to limit resource usage). +// +// Note that it's also hard or impossible to _predict_ how much data a query +// will use. We need to count the number of rows in the database, for example, +// _and also_ understand how table operations might change that size. For +// example, alignment is allowed to upsample the data (within limits), so the +// number of rows in the database are not the only factor. +// +// This limit here is a crude attempt to limit just the raw data fetched from +// ClickHouse itself. For any OxQL query, we may retrieve many measurements from +// the database. Each time we do so, we increment a counter, and compare it to +// this. If we exceed it, the whole query fails. +pub const MAX_DATABASE_ROWS: u64 = 1_000_000; + +// When running an OxQL query, we may need to separately run several field +// queries, to get the consistent keys independently for a range of time. +// +// This type stores the predicates used to generate the keys, and the keys +// consistent with it. +struct ConsistentKeyGroup { + predicates: Option, + consistent_keys: BTreeMap, +} + +impl Client { + /// Run a OxQL query. + pub async fn oxql_query( + &self, + query: impl AsRef, + ) -> Result { + // TODO-security: Need a way to implement authz checks for things like + // viewing resources in another project or silo. + // + // I think one way to do that is look at the predicates and make sure + // they refer to things the user has access to. Another is to add some + // implicit predicates here, indicating the subset of fields that the + // query should be able to access. + // + // This probably means we'll need to parse the query in Nexus, so that + // we can attach the other filters ourselves. + // + // See https://github.com/oxidecomputer/omicron/issues/5298. 
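        //
        // As a usage sketch only (the timeseries name and filter below are
        // hypothetical, and this snippet is not part of this change), a
        // caller might run:
        //
        //   let result = client
        //       .oxql_query("get some_target:some_metric | filter timestamp > @now() - 5m")
        //       .await?;
        //   for table in &result.tables {
        //       println!("{}: {} timeseries", table.name(), table.n_timeseries());
        //   }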
+ let query = query.as_ref(); + let parsed_query = oxql::Query::new(query)?; + let query_id = Uuid::new_v4(); + let query_log = + self.log.new(slog::o!("query_id" => query_id.to_string())); + debug!( + query_log, + "parsed OxQL query"; + "query" => query, + "parsed_query" => ?parsed_query, + ); + let id = usdt::UniqueId::new(); + probes::oxql__query__start!(|| (&id, &query_id, query)); + let mut total_rows_fetched = 0; + let result = self + .run_oxql_query( + &query_log, + query_id, + parsed_query, + &mut total_rows_fetched, + None, + ) + .await; + probes::oxql__query__done!(|| (&id, &query_id)); + result + } + + /// Rewrite the predicates from an OxQL query so that they apply only to the + /// field tables. + fn rewrite_predicate_for_fields( + schema: &TimeseriesSchema, + preds: &filter::Filter, + ) -> Result, Error> { + // Walk the set of predicates, keeping those which apply to this schema. + match &preds.expr { + filter::FilterExpr::Simple(inner) => { + // If the predicate names a field in this timeseries schema, + // return that predicate printed as a string. If not, we return + // None. + let Some(field_schema) = + schema.schema_for_field(inner.ident.as_str()) + else { + return Ok(None); + }; + if !inner.value_type_is_compatible_with_field( + field_schema.field_type, + ) { + return Err(Error::from(anyhow::anyhow!( + "Expression for field {} is not compatible with \ + its type {}", + field_schema.name, + field_schema.field_type, + ))); + } + Ok(Some(inner.as_db_safe_string())) + } + filter::FilterExpr::Compound(inner) => { + let left_pred = + Self::rewrite_predicate_for_fields(schema, &inner.left)?; + let right_pred = + Self::rewrite_predicate_for_fields(schema, &inner.right)?; + let out = match (left_pred, right_pred) { + (Some(left), Some(right)) => Some(format!( + "{}({left}, {right})", + inner.op.as_db_function_name() + )), + (Some(single), None) | (None, Some(single)) => Some(single), + (None, None) => None, + }; + Ok(out) + } + } + } + + /// Rewrite the predicates from an OxQL query so that they apply only to the + /// measurement table. + fn rewrite_predicate_for_measurements( + schema: &TimeseriesSchema, + preds: &oxql::ast::table_ops::filter::Filter, + ) -> Result, Error> { + // Walk the set of predicates, keeping those which apply to this schema. + match &preds.expr { + filter::FilterExpr::Simple(inner) => { + // The relevant columns on which we filter depend on the datum + // type of the timeseries. All timeseries support "timestamp". + let ident = inner.ident.as_str(); + if ident == "timestamp" { + if matches!( + inner.value, + oxql::ast::literal::Literal::Timestamp(_) + ) { + return Ok(Some(inner.as_db_safe_string())); + } + return Err(Error::from(anyhow::anyhow!( + "Literal cannot be compared with a timestamp" + ))); + } + + // We do not currently support filtering in the database on + // values, only the `timestamp` and possibly `start_time` (if + // the metric is cumulative). + if ident == "start_time" { + if !schema.datum_type.is_cumulative() { + return Err(Error::from(anyhow::anyhow!( + "Start time can only be compared if the metric \ + is cumulative, but found one of type {}", + schema.datum_type, + ))); + } + if matches!( + inner.value, + oxql::ast::literal::Literal::Timestamp(_) + ) { + return Ok(Some(inner.as_db_safe_string())); + } + return Err(Error::from(anyhow::anyhow!( + "Literal cannot be compared with a timestamp" + ))); + } + + // We'll delegate to the actual table op to filter on any of the + // data columns. 
+ Ok(None) + } + filter::FilterExpr::Compound(inner) => { + let left_pred = Self::rewrite_predicate_for_measurements( + schema, + &inner.left, + )?; + let right_pred = Self::rewrite_predicate_for_measurements( + schema, + &inner.right, + )?; + let out = match (left_pred, right_pred) { + (Some(left), Some(right)) => Some(format!( + "{}({left}, {right})", + inner.op.as_db_function_name() + )), + (Some(single), None) | (None, Some(single)) => Some(single), + (None, None) => None, + }; + Ok(out) + } + } + } + + // Run one query. + // + // If the query is flat, run it directly. If it's nested, run each of them; + // concatenate the results; and then apply all the remaining + // transformations. + #[async_recursion::async_recursion] + async fn run_oxql_query( + &self, + query_log: &Logger, + query_id: Uuid, + query: oxql::Query, + total_rows_fetched: &mut u64, + outer_predicates: Option, + ) -> Result { + let split = query.split(); + if let oxql::ast::SplitQuery::Nested { subqueries, transformations } = + split + { + trace!( + query_log, + "OxQL query contains subqueries, running recursively" + ); + // Create the new set of outer predicates to pass in to the + // subquery, by merging the previous outer predicates with those of + // the transformation portion of this nested query. + let new_outer_predicates = + query.coalesced_predicates(outer_predicates.clone()); + + // Run each subquery recursively, and extend the results + // accordingly. + let mut query_summaries = Vec::with_capacity(subqueries.len()); + let mut tables = Vec::with_capacity(subqueries.len()); + let query_start = Instant::now(); + for subq in subqueries.into_iter() { + let res = self + .run_oxql_query( + query_log, + query_id, + subq, + total_rows_fetched, + new_outer_predicates.clone(), + ) + .await?; + query_summaries.extend(res.query_summaries); + tables.extend(res.tables); + } + for tr in transformations.into_iter() { + trace!( + query_log, + "applying query transformation"; + "transformation" => ?tr, + ); + let id = usdt::UniqueId::new(); + probes::oxql__table__op__start!(|| ( + &id, + &query_id, + format!("{tr:?}") + )); + let new_tables = tr.apply(&tables, query.end_time()); + probes::oxql__table__op__done!(|| (&id, &query_id)); + tables = new_tables?; + } + let result = OxqlResult { + query_id, + total_duration: query_start.elapsed(), + query_summaries, + tables, + }; + return Ok(result); + } + + // This is a flat query, let's just run it directly. First step is + // getting the schema itself. + let query_start = Instant::now(); + let oxql::ast::SplitQuery::Flat(query) = split else { + unreachable!(); + }; + let name = query.timeseries_name(); + let Some(schema) = self.schema_for_timeseries(name).await? else { + return Err(Error::TimeseriesNotFound(name.to_string())); + }; + debug!( + query_log, + "running flat OxQL query"; + "query" => ?query, + "timeseries_name" => %name, + ); + + // Fetch the consistent fields (including keys) for this timeseries, + // including filtering them based on the predicates in the query + // that apply to this timeseries in particular. We also need to merge + // them in with the predicates passed in from a possible outer query. + let preds = query.coalesced_predicates(outer_predicates.clone()); + debug!( + query_log, + "coalesced predicates from flat query"; + "outer_predicates" => ?&outer_predicates, + "coalesced" => ?&preds, + ); + + // We generally run a few SQL queries for each OxQL query: + // + // - Some number of queries to fetch the timeseries keys that are + // consistent with it. 
+ // - Fetch the consistent samples. + // + // Note that there are often 2 or more queries needed for the first + // case. In particular, there is one query required for each independent + // time range in the query (including when a time range isn't + // specified). + // + // For example, consider the filter operation: + // + // ``` + // filter some_predicate || (timestamp > @now() - 1m && other_predicate) + // ``` + // + // That is, we return all timepoints for things where `some_predicate` + // is true, and only the last minute for those satisfying + // `other_predicate`. If we simply drop the timestamp filter, and run + // the two predicates conjoined, we would erroneously return only the + // last minute for everything, including those satisfying + // `some_predicate`. + // + // So instead, we need to run one query for each of those, fetch the + // keys associated with it, and then independently select the + // measurements satisfying both the time range and key-consistency + // constraints. Thankfully that can be done in one query, albeit a + // complicated one. + // + // Convert any outer predicates to DNF, and split into disjoint key + // groups for the measurement queries. + let disjoint_predicates = if let Some(preds) = preds.as_ref() { + let simplified = preds.simplify_to_dnf()?; + debug!( + query_log, + "simplified filtering predicates to disjunctive normal form"; + "original" => %preds, + "DNF" => %simplified, + ); + simplified + .flatten_disjunctions() + .into_iter() + .map(Option::Some) + .collect() + } else { + // There are no outer predicates, so we have 1 disjoint key group, + // with no predicates. + vec![None] + }; + + // Run each query group indepdendently, keeping the predicates and the + // timeseries keys corresponding to it. + let mut consistent_key_groups = + Vec::with_capacity(1 + disjoint_predicates.len()); + let mut query_summaries = + Vec::with_capacity(1 + disjoint_predicates.len()); + for predicates in disjoint_predicates.into_iter() { + debug!( + query_log, + "running disjoint query predicate"; + "predicate" => predicates.as_ref().map(|s| s.to_string()).unwrap_or("none".into()), + ); + let all_fields_query = + self.all_fields_query(&schema, predicates.as_ref())?; + let (summary, consistent_keys) = self + .select_matching_timeseries_info(&all_fields_query, &schema) + .await?; + debug!( + query_log, + "fetched information for matching timeseries keys"; + "n_keys" => consistent_keys.len(), + ); + query_summaries.push(summary); + + // If there are no consistent keys, move to the next independent + // query chunk. + if consistent_keys.is_empty() { + continue; + } + + // Push the disjoint filter itself, plus the keys consistent with + // it. + consistent_key_groups + .push(ConsistentKeyGroup { predicates, consistent_keys }); + } + + // If there are no consistent keys _at all_, we can just return an empty + // table. + if consistent_key_groups.is_empty() { + let result = OxqlResult { + query_id, + total_duration: query_start.elapsed(), + query_summaries, + tables: vec![oxql::Table::new(schema.timeseries_name.as_str())], + }; + return Ok(result); + } + + // Fetch the consistent measurements for this timeseries, by key group. + // + // We'll keep track of all the measurements for this timeseries schema, + // organized by timeseries key. That's because we fetch all consistent + // samples at once, so we get many concrete _timeseries_ in the returned + // response, even though they're all from the same schema. 
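        //
        // Continuing the example above, that filter is already in disjunctive
        // normal form, so it flattens into two disjuncts:
        //
        //   1. some_predicate
        //   2. timestamp > @now() - 1m && other_predicate
        //
        // and we run one field query per disjunct, collecting the timeseries
        // keys consistent with each one separately.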
+ let (summary, timeseries_by_key) = self + .select_matching_samples( + query_log, + &schema, + &consistent_key_groups, + total_rows_fetched, + ) + .await?; + query_summaries.push(summary); + + // At this point, let's construct a set of tables and run the results + // through the transformation pipeline. + let mut tables = vec![oxql::Table::from_timeseries( + schema.timeseries_name.as_str(), + timeseries_by_key.into_values(), + )?]; + + let transformations = query.transformations(); + debug!( + query_log, + "constructed OxQL table, starting transformation pipeline"; + "name" => tables[0].name(), + "n_timeseries" => tables[0].n_timeseries(), + "n_transformations" => transformations.len(), + ); + for tr in transformations { + trace!( + query_log, + "applying query transformation"; + "transformation" => ?tr, + ); + let id = usdt::UniqueId::new(); + probes::oxql__table__op__start!(|| ( + &id, + &query_id, + format!("{tr:?}") + )); + let new_tables = tr.apply(&tables, query.end_time()); + probes::oxql__table__op__done!(|| (&id, &query_id)); + tables = new_tables?; + } + let result = OxqlResult { + query_id, + total_duration: query_start.elapsed(), + query_summaries, + tables, + }; + Ok(result) + } + + // Select samples matching the set of predicates and consistent keys. + // + // Note that this also implements the conversion from cumulative to gauge + // samples, depending on how data was requested. + async fn select_matching_samples( + &self, + query_log: &Logger, + schema: &TimeseriesSchema, + consistent_key_groups: &[ConsistentKeyGroup], + total_rows_fetched: &mut u64, + ) -> Result<(QuerySummary, BTreeMap), Error> + { + // We'll create timeseries for each key on the fly. To enable computing + // deltas, we need to track the last measurement we've seen as well. + let mut measurements_by_key: BTreeMap<_, Vec<_>> = BTreeMap::new(); + let measurements_query = self.measurements_query( + schema, + consistent_key_groups, + total_rows_fetched, + )?; + let mut n_measurements: u64 = 0; + let (summary, body) = + self.execute_with_body(&measurements_query).await?; + for line in body.lines() { + let (key, measurement) = + model::parse_measurement_from_row(line, schema.datum_type); + measurements_by_key.entry(key).or_default().push(measurement); + n_measurements += 1; + } + debug!( + query_log, + "fetched measurements for OxQL query"; + "n_keys" => measurements_by_key.len(), + "n_measurements" => n_measurements, + ); + + // At this point, we need to check that we're still within our maximum + // result size. The measurement query we issued limited the returned + // result to 1 more than the remainder on our allotment. So if we get + // exactly that limit, we know that there are more rows than we can + // allow. We don't know how many more, but we don't care, and we fail + // the query regardless. + update_total_rows_and_check( + query_log, + total_rows_fetched, + n_measurements, + )?; + + // At this point, we no longer care about the consistent_key groups. We + // throw away the predicates that distinguished them, and merge the + // timeseries information together. + let info = consistent_key_groups + .iter() + .map(|group| group.consistent_keys.clone()) + .reduce(|mut acc, current| { + acc.extend(current); + acc + }) + .expect("Should have at least one key-group for every query"); + + // Remove the last measurement, returning just the keys and timeseries. 
+ let mut out = BTreeMap::new(); + for (key, measurements) in measurements_by_key.into_iter() { + // Constuct a new timeseries, from the target/metric info. + let (target, metric) = info.get(&key).unwrap(); + let mut timeseries = oxql::Timeseries::new( + target + .fields + .iter() + .chain(metric.fields.iter()) + .map(|field| (field.name.clone(), field.value.clone())), + oxql::point::DataType::try_from(schema.datum_type)?, + if schema.datum_type.is_cumulative() { + oxql::point::MetricType::Delta + } else { + oxql::point::MetricType::Gauge + }, + )?; + + // Covert its oximeter measurements into OxQL data types. + let points = if schema.datum_type.is_cumulative() { + oxql::point::Points::delta_from_cumulative(&measurements)? + } else { + oxql::point::Points::gauge_from_gauge(&measurements)? + }; + timeseries.points = points; + debug!( + query_log, + "inserted new OxQL timeseries"; + "key" => key, + "metric_type" => ?timeseries.points.metric_type(), + "n_points" => timeseries.points.len(), + ); + out.insert(key, timeseries); + } + Ok((summary, out)) + } + + fn measurements_query( + &self, + schema: &TimeseriesSchema, + consistent_key_groups: &[ConsistentKeyGroup], + total_rows_fetched: &mut u64, + ) -> Result { + use std::fmt::Write; + + // Build the base query, which just selects the timeseries by name based + // on the datum type. + let mut query = self.measurements_query_raw(schema.datum_type); + query.push_str(" WHERE timeseries_name = '"); + write!(query, "{}", schema.timeseries_name).unwrap(); + query.push('\''); + + // Filter down the fields to those which apply to the data itself, which + // includes the timestamps and data values. The supported fields here + // depend on the datum type. + // + // We join all the consistent key groups with OR, which mirrors how they + // were split originally. + let all_predicates = consistent_key_groups + .iter() + .map(|group| { + // Write out the predicates on the measurements themselves, + // which really refers to the timestamps (and possibly start + // times). + let maybe_predicates = group + .predicates + .as_ref() + .map(|preds| { + Self::rewrite_predicate_for_measurements(schema, preds) + }) + .transpose()? + .flatten(); + + // Push the predicate that selects the timeseries keys, which + // are unique to this group. + let maybe_key_set = if group.consistent_keys.len() > 0 { + let mut chunk = String::from("timeseries_key IN ("); + let keys = group + .consistent_keys + .keys() + .map(ToString::to_string) + .collect::>() + .join(","); + chunk.push_str(&keys); + chunk.push(')'); + Some(chunk) + } else { + None + }; + + let chunk = match (maybe_predicates, maybe_key_set) { + (Some(preds), None) => preds, + (None, Some(key_set)) => key_set, + (Some(preds), Some(key_set)) => { + format!("({preds} AND {key_set})") + } + (None, None) => String::new(), + }; + Ok(chunk) + }) + .collect::, Error>>()? + .join(" OR "); + if !all_predicates.is_empty() { + query.push_str(" AND ("); + query.push_str(&all_predicates); + query.push(')'); + } + + // Always impose a strong order on these fields. + // + // The tables are all sorted by: + // + // - timeseries_name + // - timeseries_key + // - start_time, if present + // - timestamp + // + // We care most about the timestamp ordering, since that is assumed (and + // asserted) by downstream table operations. We use the full sort order + // of the table, however, to make things the most efficient. 
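        //
        // Put together, the tail of the generated measurement query looks
        // roughly like the following (illustrative only; `start_time` appears
        // only for cumulative types, and the limit depends on how many rows
        // earlier queries have already fetched):
        //
        //   ... ORDER BY timeseries_key, start_time, timestamp
        //   LIMIT 1000001
        //   FORMAT JSONEachRow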
+ query.push_str(" ORDER BY timeseries_key"); + if schema.datum_type.is_cumulative() { + query.push_str(", start_time"); + } + query.push_str(", timestamp"); + + // Push a limit clause, which restricts the number of records we could + // return. + // + // This is used to ensure that we never go above the limit in + // `MAX_RESULT_SIZE`. That restricts the _total_ number of rows we want + // to retch from the database. So we set our limit to be one more than + // the remainder on our allotment. If we get exactly as many as we set + // in the limit, then we fail the query because there are more rows that + // _would_ be returned. We don't know how many more, but there is at + // least 1 that pushes us over the limit. This prevents tricky + // TOCTOU-like bugs where we need to check the limit twice, and improves + // performance, since we don't return much more than we could possibly + // handle. + let remainder = MAX_DATABASE_ROWS - *total_rows_fetched; + query.push_str(" LIMIT "); + write!(query, "{}", remainder + 1).unwrap(); + + // Finally, use JSON format. + query.push_str(" FORMAT "); + query.push_str(crate::DATABASE_SELECT_FORMAT); + Ok(query) + } + + fn measurements_query_raw( + &self, + datum_type: oximeter::DatumType, + ) -> String { + let value_columns = if datum_type.is_histogram() { + "timeseries_key, start_time, timestamp, bins, counts" + } else if datum_type.is_cumulative() { + "timeseries_key, start_time, timestamp, datum" + } else { + "timeseries_key, timestamp, datum" + }; + format!( + "SELECT {} \ + FROM {}.{}", + value_columns, + crate::DATABASE_NAME, + crate::query::measurement_table_name(datum_type), + ) + } + + fn all_fields_query( + &self, + schema: &TimeseriesSchema, + preds: Option<&oxql::ast::table_ops::filter::Filter>, + ) -> Result { + // Filter down the fields to those which apply to this timeseries + // itself, and rewrite as a DB-safe WHERE clause. + let preds_for_fields = preds + .map(|p| Self::rewrite_predicate_for_fields(schema, p)) + .transpose()? + .flatten(); + let (already_has_where, mut query) = self.all_fields_query_raw(schema); + if let Some(preds) = preds_for_fields { + // If the raw field has only a single select query, then we've + // already added a "WHERE" clause. Simply tack these predicates onto + // that one. + if already_has_where { + query.push_str(" AND "); + } else { + query.push_str(" WHERE "); + } + query.push_str(&preds); + } + query.push_str(" FORMAT "); + query.push_str(crate::DATABASE_SELECT_FORMAT); + Ok(query) + } + + fn all_fields_query_raw( + &self, + schema: &TimeseriesSchema, + ) -> (bool, String) { + match schema.field_schema.len() { + 0 => unreachable!(), + 1 => { + let field_schema = schema.field_schema.first().unwrap(); + ( + true, + format!( + "SELECT DISTINCT timeseries_key, field_value AS {field_name} \ + FROM {db_name}.{field_table} \ + WHERE \ + timeseries_name = '{timeseries_name}' AND \ + field_name = '{field_name}'", + field_name = field_schema.name, + db_name = crate::DATABASE_NAME, + field_table = field_table_name(field_schema.field_type), + timeseries_name = schema.timeseries_name, + ) + ) + } + _ => { + let mut top_level_columns = + Vec::with_capacity(schema.field_schema.len()); + let mut field_subqueries = + Vec::with_capacity(schema.field_schema.len()); + + // Select each field value, aliasing it to its field name. 
+ for field_schema in schema.field_schema.iter() { + top_level_columns.push(format!( + "filter_on_{}.field_value AS {}", + field_schema.name, field_schema.name + )); + field_subqueries.push(( + format!( + "SELECT DISTINCT timeseries_key, field_value \ + FROM {db_name}.{field_table} \ + WHERE \ + timeseries_name = '{timeseries_name}' AND \ + field_name = '{field_name}' \ + ", + db_name = crate::DATABASE_NAME, + field_table = + field_table_name(field_schema.field_type), + timeseries_name = schema.timeseries_name, + field_name = field_schema.name, + ), + format!("filter_on_{}", field_schema.name), + )); + } + + // Write the top-level select statement, starting by selecting + // the timeseries key from the first field schema. + let mut out = format!( + "SELECT {}.timeseries_key AS timeseries_key, {} FROM ", + field_subqueries[0].1, + top_level_columns.join(", "), + ); + + // Then add all the subqueries selecting each field. + // + // We need to add these, along with their aliases. The first + // such subquery has no join conditions, but the later ones all + // refer to the previous via: + // + // `ON .timeseries_key = .timeseries_key` + for (i, (subq, alias)) in field_subqueries.iter().enumerate() { + // Push the subquery itself, aliased. + out.push('('); + out.push_str(subq); + out.push_str(") AS "); + out.push_str(alias); + + // Push the join conditions. + if i > 0 { + let previous_alias = &field_subqueries[i - 1].1; + out.push_str(" ON "); + out.push_str(alias); + out.push_str(".timeseries_key = "); + out.push_str(previous_alias); + out.push_str(".timeseries_key"); + } + + // Push the "INNER JOIN" expression itself, for all but the + // last subquery. + if i < field_subqueries.len() - 1 { + out.push_str(" INNER JOIN "); + } + } + (false, out) + } + } + } +} + +// Helper to update the number of total rows fetched so far, and check it's +// still under the limit. +fn update_total_rows_and_check( + query_log: &Logger, + total_rows_fetched: &mut u64, + count: u64, +) -> Result<(), Error> { + *total_rows_fetched += count; + if *total_rows_fetched > MAX_DATABASE_ROWS { + return Err(Error::from(anyhow::anyhow!( + "Query requires fetching more than the \ + current limit of {} data points from the \ + timeseries database", + MAX_DATABASE_ROWS, + ))); + } + trace!( + query_log, + "verified OxQL measurement query returns few enough results"; + "n_new_measurements" => count, + "n_total" => *total_rows_fetched, + "limit" => MAX_DATABASE_ROWS, + ); + Ok(()) +} + +#[cfg(test)] +mod tests { + use chrono::{DateTime, Utc}; + use dropshot::test_util::LogContext; + use omicron_test_utils::dev::clickhouse::ClickHouseInstance; + use omicron_test_utils::dev::test_setup_log; + use oximeter::Sample; + use oximeter::{types::Cumulative, FieldValue}; + use std::collections::BTreeMap; + use std::time::Duration; + + use crate::{ + oxql::{point::Points, Table, Timeseries}, + Client, DbWrite, + }; + + #[derive( + Clone, Debug, Eq, PartialEq, PartialOrd, Ord, oximeter::Target, + )] + struct SomeTarget { + name: String, + index: u32, + } + + #[derive(Clone, Debug, oximeter::Metric)] + struct SomeMetric { + foo: i32, + datum: Cumulative, + } + + #[derive(Clone, Debug)] + #[allow(dead_code)] + struct TestData { + targets: Vec, + // Note that we really want all the samples per metric _field_, not the + // full metric. That would give us a 1-element sample array for each. 
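+        // Each entry is keyed by the (target, value of the metric's `foo`
+        // field) pair, and holds every sample for that one timeseries.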
+        samples_by_timeseries: BTreeMap<(SomeTarget, i32), Vec<Sample>>,
+        first_timestamp: DateTime<Utc>,
+    }
+
+    struct TestContext {
+        logctx: LogContext,
+        clickhouse: ClickHouseInstance,
+        client: Client,
+        test_data: TestData,
+    }
+
+    impl TestContext {
+        async fn cleanup_successful(mut self) {
+            self.clickhouse
+                .cleanup()
+                .await
+                .expect("Failed to cleanup ClickHouse server");
+            self.logctx.cleanup_successful();
+        }
+    }
+
+    const N_SAMPLES_PER_TIMESERIES: usize = 16;
+    const SAMPLE_INTERVAL: Duration = Duration::from_secs(1);
+    const SHIFT: Duration = Duration::from_secs(1);
+
+    fn format_timestamp(t: DateTime<Utc>) -> String {
+        format!("{}", t.format("%Y-%m-%dT%H:%M:%S.%f"))
+    }
+
+    fn generate_test_samples() -> TestData {
+        // We'll test with 4 different targets, each with two values for its
+        // fields.
+        let mut targets = Vec::with_capacity(4);
+        let names = &["first-target", "second-target"];
+        let indices = 1..3;
+        for (name, index) in itertools::iproduct!(names, indices) {
+            let target = SomeTarget { name: name.to_string(), index };
+            targets.push(target);
+        }
+
+        // Create a start time for all samples.
+        //
+        // IMPORTANT: There is a TTL of 30 days on all data currently. I would
+        // love this to be a fixed, well-known start time, to make tests
+        // easier, but that's in conflict with the TTL. Instead, we'll use
+        // midnight on the current day, and then store it in the test data
+        // context.
+        let first_timestamp =
+            Utc::now().date_naive().and_hms_opt(0, 0, 0).unwrap().and_utc();
+
+        // For simplicity, we'll also assume all the cumulative measurements
+        // start at the first timestamp as well.
+        let datum = Cumulative::with_start_time(first_timestamp, 0);
+
+        // We'll create two separate metrics for each target, with 16 samples
+        // each.
+        let foos = [-1, 1];
+        let mut samples_by_timeseries = BTreeMap::new();
+        let mut timeseries_index = 0;
+        for target in targets.iter() {
+            for foo in foos.iter() {
+                // Shift this timeseries relative to the others, to ensure we
+                // have some different timestamps.
+                let timeseries_start =
+                    first_timestamp + timeseries_index * SHIFT;
+
+                // Create the metric for this value of `foo`, starting from a
+                // count of 0.
+                let mut metric = SomeMetric { foo: *foo, datum };
+
+                // Create all the samples, incrementing the datum and sample
+                // time.
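+                // Concretely, this produces samples at t0, t0 + 1s, ...,
+                // t0 + 15s with cumulative values 0, 1, ..., 15, where t0 is
+                // this timeseries's shifted start time.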
+ for i in 0..N_SAMPLES_PER_TIMESERIES { + let sample_time = + timeseries_start + SAMPLE_INTERVAL * i as u32; + let sample = Sample::new_with_timestamp( + sample_time, + target, + &metric, + ) + .unwrap(); + samples_by_timeseries + .entry((target.clone(), *foo)) + .or_insert_with(|| { + Vec::with_capacity(N_SAMPLES_PER_TIMESERIES) + }) + .push(sample); + metric.datum += 1; + } + timeseries_index += 1; + } + } + TestData { targets, samples_by_timeseries, first_timestamp } + } + + async fn setup_oxql_test(name: &str) -> TestContext { + let logctx = test_setup_log(name); + let db = ClickHouseInstance::new_single_node(&logctx, 0) + .await + .expect("Failed to start ClickHouse"); + let client = Client::new(db.address, &logctx.log); + client + .init_single_node_db() + .await + .expect("Failed to init single-node oximeter database"); + let test_data = generate_test_samples(); + let samples: Vec<_> = test_data + .samples_by_timeseries + .values() + .flatten() + .cloned() + .collect(); + client + .insert_samples(&samples) + .await + .expect("Failed to insert test data"); + TestContext { logctx, clickhouse: db, client, test_data } + } + + #[tokio::test] + async fn test_get_entire_table() { + let ctx = setup_oxql_test("test_get_entire_table").await; + let query = "get some_target:some_metric"; + let result = ctx + .client + .oxql_query(query) + .await + .expect("failed to run OxQL query"); + assert_eq!(result.tables.len(), 1, "Should be exactly 1 table"); + let table = result.tables.get(0).unwrap(); + assert_eq!( + table.n_timeseries(), + ctx.test_data.samples_by_timeseries.len(), + "Should have fetched every timeseries" + ); + assert!( + table.iter().all(|t| t.points.len() == N_SAMPLES_PER_TIMESERIES), + "Should have fetched all points for all timeseries" + ); + + // Let's build the expected point array, from each timeseries we + // inserted. + let mut matched_timeseries = 0; + for ((target, foo), samples) in + ctx.test_data.samples_by_timeseries.iter() + { + let measurements: Vec<_> = + samples.iter().map(|s| s.measurement.clone()).collect(); + let expected_points = Points::delta_from_cumulative(&measurements) + .expect( + "failed to create expected points from inserted measurements", + ); + let expected_timeseries = + find_timeseries_in_table(&table, target, foo) + .expect("Table did not contain an expected timeseries"); + assert_eq!( + expected_timeseries.points, expected_points, + "Did not reconstruct the correct points for this timeseries" + ); + matched_timeseries += 1; + } + assert_eq!(matched_timeseries, table.len()); + assert_eq!( + matched_timeseries, + ctx.test_data.samples_by_timeseries.len() + ); + + ctx.cleanup_successful().await; + } + + #[tokio::test] + async fn test_get_one_timeseries() { + let ctx = setup_oxql_test("test_get_one_timeseries").await; + + // Specify exactly one timeseries we _want_ to fetch, by picking the + // first timeseries we inserted. 
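+        // The full query then looks something like:
+        //
+        //   get some_target:some_metric
+        //       | filter name == 'first-target' && index == 1 && foo == -1
+        //
+        // with the filter values taken from that first timeseries.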
+ let ((expected_target, expected_foo), expected_samples) = + ctx.test_data.samples_by_timeseries.first_key_value().unwrap(); + let query = format!( + "get some_target:some_metric | filter {}", + exact_filter_for(expected_target, *expected_foo) + ); + let result = ctx + .client + .oxql_query(&query) + .await + .expect("failed to run OxQL query"); + assert_eq!(result.tables.len(), 1, "Should be exactly 1 table"); + let table = result.tables.get(0).unwrap(); + assert_eq!( + table.n_timeseries(), + 1, + "Should have fetched exactly the target timeseries" + ); + assert!( + table.iter().all(|t| t.points.len() == N_SAMPLES_PER_TIMESERIES), + "Should have fetched all points for all timeseries" + ); + + let expected_timeseries = + find_timeseries_in_table(&table, expected_target, expected_foo) + .expect("Table did not contain expected timeseries"); + let measurements: Vec<_> = + expected_samples.iter().map(|s| s.measurement.clone()).collect(); + let expected_points = Points::delta_from_cumulative(&measurements) + .expect("failed to build expected points from measurements"); + assert_eq!( + expected_points, expected_timeseries.points, + "Did not reconstruct the correct points for the one \ + timeseries the query fetched" + ); + + ctx.cleanup_successful().await; + } + + // In this test, we'll fetch the entire history of one timeseries, and only + // the last few samples of another. + // + // This checks that we correctly do complex logical operations that require + // fetching different sets of fields at different times. + #[tokio::test] + async fn test_get_entire_timeseries_and_part_of_another() { + usdt::register_probes().unwrap(); + let ctx = + setup_oxql_test("test_get_entire_timeseries_and_part_of_another") + .await; + + let mut it = ctx.test_data.samples_by_timeseries.iter(); + let (entire, only_part) = (it.next().unwrap(), it.next().unwrap()); + + let entire_filter = exact_filter_for(&entire.0 .0, entire.0 .1); + let only_part_filter = + exact_filter_for(&only_part.0 .0, only_part.0 .1); + let start_timestamp = only_part.1[6].measurement.timestamp(); + let only_part_timestamp_filter = format_timestamp(start_timestamp); + + let query = format!( + "get some_target:some_metric | filter ({}) || (timestamp >= @{} && {})", + entire_filter, + only_part_timestamp_filter, + only_part_filter, + ); + let result = ctx + .client + .oxql_query(&query) + .await + .expect("failed to run OxQL query"); + assert_eq!(result.tables.len(), 1, "Should be exactly 1 table"); + let table = result.tables.get(0).unwrap(); + assert_eq!( + table.n_timeseries(), + 2, + "Should have fetched exactly the two target timeseries" + ); + + // Check that we fetched the entire timeseries for the first one. + let expected_timeseries = + find_timeseries_in_table(table, &entire.0 .0, &entire.0 .1) + .expect("failed to fetch all of the first timeseries"); + let measurements: Vec<_> = + entire.1.iter().map(|s| s.measurement.clone()).collect(); + let expected_points = Points::delta_from_cumulative(&measurements) + .expect("failed to build expected points"); + assert_eq!( + expected_timeseries.points, expected_points, + "Did not collect the entire set of points for the first timeseries", + ); + + // And that we only get the last portion of the second timeseries. 
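+        // That is, only the samples whose timestamps are at or after
+        // `start_timestamp` (the seventh sample onward) should be present.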
+ let expected_timeseries = + find_timeseries_in_table(table, &only_part.0 .0, &only_part.0 .1) + .expect("failed to fetch part of the second timeseries"); + let measurements: Vec<_> = only_part + .1 + .iter() + .filter_map(|sample| { + let meas = &sample.measurement; + if meas.timestamp() >= start_timestamp { + Some(meas.clone()) + } else { + None + } + }) + .collect(); + let expected_points = Points::delta_from_cumulative(&measurements) + .expect("failed to build expected points"); + assert_eq!( + expected_timeseries.points, expected_points, + "Did not collect the last few points for the second timeseries", + ); + + ctx.cleanup_successful().await; + } + + // Return an OxQL filter item that will exactly select the provided + // timeseries by its target / metric. + fn exact_filter_for(target: &SomeTarget, foo: i32) -> String { + format!( + "name == '{}' && index == {} && foo == {}", + target.name, target.index, foo, + ) + } + + // Given a table from an OxQL query, look up the timeseries for the inserted + // target / metric, if it exists + fn find_timeseries_in_table<'a>( + table: &'a Table, + target: &'a SomeTarget, + foo: &'a i32, + ) -> Option<&'a Timeseries> { + for timeseries in table.iter() { + let fields = ×eries.fields; + + // Look up each field in turn, and compare it. + let FieldValue::String(val) = fields.get("name")? else { + unreachable!(); + }; + if val != &target.name { + continue; + } + let FieldValue::U32(val) = fields.get("index")? else { + unreachable!(); + }; + if val != &target.index { + continue; + } + let FieldValue::I32(val) = fields.get("foo")? else { + unreachable!(); + }; + if val != foo { + continue; + } + + // We done matched it. + return Some(timeseries); + } + None + } +} diff --git a/oximeter/db/src/client/query_summary.rs b/oximeter/db/src/client/query_summary.rs new file mode 100644 index 0000000000..b00a11c38e --- /dev/null +++ b/oximeter/db/src/client/query_summary.rs @@ -0,0 +1,123 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Types representing summaries of queries against the timeseries database. + +// Copyright 2024 Oxide Computer Company + +use crate::Error; +use reqwest::header::HeaderMap; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use std::time::Duration; +use uuid::Uuid; + +/// A count of bytes / rows accessed during a query. +#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] +pub struct IoCount { + /// The number of bytes accessed. + pub bytes: u64, + /// The number of rows accessed. + pub rows: u64, +} + +impl std::fmt::Display for IoCount { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{} rows ({} bytes)", self.rows, self.bytes) + } +} + +/// Summary of the I/O resources used by a query. +#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] +#[serde(try_from = "serde_json::Value")] +pub struct IoSummary { + /// The bytes and rows read by the query. + pub read: IoCount, + /// The bytes and rows written by the query. 
+ pub written: IoCount, +} + +impl TryFrom for IoSummary { + type Error = Error; + + fn try_from(j: serde_json::Value) -> Result { + use serde_json::Map; + use serde_json::Value; + use std::str::FromStr; + + let Value::Object(map) = j else { + return Err(Error::Database(String::from( + "Expected a JSON object for a metadata summary", + ))); + }; + + fn unpack_summary_value( + map: &Map, + key: &str, + ) -> Result + where + T: FromStr, + ::Err: std::error::Error, + { + let value = map.get(key).ok_or_else(|| { + Error::MissingHeaderKey { key: key.to_string() } + })?; + let Value::String(v) = value else { + return Err(Error::BadMetadata { + key: key.to_string(), + msg: String::from("Expected a string value"), + }); + }; + v.parse::().map_err(|e| Error::BadMetadata { + key: key.to_string(), + msg: e.to_string(), + }) + } + let rows_read: u64 = unpack_summary_value(&map, "read_rows")?; + let bytes_read: u64 = unpack_summary_value(&map, "read_bytes")?; + let rows_written: u64 = unpack_summary_value(&map, "written_rows")?; + let bytes_written: u64 = unpack_summary_value(&map, "written_bytes")?; + Ok(Self { + read: IoCount { bytes: bytes_read, rows: rows_read }, + written: IoCount { bytes: bytes_written, rows: rows_written }, + }) + } +} + +/// Basic metadata about the resource usage of a single SQL query. +#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] +pub struct QuerySummary { + /// The database-assigned query ID. + pub id: Uuid, + /// The total duration of the query (network plus execution). + pub elapsed: Duration, + /// Summary of the data read and written. + pub io_summary: IoSummary, +} + +impl QuerySummary { + /// Construct a SQL query summary from the headers received from the DB. + pub(crate) fn from_headers( + elapsed: Duration, + headers: &HeaderMap, + ) -> Result { + fn get_header<'a>( + map: &'a HeaderMap, + key: &'a str, + ) -> Result<&'a str, Error> { + let hdr = map.get(key).ok_or_else(|| Error::MissingHeaderKey { + key: key.to_string(), + })?; + std::str::from_utf8(hdr.as_bytes()) + .map_err(|err| Error::Database(err.to_string())) + } + let summary = + serde_json::from_str(get_header(headers, "X-ClickHouse-Summary")?) + .map_err(|err| Error::Database(err.to_string()))?; + let id = get_header(headers, "X-ClickHouse-Query-Id")? + .parse() + .map_err(|err: uuid::Error| Error::Database(err.to_string()))?; + Ok(Self { id, elapsed, io_summary: summary }) + } +} diff --git a/oximeter/db/src/client/sql.rs b/oximeter/db/src/client/sql.rs new file mode 100644 index 0000000000..236faa7aa4 --- /dev/null +++ b/oximeter/db/src/client/sql.rs @@ -0,0 +1,104 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Client methods for running SQL queries againts timeseries themselves. +//! +//! This implements a prototype system for creating "virtual tables" from each +//! timeseries, letting us run SQL queries directly against them. These tables +//! are constructed via huge joins, which effectively reconstruct the entire +//! history of samples as received from the producers. Each row is the original +//! sample. This denormalization comes at a big cost, both in cycles and memory +//! usage, since we need to build the entire join in ClickHouse and send it all +//! to the client for deserialization. +//! +//! Thus this prototype is very useful for development, running analyses on +//! small datasets. 
It's less helpful on real deployments, where the size of +//! data makes this approach prohibitive. + +// Copyright 2024 Oxide Computer Company + +use super::query_summary::QuerySummary; +pub use crate::sql::RestrictedQuery; +use crate::Error; +use crate::{ + client::Client, + sql::{QueryResult, Table}, +}; +pub use indexmap::IndexMap; +use slog::debug; +pub use std::time::Instant; + +impl Client { + /// Transform a SQL query against a timeseries, but do not execute it. + pub async fn transform_query( + &self, + query: impl AsRef, + ) -> Result { + let restricted = RestrictedQuery::new(query.as_ref())?; + restricted.to_oximeter_sql(&*self.schema.lock().await) + } + + /// Run a SQL query against a timeseries. + pub async fn query( + &self, + query: impl AsRef, + ) -> Result { + use crate::client::handle_db_response; + + let original_query = query.as_ref().trim_end_matches(';'); + let ox_sql = self.transform_query(original_query).await?; + let rewritten = format!("{ox_sql} FORMAT JSONEachRow"); + debug!( + self.log, + "rewrote restricted query"; + "original_sql" => &original_query, + "rewritten_sql" => &rewritten, + ); + let request = self + .client + .post(&self.url) + .query(&[ + ("output_format_json_quote_64bit_integers", "0"), + ("database", crate::DATABASE_NAME), + ]) + .body(rewritten.clone()); + let query_start = Instant::now(); + let response = handle_db_response( + request + .send() + .await + .map_err(|err| Error::DatabaseUnavailable(err.to_string()))?, + ) + .await?; + let summary = QuerySummary::from_headers( + query_start.elapsed(), + response.headers(), + )?; + let text = response.text().await.unwrap(); + let mut table = Table::default(); + for line in text.lines() { + let row = + serde_json::from_str::>( + line.trim(), + ) + .unwrap(); + if table.column_names.is_empty() { + table.column_names.extend(row.keys().cloned()) + } else { + assert!(table + .column_names + .iter() + .zip(row.keys()) + .all(|(k1, k2)| k1 == k2)); + } + table.rows.push(row.into_values().collect()); + } + Ok(QueryResult { + original_query: original_query.to_string(), + rewritten_query: rewritten, + summary, + table, + }) + } +} diff --git a/oximeter/db/src/lib.rs b/oximeter/db/src/lib.rs index 24f7d8c2d0..642612b8db 100644 --- a/oximeter/db/src/lib.rs +++ b/oximeter/db/src/lib.rs @@ -4,7 +4,7 @@ //! Tools for interacting with the control plane telemetry database. 
-// Copyright 2023 Oxide Computer Company +// Copyright 2024 Oxide Computer Company use crate::query::StringFieldSelector; use chrono::DateTime; @@ -32,14 +32,17 @@ use thiserror::Error; mod client; pub mod model; +#[cfg(feature = "oxql")] +pub mod oxql; pub mod query; +#[cfg(any(feature = "sql", test))] pub mod sql; +#[cfg(feature = "oxql")] +pub use client::oxql::OxqlResult; +pub use client::query_summary::QuerySummary; pub use client::Client; pub use client::DbWrite; -pub use client::QueryMetadata; -pub use client::QueryResult; -pub use client::Table; pub use model::OXIMETER_VERSION; #[derive(Debug, Error)] @@ -58,7 +61,7 @@ pub enum Error { BadMetadata { key: String, msg: String }, /// An error interacting with the telemetry database - #[error("Error interacting with telemetry database")] + #[error("Error interacting with telemetry database: {0}")] Database(String), /// A schema provided when collecting samples did not match the expected schema @@ -134,8 +137,20 @@ pub enum Error { #[error("Schema update versions must be sequential without gaps")] NonSequentialSchemaVersions, + #[cfg(any(feature = "sql", test))] #[error("SQL error")] Sql(#[from] sql::Error), + + #[cfg(any(feature = "oxql", test))] + #[error(transparent)] + Oxql(oxql::Error), +} + +#[cfg(any(feature = "oxql", test))] +impl From for Error { + fn from(e: crate::oxql::Error) -> Self { + Error::Oxql(e) + } } impl From for TimeseriesSchema { diff --git a/oximeter/db/src/model.rs b/oximeter/db/src/model.rs index b1b45eabc4..414ad25ba7 100644 --- a/oximeter/db/src/model.rs +++ b/oximeter/db/src/model.rs @@ -1600,30 +1600,23 @@ pub(crate) fn parse_field_select_row( ) -> (TimeseriesKey, Target, Metric) { assert_eq!( row.fields.len(), - 2 * schema.field_schema.len(), - "Expected pairs of (field_name, field_value) from the field query" + schema.field_schema.len(), + "Expected the same number of fields in each row as the schema itself", ); let (target_name, metric_name) = schema.component_names(); let mut target_fields = Vec::new(); let mut metric_fields = Vec::new(); - let mut actual_fields = row.fields.values(); + let mut actual_fields = row.fields.iter(); for _ in 0..schema.field_schema.len() { // Extract the field name from the row and find a matching expected field. 
- let actual_field_name = actual_fields + let (actual_field_name, actual_field_value) = actual_fields .next() .expect("Missing a field name from a field select query"); - let name = actual_field_name - .as_str() - .expect("Expected a string field name") - .to_string(); - let expected_field = schema.schema_for_field(&name).expect( + let expected_field = schema.schema_for_field(actual_field_name).expect( "Found field with name that is not part of the timeseries schema", ); // Parse the field value as the expected type - let actual_field_value = actual_fields - .next() - .expect("Missing a field value from a field select query"); let value = match expected_field.field_type { FieldType::Bool => { FieldValue::Bool(bool::from(DbBool::from( @@ -1726,7 +1719,7 @@ pub(crate) fn parse_field_select_row( ) } }; - let field = Field { name, value }; + let field = Field { name: actual_field_name.to_string(), value }; match expected_field.source { FieldSource::Target => target_fields.push(field), FieldSource::Metric => metric_fields.push(field), diff --git a/oximeter/db/src/oxql/ast/cmp.rs b/oximeter/db/src/oxql/ast/cmp.rs new file mode 100644 index 0000000000..ea33056c1f --- /dev/null +++ b/oximeter/db/src/oxql/ast/cmp.rs @@ -0,0 +1,76 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! An AST node describing comparison operators + +// Copyright 2024 Oxide Computer Company + +use std::fmt; + +/// Comparison operators. +// TODO-completeness: Operators for other types, like IP containment ('<<'). +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum Comparison { + /// Equality comparison. + Eq, + /// Inequality comparison. + Ne, + /// Greater-than comparison + Gt, + /// Greater-than or equals comparison + Ge, + /// Lesser-than comparison + Lt, + /// Lesser-than or equals comparison + Le, + /// Regular expression pattern matching. + Like, +} + +impl Comparison { + // Return the _function name_ of the comparison that is safe for use in + // ClickHouse. + // + // Note that we're always using the functional form for these comparisons, + // even when they have obvious operators. E.g., we return `"equals"` for the + // `Comparison::Eq` rather than `"=="`. + // + // This is to normalize the different comparisons we support, which do not + // all have operator formats. `Comparison::Like` is the best example, but we + // may also want to support things like IP address containment. While DBs + // like PostgreSQL have the `<<` operator for that, ClickHouse supports only + // the function `isIPAddressInRange()`. + // + // One consequence of this is that the caller needs to wrap the argument in + // parentheses manually. 
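+    // For example, a filter like `foo == 7` is rendered in the functional
+    // form, roughly `equals(foo, 7)`, rather than with the `==` operator
+    // (illustrative; the caller supplies the parentheses and arguments).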
+ pub(crate) fn as_db_function_name(&self) -> &'static str { + match self { + Comparison::Eq => "equals", + Comparison::Ne => "notEquals", + Comparison::Gt => "greater", + Comparison::Ge => "greaterOrEquals", + Comparison::Lt => "less", + Comparison::Le => "lessOrEquals", + Comparison::Like => "match", + } + } +} + +impl fmt::Display for Comparison { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{}", + match self { + Comparison::Eq => "==", + Comparison::Ne => "!=", + Comparison::Gt => ">", + Comparison::Ge => ">=", + Comparison::Lt => "<", + Comparison::Le => "<=", + Comparison::Like => "~=", + } + ) + } +} diff --git a/oximeter/db/src/oxql/ast/grammar.rs b/oximeter/db/src/oxql/ast/grammar.rs new file mode 100644 index 0000000000..00a0e6e0fe --- /dev/null +++ b/oximeter/db/src/oxql/ast/grammar.rs @@ -0,0 +1,1334 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Grammar for the Oximeter Query Language (OxQL). + +// Copyright 2024 Oxide Computer + +peg::parser! { + pub grammar query_parser() for str { + use crate::oxql::ast::cmp::Comparison; + use crate::oxql::ast::table_ops::align::Align; + use crate::oxql::ast::table_ops::align::AlignmentMethod; + use crate::oxql::ast::table_ops::filter::SimpleFilter; + use crate::oxql::ast::table_ops::filter::FilterExpr; + use crate::oxql::ast::table_ops::filter::Filter; + use crate::oxql::ast::table_ops::filter::CompoundFilter; + use crate::oxql::ast::table_ops::get::Get; + use crate::oxql::ast::table_ops::group_by::GroupBy; + use crate::oxql::ast::ident::Ident; + use crate::oxql::ast::literal::Literal; + use crate::oxql::ast::logical_op::LogicalOp; + use crate::oxql::ast::Query; + use crate::oxql::ast::table_ops::join::Join; + use crate::oxql::ast::table_ops::GroupedTableOp; + use crate::oxql::ast::table_ops::BasicTableOp; + use crate::oxql::ast::table_ops::TableOp; + use crate::oxql::ast::table_ops::group_by::Reducer; + use crate::oxql::ast::literal::duration_consts; + use oximeter::TimeseriesName; + use std::time::Duration; + use uuid::Uuid; + use chrono::Utc; + use chrono::DateTime; + use chrono::NaiveDateTime; + use chrono::NaiveDate; + use chrono::NaiveTime; + use std::net::IpAddr; + use std::net::Ipv4Addr; + use std::net::Ipv6Addr; + + rule _ = quiet!{[' ' | '\n' | '\t']+} / expected!("whitespace") + + // Parse boolean literals. + rule true_literal() -> bool = "true" { true } + rule false_literal() -> bool = "false" { false } + pub(super) rule boolean_literal_impl() -> bool + = quiet! { true_literal() / false_literal() } / expected!("boolean literal") + + pub rule boolean_literal() -> Literal + = b:boolean_literal_impl() { Literal::Boolean(b) } + + // Parse duration literals. 
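+        //
+        // For example, `5m` parses as five minutes and `100ms` as one hundred
+        // milliseconds; the full list of supported suffixes is documented on
+        // `duration_literal()` below.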
+        rule year() -> Duration
+            = "Y" { duration_consts::YEAR }
+        rule month() -> Duration
+            = "M" { duration_consts::MONTH }
+        rule week() -> Duration
+            = "w" { duration_consts::WEEK }
+        rule day() -> Duration
+            = "d" { duration_consts::DAY }
+        rule hour() -> Duration
+            = "h" { duration_consts::HOUR }
+        rule minute() -> Duration
+            = "m" { duration_consts::MINUTE }
+        rule second() -> Duration
+            = "s" { duration_consts::SECOND }
+        rule millisecond() -> Duration
+            = "ms" { duration_consts::MILLISECOND }
+        rule microsecond() -> Duration
+            = "us" { duration_consts::MICROSECOND }
+        rule nanosecond() -> Duration
+            = "ns" { duration_consts::NANOSECOND }
+        pub(super) rule duration_literal_impl() -> Duration
+            = count:integer_literal_impl() base:(
+                year() /
+                month() /
+                week() /
+                day() /
+                hour() /
+                millisecond() /
+                minute() /
+                second() /
+                microsecond() /
+                nanosecond()
+            )
+        {?
+            // NOTE: This count is the factor by which we multiply the base
+            // unit. So it counts the number of nanos, millis, or days, etc. It
+            // does not limit the total duration itself.
+            let Ok(count) = u32::try_from(count) else {
+                return Err("invalid count for duration literal");
+            };
+            base.checked_mul(count).ok_or("overflowed duration literal")
+        }
+
+        /// Parse a literal duration from a string.
+        ///
+        /// Durations are written as a positive integer multiple of a base time
+        /// unit. For example, `7s` is interpreted as 7 seconds. Supported units
+        /// are:
+        ///
+        /// - 'Y': an approximate year, 365 days
+        /// - 'M': an approximate month, 30 days
+        /// - 'w': an approximate week, 7 days
+        /// - 'd': a day, 24 hours
+        /// - 'h': an hour, 3600 seconds
+        /// - 'm': a minute, 60 seconds
+        /// - 's': seconds
+        /// - 'ms': milliseconds
+        /// - 'us': microseconds
+        /// - 'ns': nanoseconds
+        pub rule duration_literal() -> Literal
+            = d:duration_literal_impl() { Literal::Duration(d) }
+
+        /// Parse a literal timestamp.
+        ///
+        /// Timestamps are literals prefixed with `@`. They can be in one of
+        /// several formats:
+        ///
+        /// - YYYY-MM-DD
+        /// - HH:MM:SS[.f]
+        /// - RFC 3339, `YYYY-MM-DDTHH:MM:SS.f`
+        /// - The literal `now()`, possibly with some simple offset expression,
+        ///   such as `now() - 5m`. The offset must be a duration.
+        ///
+        /// All timestamps are in UTC.
+        pub rule timestamp_literal() -> Literal
+            = t:timestamp_literal_impl() { Literal::Timestamp(t) }
+
+        rule timestamp_literal_impl() -> DateTime<Utc>
+            = timestamp_string()
+            / now_timestamp()
+
+        pub(super) rule timestamp_string() -> DateTime<Utc>
+            = "@" s:$(['0'..='9' | '-' | 'T' | ':' | '.']+)
+        {?
+            if let Ok(t) = NaiveDate::parse_from_str(s, "%F") {
+                return Ok(t.and_hms_opt(0, 0, 0).unwrap().and_utc());
+            }
+            if let Ok(t) = NaiveTime::parse_from_str(s, "%H:%M:%S%.f") {
+                return Ok(NaiveDateTime::new(Utc::now().date_naive(), t).and_utc());
+            }
+            if let Ok(t) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.f") {
+                return Ok(t.and_utc());
+            }
+            Err("a recognized timestamp format")
+        }
+
+        rule now_offset() -> (bool, Duration)
+            = _? sign:['+' | '-'] _? dur:duration_literal_impl()
+        {
+            let negative = matches!(sign, '-');
+            (negative, dur)
+        }
+
+        pub(super) rule now_timestamp() -> DateTime<Utc>
+            = "@now()" maybe_offset:now_offset()?
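+        // For example, `@now() - 5m` evaluates to five minutes before the
+        // moment the query is parsed.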
+ { + let now = Utc::now(); + if let Some((negative, offset)) = maybe_offset { + if negative { + now - offset + } else { + now + offset + } + } else { + now + } + } + + /// Parse an IP address literal, either IPv4 or IPv6 + pub rule ip_literal() -> Literal + = ip:ipv4_literal() { Literal::IpAddr(IpAddr::V4(ip)) } + / ip:ipv6_literal() { Literal::IpAddr(IpAddr::V6(ip)) } + + pub(super) rule ipv4_literal() -> Ipv4Addr + = "\"" s:$((['0'..='9']*<1,3>)**<4> ".") "\"" + {? + s.parse().map_err(|_| "an IPv4 address") + } + + pub(super) rule ipv6_literal() -> Ipv6Addr + = "\"" s:$(['a'..='f' | '0'..='9' | ':']+) "\"" + {? + s.parse().map_err(|_| "an IPv6 address") + } + + rule dashed_uuid_literal() -> Uuid + = s:$( + "\"" + ['a'..='f' | '0'..='9']*<8> "-" + ['a'..='f' | '0'..='9']*<4> "-" + ['a'..='f' | '0'..='9']*<4> "-" + ['a'..='f' | '0'..='9']*<4> "-" + ['a'..='f' | '0'..='9']*<12> + "\"" + ) {? + let Some(middle) = s.get(1..37) else { + return Err("invalid UUID literal"); + }; + middle.parse().or(Err("invalid UUID literal")) + } + rule undashed_uuid_literal() -> Uuid + = s:$("\"" ['a'..='f' | '0'..='9']*<32> "\"") {? + let Some(middle) = s.get(1..33) else { + return Err("invalid UUID literal"); + }; + middle.parse().or(Err("invalid UUID literal")) + } + pub(super) rule uuid_literal_impl() -> Uuid + = dashed_uuid_literal() / undashed_uuid_literal() + + /// Parse UUID literals. + /// + /// UUIDs should be quoted with `"` and can include or omit dashes + /// between the segments. Both of the following are equivalent. + /// + /// "fc59ab26-f1d8-44ca-abbc-dd8f61321433" + /// "fc59ab26f1d844caabbcdd8f61321433" + pub rule uuid_literal() -> Literal + = id:uuid_literal_impl() { Literal::Uuid(id) } + + // Parse string literals. + rule any_but_single_quote() -> String + = s:$([^'\'']*) + {? + recognize_escape_sequences(s).ok_or("invalid single quoted string") + } + + rule any_but_double_quote() -> String + = s:$([^'"']*) + {? + recognize_escape_sequences(s).ok_or("invalid double quoted string") + } + + rule single_quoted_string() -> String + = "'" s:any_but_single_quote() "'" { s } + + rule double_quoted_string() -> String + = "\"" s:any_but_double_quote() "\"" { s } + + pub(super) rule string_literal_impl() -> String + = single_quoted_string() / double_quoted_string() + + /// Parse a string literal, either single- or double-quoted. + /// + /// Parsing string literals is pretty tricky, but we add several + /// constraints to simplify things. First strings must be quoted, either + /// with single- or double-quotes. E.g., the strings `"this"` and + /// `'this'` parse the same way. + /// + /// We require that the string not _contain_ its quote-style, so there + /// can't be any embedded single-quotes in a single-quoted string, or + /// double-quotes in a double-quoted string. Each quote-style may contain + /// the quote from the other style. + /// + /// We support the following common escape sequences: + /// + /// ```ignore + /// \n + /// \r + /// \t + /// \\ + /// \0 + /// ``` + /// + /// Beyond this, any valid Unicode code point, written in the usual Rust + /// style, is supported. For example, `\u{1234}` is accepted and mapped + /// to `ሴ` upon parsing. This also allows users to write both quote + /// styles if required, by writing them as their Unicode escape + /// sequences. For example, this string: + /// + /// ```ignore + /// "this string has \u{22} in it" + /// ``` + /// + /// Will be parsed as `this string has " in it`. 
+ pub rule string_literal() -> Literal + = s:string_literal_impl() { Literal::String(s) } + + pub(super) rule integer_literal_impl() -> i128 + = n:$("-"? ['0'..='9']+ !['e' | 'E' | '.']) + {? + let Ok(x) = n.parse() else { + return Err("integer literal"); + }; + if x < i128::from(i64::MIN) { + Err("negative overflow") + } else if x > i128::from(u64::MAX) { + Err("positive overflow") + } else { + Ok(x) + } + } + + /// Parse integer literals. + pub rule integer_literal() -> Literal + = n:integer_literal_impl() { Literal::Integer(n) } + + // We're being a bit lazy here, since the rule expression isn't exactly + // right. But we rely on calling `f64`'s `FromStr` implementation to + // actually verify the values can be parsed. + pub(super) rule double_literal_impl() -> f64 + = n:$("-"? ['0'..='9']* "."? ['0'..='9']* (['e' | 'E'] "-"? ['0'..='9']+)*) {? + n.parse().or(Err("double literal")) + } + + // Parse double literals. + pub rule double_literal() -> Literal + = d:double_literal_impl() { Literal::Double(d) } + + /// Parse a literal. + /// + /// Literals are typed, with support for bools, durations, integers and + /// doubles, UUIDs, and general strings. See the rules for each type of + /// literal for details on supported formats. + pub rule literal() -> Literal + = lit:( + boolean_literal() / + duration_literal() / + integer_literal() / + double_literal() / + uuid_literal() / + ip_literal() / + string_literal() / + timestamp_literal() + ) + { + lit + } + + /// Parse a logical operator. + pub(super) rule logical_op_impl() -> LogicalOp + = "||" { LogicalOp::Or} + / "&&" { LogicalOp::And } + / "^" { LogicalOp::Xor } + + + // NOTES: + // + // The rules below are all used to parse a filtering expression. This + // turns out to be surprisingly complicated to express succinctly in + // `peg`, but there are a few tricks. First, it's important that we do + // not try to parse negation ("!") inside the filtering atoms -- it's a + // higher-level concept, and not part of the atom itself. + // + // Second, it's not clear how to use `peg`'s precendence macro to + // correctly describe the precedence. Things are recursive, but we + // choose to define that in the rules themselves, rather than explicitly + // with precedence levels. This is common in PEG definitions, and the + // main trick is force things _not_ to be left-recursive, and use two + // rules tried in sequence. The `factor` rule is a good example of this. + // + // Another example is the logical OR / AND / XOR parsing. We start with + // OR, which is the lowest precedence, and move to the others in + // sequence. Each is defined as parsing either the "thing itself", e.g., + // `foo || bar` for the OR rule; or the rule with next-higher + // precedence. + // + // IMPORTANT: The #[cache] directives on the rules below are _critical_ + // to avoiding wildly exponential runtime with nested expressions. + + /// Parse a logical negation + pub rule not() = "!" + + /// A factor is a logically negated expression, or a primary expression. + #[cache] + pub rule factor() -> Filter + = not() _? factor:factor() + { + Filter { + negated: !factor.negated, + expr: factor.expr + } + } + / p:primary() { p } + + /// A primary expression is either a comparison "atom", e.g., `foo == + /// "bar"`, or a grouping around a sequence of such things. + #[cache] + pub rule primary() -> Filter + = atom:comparison_atom() + {? 
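+            // The regex-match operator only makes sense against string
+            // literals, so something like `foo ~= 4` is rejected here.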
+ if matches!(atom.cmp, Comparison::Like) && !matches!(atom.value, Literal::String(_)) { + Err("~= comparison is only supported for string literals") + } else { + Ok(Filter { negated: false, expr: FilterExpr::Simple(atom) }) + } + } + / "(" _? or:logical_or_expr() _? ")" { or } + + /// A comparison atom is a base-case for all this recursion. + /// + /// It specifies a single comparison between an identifier and a value, + /// using a specific comparison operator. For example, this parses `foo + /// == "bar"`. + pub rule comparison_atom() -> SimpleFilter + = ident:ident() _? cmp:comparison() _? value:literal() + { + SimpleFilter { ident, cmp, value } + } + + /// Two filtering expressions combined with a logical OR. + /// + /// An OR expression is two logical ANDs joined with "||", or just a + /// bare logical AND expression. + #[cache] + pub rule logical_or_expr() -> Filter + = left:logical_and_expr() _? "||" _? right:logical_or_expr() + { + let compound = CompoundFilter { + left: Box::new(left), + op: LogicalOp::Or, + right: Box::new(right), + }; + Filter { negated: false, expr: FilterExpr::Compound(compound) } + } + / logical_and_expr() + + /// Two filtering expressions combined with a logical AND. + /// + /// A logical AND expression is two logical XORs joined with "&&", or + /// just a bare logical XOR expression. + #[cache] + pub rule logical_and_expr() -> Filter + = left:logical_xor_expr() _? "&&" _? right:logical_and_expr() + { + let compound = CompoundFilter { + left: Box::new(left), + op: LogicalOp::And, + right: Box::new(right), + }; + Filter { negated: false, expr: FilterExpr::Compound(compound) } + } + / logical_xor_expr() + + /// Two filtering expressions combined with a logical XOR. + /// + /// A logical XOR expression is two logical XORs joined with "^ or + /// just a bare factor. Note that this either hits the base case, if + /// `factor` is actually an atom, or recurses again if its a logical OR + /// expression. + /// + /// Note that this is the highest-precedence logical operator. + #[cache] + pub rule logical_xor_expr() -> Filter + = left:factor() _? "^" _? right:logical_xor_expr() + { + let compound = CompoundFilter { + left: Box::new(left), + op: LogicalOp::Xor, + right: Box::new(right), + }; + Filter { negated: false, expr: FilterExpr::Compound(compound) } + } + / factor:factor() { factor } + + /// Parse the _logical expression_ part of a `filter` table operation. + pub rule filter_expr() -> Filter = logical_or_expr() + + /// Parse a "filter" table operation. + pub rule filter() -> Filter + = "filter" _ expr:filter_expr() _? + { + expr + } + + pub(super) rule ident_impl() -> &'input str + = quiet!{ inner:$(['a'..='z']+ ['a'..='z' | '0'..='9']* ("_" ['a'..='z' | '0'..='9']+)*) } / + expected!("A valid identifier") + + /// Parse an identifier, usually a column name. + pub rule ident() -> Ident + = inner:ident_impl() { Ident(inner.to_string()) } + + pub(super) rule comparison() -> Comparison + = "==" { Comparison::Eq } + / "!=" { Comparison::Ne } + / ">=" { Comparison::Ge } + / ">" { Comparison::Gt } + / "<=" { Comparison::Le } + / "<" { Comparison::Lt } + / "~=" { Comparison::Like } + + pub rule timeseries_name() -> TimeseriesName + = target_name:ident_impl() ":" metric_name:ident_impl() + {? + format!("{target_name}:{metric_name}") + .try_into() + .map_err(|_| "invalid timeseries name") + } + + rule get_delim() = quiet!{ _? "," _? } + + /// Parse a "get" table operation. 
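+        ///
+        /// For example, `get foo:bar` selects the timeseries named `foo:bar`;
+        /// more than one comma-separated name may also be listed (names here
+        /// are illustrative, matching those used in the tests below).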
+ pub rule get() -> Vec + = "get" _ names:(timeseries_name() **<1,> get_delim()) + { + names.into_iter().map(|t| Get { timeseries_name: t }).collect() + } + + /// Parse a reducing operation by name. + pub rule reducer() -> Reducer + = "mean" { Reducer::Mean } + / "sum" { Reducer::Sum } + / expected!("a reducer name") + + rule ws_with_comma() = _? "," _? + pub rule group_by() -> GroupBy + = "group_by" + _ + "[" _? identifiers:(ident() ** ws_with_comma()) ","? _? "]" + reducer:("," _? red:reducer() { red })? + { + GroupBy { + identifiers, + reducer: reducer.unwrap_or_default(), + } + } + + /// Parse a `join` table operation. + pub rule join() = "join" {} + + pub(super) rule alignment_method() -> AlignmentMethod + = "interpolate" { AlignmentMethod::Interpolate } + / "mean_within" { AlignmentMethod::MeanWithin } + + /// Parse an alignment table operation. + pub rule align() -> Align + = "align" _ method:alignment_method() "(" period:duration_literal_impl() ")" + { + Align { method, period } + } + + pub(super) rule basic_table_op() -> TableOp + = g:"get" _ t:timeseries_name() { TableOp::Basic(BasicTableOp::Get(t)) } + / f:filter() { TableOp::Basic(BasicTableOp::Filter(f)) } + / g:group_by() { TableOp::Basic(BasicTableOp::GroupBy(g)) } + / join() { TableOp::Basic(BasicTableOp::Join(Join)) } + / a:align() { TableOp::Basic(BasicTableOp::Align(a)) } + + pub(super) rule grouped_table_op() -> TableOp + = "{" _? ops:(query() ++ grouped_table_op_delim()) _? "}" + { + TableOp::Grouped(GroupedTableOp { ops }) + } + + /// Parse a top-level OxQL query. + /// + /// Queries always start with a "get" operation, and may be followed by + /// any number of other timeseries transformations + pub rule query() -> Query + = ops:(basic_table_op() / grouped_table_op()) ++ query_delim() + {? + let query = Query { ops }; + if query.all_gets_at_query_start() { + Ok(query) + } else { + Err("every subquery must start with a `get` operation") + } + } + + rule grouped_table_op_delim() = quiet!{ _? ";" _? } + rule query_delim() = quiet!{ _? "|" _? } + } +} + +// Recognize escape sequences and convert them into the intended Unicode point +// they represent. +// +// For example, the string containing ASCII "abcd" is returned unchanged. +// +// The string containing "\u{1234}" is returned as the string "ሴ". Note that the +// Unicode bytes must be enclosed in {}, and can have length 1-6. +// +// If the string contains an invalid escape sequence, such as "\uFFFF", or a +// control code, such as `\u07`, `None` is returned. +// +// Note that the main goal of this method is to _unescape_ relevant sequences. +// We will get queries that may contain escaped sequences, like `\\\n`, which +// this method will unescape to `\n`. +fn recognize_escape_sequences(s: &str) -> Option { + let mut out = String::with_capacity(s.len()); + + let mut chars = s.chars().peekable(); + while let Some(ch) = chars.next() { + match ch { + '\\' => { + let Some(next_ch) = chars.next() else { + // Escape at the end of the string + return None; + }; + match next_ch { + 'n' => out.push('\n'), + 'r' => out.push('\r'), + 't' => out.push('\t'), + '\\' => out.push('\\'), + '0' => out.push('\0'), + 'u' => { + // We need this to be delimited by {}, and between 1 and + // 6 characters long. + if !matches!(chars.next(), Some('{')) { + return None; + } + + let mut digits = String::with_capacity(6); + let mut found_closing_brace = false; + while !found_closing_brace && digits.len() < 7 { + // Take the next value, if it's a hex digit or the + // closing brace. 
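+                            // For example, for the input `\u{1f600}` this
+                            // collects the digits `1f600` and then stops at
+                            // the closing brace (illustrative input).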
+ let Some(next) = chars.next_if(|ch| { + ch.is_ascii_hexdigit() || *ch == '}' + }) else { + break; + }; + if next.is_ascii_hexdigit() { + digits.push(next); + continue; + } + found_closing_brace = true; + } + if !found_closing_brace { + return None; + } + let val = u32::from_str_radix(&digits, 16).ok()?; + let decoded = char::from_u32(val)?; + out.push(decoded) + } + _ => return None, + } + } + _ => out.push(ch), + } + } + Some(out) +} + +#[cfg(test)] +mod tests { + use super::query_parser; + use crate::oxql::ast::cmp::Comparison; + use crate::oxql::ast::grammar::recognize_escape_sequences; + use crate::oxql::ast::ident::Ident; + use crate::oxql::ast::literal::Literal; + use crate::oxql::ast::logical_op::LogicalOp; + use crate::oxql::ast::table_ops::align::Align; + use crate::oxql::ast::table_ops::align::AlignmentMethod; + use crate::oxql::ast::table_ops::filter::CompoundFilter; + use crate::oxql::ast::table_ops::filter::Filter; + use crate::oxql::ast::table_ops::filter::FilterExpr; + use crate::oxql::ast::table_ops::filter::SimpleFilter; + use crate::oxql::ast::table_ops::group_by::Reducer; + use chrono::DateTime; + use chrono::NaiveDate; + use chrono::NaiveDateTime; + use chrono::NaiveTime; + use chrono::TimeZone; + use chrono::Utc; + use std::net::IpAddr; + use std::net::Ipv4Addr; + use std::net::Ipv6Addr; + use std::time::Duration; + use uuid::Uuid; + + #[test] + fn test_boolean_literal() { + assert_eq!(query_parser::boolean_literal_impl("true").unwrap(), true); + assert_eq!(query_parser::boolean_literal_impl("false").unwrap(), false); + } + + #[test] + fn test_duration_literal() { + for (as_str, dur) in [ + ("7Y", Duration::from_secs(60 * 60 * 24 * 365 * 7)), + ("7M", Duration::from_secs(60 * 60 * 24 * 30 * 7)), + ("7w", Duration::from_secs(60 * 60 * 24 * 7 * 7)), + ("7d", Duration::from_secs(60 * 60 * 24 * 7)), + ("7h", Duration::from_secs(60 * 60 * 7)), + ("7m", Duration::from_secs(60 * 7)), + ("7s", Duration::from_secs(7)), + ("7ms", Duration::from_millis(7)), + ("7us", Duration::from_micros(7)), + ("7ns", Duration::from_nanos(7)), + ] { + assert_eq!( + query_parser::duration_literal_impl(as_str).unwrap(), + dur + ); + } + + assert!(query_parser::duration_literal_impl("-1m").is_err()); + let too_big: i64 = u32::MAX as i64 + 1; + assert!(query_parser::duration_literal_impl(&format!("{too_big}s")) + .is_err()); + } + + #[test] + fn test_uuid_literal() { + const ID: Uuid = uuid::uuid!("9f8900bd-886d-4988-b623-95b7fda36d23"); + let as_string = format!("\"{}\"", ID); + assert_eq!(query_parser::uuid_literal_impl(&as_string).unwrap(), ID); + let without_dashes = as_string.replace('-', ""); + assert_eq!( + query_parser::uuid_literal_impl(&without_dashes).unwrap(), + ID + ); + + assert!(query_parser::uuid_literal_impl( + &as_string[1..as_string.len() - 2] + ) + .is_err()); + assert!(query_parser::uuid_literal_impl( + &without_dashes[1..without_dashes.len() - 2] + ) + .is_err()); + } + + #[test] + fn test_integer_literal() { + assert_eq!(query_parser::integer_literal_impl("1").unwrap(), 1); + assert_eq!(query_parser::integer_literal_impl("-1").unwrap(), -1); + assert_eq!(query_parser::integer_literal_impl("-1").unwrap(), -1); + + assert!(query_parser::integer_literal_impl("-1.0").is_err()); + assert!(query_parser::integer_literal_impl("-1.").is_err()); + assert!(query_parser::integer_literal_impl("1e3").is_err()); + } + + #[test] + fn test_double_literal() { + assert_eq!(query_parser::double_literal_impl("1.0").unwrap(), 1.0); + assert_eq!(query_parser::double_literal_impl("-1.0").unwrap(), 
 -1.0);
+        assert_eq!(query_parser::double_literal_impl("1.").unwrap(), 1.0);
+        assert_eq!(query_parser::double_literal_impl("-1.").unwrap(), -1.0);
+        assert_eq!(query_parser::double_literal_impl(".5").unwrap(), 0.5);
+        assert_eq!(query_parser::double_literal_impl("-.5").unwrap(), -0.5);
+        assert_eq!(query_parser::double_literal_impl("1e3").unwrap(), 1e3);
+        assert_eq!(query_parser::double_literal_impl("-1e3").unwrap(), -1e3);
+        assert_eq!(query_parser::double_literal_impl("-1e-3").unwrap(), -1e-3);
+        assert_eq!(
+            query_parser::double_literal_impl("0.5e-3").unwrap(),
+            0.5e-3
+        );
+
+        assert!(query_parser::double_literal_impl("-.e4").is_err());
+        assert!(query_parser::double_literal_impl("-.e-4").is_err());
+        assert!(query_parser::double_literal_impl("1e").is_err());
+    }
+
+    #[test]
+    fn test_recognize_escape_sequences_with_none() {
+        for each in ["", "abc", "$%("] {
+            assert_eq!(recognize_escape_sequences(each).unwrap(), each);
+        }
+    }
+
+    #[test]
+    fn test_recognize_escape_sequence_with_valid_unicode_sequence() {
+        // Welp, let's just test every possible code point.
+        for x in 0..=0x10FFFF {
+            let expected = char::from_u32(x);
+            let as_hex = format!("{x:0x}");
+            let sequence = format!("\\u{{{as_hex}}}");
+            let recognized = recognize_escape_sequences(&sequence)
+                .map(|s| s.chars().next().unwrap());
+            assert_eq!(
+                expected, recognized,
+                "did not correctly recognize the Unicode escape sequence"
+            );
+        }
+    }
+
+    #[test]
+    fn test_recognize_escape_sequences_with_invalid_unicode_sequence() {
+        for each in [
+            r#"\uFFFF"#, // Valid, but not using {} delimiters
+            r#"\u{}"#, // Not enough characters.
+            r#"\u{12345678}"#, // Too many characters
+            r#"\u{ZZZZ}"#, // Not hex digits
+            r#"\u{d800}"#, // A surrogate code point, not valid.
+            r#"\u{1234"#, // Valid, but missing closing brace.
+ ] { + println!("{each}"); + assert!(recognize_escape_sequences(each).is_none()); + } + } + + #[test] + fn test_recognize_escape_sequences_with_valid_escape_sequence() { + for (as_str, expected) in [ + (r#"\n"#, '\n'), + (r#"\r"#, '\r'), + (r#"\t"#, '\t'), + (r#"\0"#, '\0'), + (r#"\\"#, '\\'), + ] { + let recognized = recognize_escape_sequences(as_str).unwrap(); + assert_eq!(recognized.chars().next().unwrap(), expected); + } + } + + #[test] + fn test_single_quoted_string_literal() { + for (input, expected) in [ + ("''", String::new()), + ("'simple'", String::from("simple")), + ("'袈►♖'", String::from("袈►♖")), + (r#"'escapes \n handled'"#, String::from("escapes \n handled")), + (r#"'may contain " in it'"#, String::from("may contain \" in it")), + ( + r#"'may contain "\u{1234}" in it'"#, + String::from("may contain \"ሴ\" in it"), + ), + ] { + assert_eq!( + query_parser::string_literal_impl(input).unwrap(), + expected + ); + } + assert!(query_parser::string_literal_impl(r#"' cannot have ' in it'"#) + .is_err()); + } + + #[test] + fn test_double_quoted_string_literal() { + for (input, expected) in [ + ("\"\"", String::new()), + ("\"simple\"", String::from("simple")), + ("\"袈►♖\"", String::from("袈►♖")), + (r#""escapes \n handled""#, String::from("escapes \n handled")), + (r#""may contain ' in it""#, String::from("may contain ' in it")), + ( + r#""may contain '\u{1234}' in it""#, + String::from("may contain 'ሴ' in it"), + ), + ] { + assert_eq!( + query_parser::string_literal_impl(input).unwrap(), + expected + ); + } + + assert!(query_parser::string_literal_impl(r#"" cannot have " in it""#) + .is_err()); + } + + #[test] + fn test_comparison() { + for (as_str, cmp) in [ + ("==", Comparison::Eq), + ("!=", Comparison::Ne), + (">=", Comparison::Ge), + (">", Comparison::Gt), + ("<=", Comparison::Le), + ("<", Comparison::Lt), + ("~=", Comparison::Like), + ] { + assert_eq!(query_parser::comparison(as_str).unwrap(), cmp); + } + } + + #[test] + fn test_filter_expr_single_simple_expression() { + let expr = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("a".to_string()), + cmp: Comparison::Eq, + value: Literal::Boolean(true), + }), + }; + assert_eq!(query_parser::filter_expr("a == true").unwrap(), expr); + assert_eq!(query_parser::filter_expr("(a == true)").unwrap(), expr); + + assert!(query_parser::filter_expr("(a == true").is_err()); + } + + #[test] + fn test_filter_expr_single_negated_simple_expression() { + let expr = Filter { + negated: true, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("a".to_string()), + cmp: Comparison::Gt, + value: Literal::Double(1.0), + }), + }; + assert_eq!(query_parser::filter_expr("!(a > 1.)").unwrap(), expr,); + + assert!(query_parser::filter_expr("!(a > 1.0").is_err()); + } + + #[test] + fn test_filter_expr_two_simple_filter_expressions() { + let left = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("a".to_string()), + cmp: Comparison::Eq, + value: Literal::Boolean(true), + }), + }; + + for op in [LogicalOp::And, LogicalOp::Or] { + let expected = left.merge(&left, op); + // Match with either parenthesized. 
+ let as_str = format!("a == true {op} (a == true)"); + assert_eq!(query_parser::filter_expr(&as_str).unwrap(), expected); + let as_str = format!("(a == true) {op} a == true"); + assert_eq!(query_parser::filter_expr(&as_str).unwrap(), expected); + let as_str = format!("(a == true) {op} (a == true)"); + assert_eq!(query_parser::filter_expr(&as_str).unwrap(), expected); + } + } + + #[test] + fn test_filter_expr_operator_precedence() { + // We'll combine the following simple expression in a number of + // different sequences, to check that we correctly group by operator + // precedence. + let atom = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("a".to_string()), + cmp: Comparison::Eq, + value: Literal::Boolean(true), + }), + }; + let as_str = "a == true || a == true && a == true ^ a == true"; + let parsed = query_parser::filter_expr(as_str).unwrap(); + assert_eq!( + parsed.to_string(), + "((a == true) || ((a == true) && ((a == true) ^ (a == true))))" + ); + + // This should bind most tighty from right to left: XOR, then AND, then + // OR. Since we're destructuring from out to in, though, we check in the + // opposite order, weakest to strongest, or left to right. + // + // Start with OR, which should bind the most weakly. + assert!(!parsed.negated); + let FilterExpr::Compound(CompoundFilter { left, op, right }) = + parsed.expr + else { + unreachable!(); + }; + assert!(!left.negated); + assert!(!right.negated); + assert_eq!(op, LogicalOp::Or); + assert_eq!(atom, *left); + + // && should bind next-most tightly + let FilterExpr::Compound(CompoundFilter { left, op, right }) = + right.expr + else { + unreachable!(); + }; + assert!(!left.negated); + assert!(!right.negated); + assert_eq!(op, LogicalOp::And); + assert_eq!(atom, *left); + + // Followed by XOR, the tightest binding operator. + let FilterExpr::Compound(CompoundFilter { left, op, right }) = + right.expr + else { + unreachable!(); + }; + assert!(!left.negated); + assert!(!right.negated); + assert_eq!(op, LogicalOp::Xor); + assert_eq!(atom, *left); + assert_eq!(atom, *right); + } + + #[test] + fn test_filter_expr_overridden_precedence() { + // Similar to above, we'll test with a single atom, and group in a + // number of ways. + let atom = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("a".to_string()), + cmp: Comparison::Eq, + value: Literal::Boolean(true), + }), + }; + let as_str = "(a == true || a == true) && a == true"; + let parsed = query_parser::filter_expr(as_str).unwrap(); + + // Now, || should bind more tightly, so we should have (a && b) at the + // top-level, where b is the test atom. We're comparing the atom at the + // _right_ now with the original expressions. + assert!(!parsed.negated); + let FilterExpr::Compound(CompoundFilter { left, op, right }) = + parsed.expr + else { + unreachable!(); + }; + assert!(!left.negated); + assert!(!right.negated); + assert_eq!(op, LogicalOp::And); + assert_eq!(atom, *right); + + // Destructure the LHS and check it. 
+ let FilterExpr::Compound(CompoundFilter { left, op, right }) = + left.expr + else { + unreachable!(); + }; + assert!(!left.negated); + assert!(!right.negated); + assert_eq!(op, LogicalOp::Or); + assert_eq!(atom, *left); + assert_eq!(atom, *right); + } + + #[test] + fn test_negated_filter_expr() { + let left = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("a".into()), + cmp: Comparison::Eq, + value: Literal::Boolean(true), + }), + }; + let right = left.negate(); + let top = left.merge(&right, LogicalOp::Xor).negate(); + let as_str = "!(a == true ^ !(a == true))"; + let parsed = query_parser::filter_expr(as_str).unwrap(); + assert_eq!(top, parsed); + } + + #[test] + fn test_filter_table_op() { + for expr in [ + "filter field == 0", + "filter baz == 'quux'", + "filter other_field != 'yes'", + "filter id != \"45c937fb-5e99-4a86-a95b-22bf30bf1507\"", + "filter (foo == 'bar') || ((yes != \"no\") && !(maybe > 'so'))", + ] { + let parsed = query_parser::filter(expr).unwrap_or_else(|_| { + panic!("failed to parse query: '{}'", expr) + }); + println!("{parsed:#?}"); + } + } + + #[test] + fn test_get_table_op() { + for expr in [ + "get foo:bar", + "get target_name:metric_name", + "get target_name_0:metric_name000", + ] { + let parsed = query_parser::get(expr).unwrap_or_else(|_| { + panic!("failed to parse get expr: '{}'", expr) + }); + println!("{parsed:#?}"); + } + + assert!(query_parser::get("get foo").is_err()); + assert!(query_parser::get("get foo:").is_err()); + assert!(query_parser::get("get :bar").is_err()); + assert!(query_parser::get("get 0:0").is_err()); + } + + #[test] + fn test_ident() { + for id in ["foo", "foo0", "foo_0_1_2"] { + query_parser::ident(id) + .unwrap_or_else(|_| panic!("failed to identifier: '{id}'")); + } + + for id in ["0foo", "0", "A", "", "%", "foo_"] { + query_parser::ident(id).expect_err(&format!( + "should not have parsed as identifier: '{}'", + id + )); + } + } + + #[test] + fn test_group_by() { + for q in [ + "group_by []", + "group_by [baz]", + "group_by [baz,]", + "group_by [baz,another_field]", + "group_by [baz,another_field,]", + ] { + let parsed = query_parser::group_by(q) + .unwrap_or_else(|_| panic!("failed to parse group_by: '{q}'")); + println!("{parsed:#?}"); + } + } + + #[test] + fn test_query() { + for q in [ + "get foo:bar", + "get foo:bar | group_by []", + "get foo:bar | group_by [baz]", + "get foo:bar | filter baz == 'quuz'", + "get foo:bar | filter (some == 0) && (id == false || a == -1.0)", + "get foo:bar | group_by [baz] | filter baz == 'yo'", + "{ get foo:bar | filter x == 0; get x:y } | join", + "{ get foo:bar ; get x:y } | join | filter baz == 0", + "get foo:bar | align interpolate(10s)", + ] { + let parsed = query_parser::query(q) + .unwrap_or_else(|_| panic!("failed to parse query: '{q}'")); + println!("{parsed:#?}"); + } + } + + #[test] + fn test_reducer() { + assert_eq!(query_parser::reducer("mean").unwrap(), Reducer::Mean); + assert!(query_parser::reducer("foo").is_err()); + } + + #[test] + fn test_parse_literal_timestamp_string() { + assert_eq!( + query_parser::timestamp_string("@2020-01-01").unwrap(), + Utc.with_ymd_and_hms(2020, 1, 1, 0, 0, 0).unwrap(), + ); + assert_eq!( + query_parser::timestamp_string("@01:01:01").unwrap().time(), + NaiveTime::from_hms_opt(1, 1, 1).unwrap(), + ); + assert_eq!( + query_parser::timestamp_string("@01:01:01.123456").unwrap().time(), + NaiveTime::from_hms_micro_opt(1, 1, 1, 123456).unwrap(), + ); + assert_eq!( + 
query_parser::timestamp_string("@2020-01-01T01:01:01.123456") + .unwrap(), + NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 1, 1).unwrap(), + NaiveTime::from_hms_micro_opt(1, 1, 1, 123456).unwrap(), + ) + .and_utc(), + ); + } + + #[test] + fn test_parse_ipv4_literal() { + let check = |s: &str, addr: IpAddr| { + let Literal::IpAddr(ip) = query_parser::ip_literal(s).unwrap() + else { + panic!("expected '{}' to be parsed into {}", s, addr); + }; + assert_eq!(ip, addr); + }; + check("\"100.100.100.100\"", Ipv4Addr::new(100, 100, 100, 100).into()); + check("\"1.2.3.4\"", Ipv4Addr::new(1, 2, 3, 4).into()); + check("\"0.0.0.0\"", Ipv4Addr::UNSPECIFIED.into()); + + assert!(query_parser::ip_literal("\"abcd\"").is_err()); + assert!(query_parser::ip_literal("\"1.1.1.\"").is_err()); + assert!(query_parser::ip_literal("\"1.1.1.1.1.1\"").is_err()); + assert!(query_parser::ip_literal("\"2555.1.1.1\"").is_err()); + assert!(query_parser::ip_literal("1.2.3.4").is_err()); // no quotes + } + + #[test] + fn test_parse_ipv6_literal() { + let check = |s: &str, addr: IpAddr| { + let Literal::IpAddr(ip) = query_parser::ip_literal(s).unwrap() + else { + panic!("expected '{}' to be parsed into {}", s, addr); + }; + assert_eq!(ip, addr); + }; + + // IPv6 is nuts, let's just check a few common patterns. + check("\"::1\"", Ipv6Addr::LOCALHOST.into()); + check("\"::\"", Ipv6Addr::UNSPECIFIED.into()); + check("\"fd00::1\"", Ipv6Addr::new(0xfd00, 0, 0, 0, 0, 0, 0, 1).into()); + check( + "\"fd00:1:2:3:4:5:6:7\"", + Ipv6Addr::new(0xfd00, 1, 2, 3, 4, 5, 6, 7).into(), + ); + + // Don't currently support IPv6-mapped IPv4 addresses + assert!(query_parser::ip_literal("\"::ffff:127.0.0.1\"").is_err()); + + // Other obviously bad patterns. + assert!(query_parser::ip_literal("\"1\"").is_err()); + assert!(query_parser::ip_literal("\":1::1::1\"").is_err()); + assert!(query_parser::ip_literal("\"::g\"").is_err()); + assert!(query_parser::ip_literal("\":::\"").is_err()); + assert!(query_parser::ip_literal("::1").is_err()); // no quotes + } + + #[test] + fn test_query_starts_with_get() { + assert!(query_parser::query("{ get a:b }") + .unwrap() + .all_gets_at_query_start()); + assert!(query_parser::query("{ get a:b; get a:b } | join") + .unwrap() + .all_gets_at_query_start()); + assert!(query_parser::query( + "{ { get a:b ; get a:b } | join; get c:d } | join" + ) + .unwrap() + .all_gets_at_query_start()); + + assert!(query_parser::query("{ get a:b; filter foo == 0 }").is_err()); + assert!(query_parser::query("{ get a:b; filter foo == 0 }").is_err()); + assert!(query_parser::query("get a:b | get a:b").is_err()); + } + + #[test] + fn test_now_with_offset() { + fn check(expr: &str, expected: DateTime) { + // Rough but still-useful bound in microseconds. 
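+            //
+            // (Presumably `@now()` is resolved by the parser with its own call
+            // to `Utc::now()`, a short time after `expected` is computed here,
+            // so the two timestamps always differ by a small amount.)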
+ const MAX_DIFF_IN_MICROS: i64 = 1000; + let d = query_parser::now_timestamp(expr).unwrap(); + let now = Utc::now(); + let micros = d.timestamp_micros() - expected.timestamp_micros(); + assert!( + micros.abs() <= MAX_DIFF_IN_MICROS, + "Expected `{}` to be within {}us of {}, but it is {}us away", + expr, + MAX_DIFF_IN_MICROS, + now, + micros, + ); + } + check("@now() - 5m", Utc::now() - Duration::from_secs(60 * 5)); + check("@now() + 5m", Utc::now() + Duration::from_secs(60 * 5)); + check("@now() - 5s", Utc::now() - Duration::from_secs(5)); + check("@now() + 5s", Utc::now() + Duration::from_secs(5)); + check("@now() - 1d", Utc::now() - Duration::from_secs(60 * 60 * 24)); + check("@now() + 1d", Utc::now() + Duration::from_secs(60 * 60 * 24)); + } + + #[test] + fn test_like_only_available_for_strings() { + assert!(query_parser::filter_expr("foo ~= 0").is_err()); + assert!(query_parser::filter_expr("foo ~= \"something\"").is_ok()); + } + + #[test] + fn test_align_table_op() { + assert_eq!( + query_parser::align("align interpolate(1m)").unwrap(), + Align { + method: AlignmentMethod::Interpolate, + period: Duration::from_secs(60) + } + ); + assert_eq!( + query_parser::align("align mean_within(100s)").unwrap(), + Align { + method: AlignmentMethod::MeanWithin, + period: Duration::from_secs(100) + } + ); + + assert!(query_parser::align("align whatever(100s)").is_err()); + assert!(query_parser::align("align interpolate('foo')").is_err()); + } + + #[test] + fn test_complicated_logical_combinations() { + let parsed = + query_parser::logical_or_expr("a == 'b' ^ !(c == 0) && d == false") + .unwrap(); + + // Build up this expected expression from its components. + let left = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("a".to_string()), + cmp: Comparison::Eq, + value: Literal::String("b".into()), + }), + }; + let middle = Filter { + negated: true, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("c".to_string()), + cmp: Comparison::Eq, + value: Literal::Integer(0), + }), + }; + let right = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("d".to_string()), + cmp: Comparison::Eq, + value: Literal::Boolean(false), + }), + }; + + // The left and right are bound most tightly, by the XOR operator. + let xor = Filter { + negated: false, + expr: FilterExpr::Compound(CompoundFilter { + left: Box::new(left), + op: LogicalOp::Xor, + right: Box::new(middle), + }), + }; + + // And then those two together are joined with the AND. + let expected = Filter { + negated: false, + expr: FilterExpr::Compound(CompoundFilter { + left: Box::new(xor), + op: LogicalOp::And, + right: Box::new(right), + }), + }; + assert_eq!(parsed, expected); + } + + #[test] + fn test_multiple_negation() { + let negated = + query_parser::filter_expr("(a == 0) || !!!(a == 0 && a == 0)") + .unwrap(); + let expected = + query_parser::filter_expr("(a == 0) || !(a == 0 && a == 0)") + .unwrap(); + assert_eq!(negated, expected, "Failed to handle multiple negations"); + } +} diff --git a/oximeter/db/src/oxql/ast/ident.rs b/oximeter/db/src/oxql/ast/ident.rs new file mode 100644 index 0000000000..6fb2dab85a --- /dev/null +++ b/oximeter/db/src/oxql/ast/ident.rs @@ -0,0 +1,25 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! OxQL identifiers, such as column names. 
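+//!
+//! For example, in the table operation `filter hostname == "foo"`, the field
+//! name `hostname` is parsed into an `Ident`. (Illustrative; actual field
+//! names depend on the timeseries schema being queried.)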
+ +// Copyright 2024 Oxide Computer Company + +use std::fmt; + +/// An identifier, such as a column or function name. +#[derive(Clone, Debug, PartialEq)] +pub struct Ident(pub(in crate::oxql) String); + +impl Ident { + pub fn as_str(&self) -> &str { + self.0.as_str() + } +} + +impl fmt::Display for Ident { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.0) + } +} diff --git a/oximeter/db/src/oxql/ast/literal.rs b/oximeter/db/src/oxql/ast/literal.rs new file mode 100644 index 0000000000..33f3d81485 --- /dev/null +++ b/oximeter/db/src/oxql/ast/literal.rs @@ -0,0 +1,384 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! AST node for literal values. + +// Copyright 2024 Oxide Computer Company + +use crate::oxql::ast::cmp::Comparison; +use crate::oxql::Error; +use anyhow::Context; +use chrono::DateTime; +use chrono::Utc; +use oximeter::FieldType; +use oximeter::FieldValue; +use regex::Regex; +use std::fmt; +use std::net::IpAddr; +use std::time::Duration; +use uuid::Uuid; + +/// A literal value. +#[derive(Clone, Debug, PartialEq)] +pub enum Literal { + // TODO-performance: An i128 here is a bit gratuitous. + Integer(i128), + Double(f64), + String(String), + Boolean(bool), + Uuid(Uuid), + Duration(Duration), + Timestamp(DateTime), + IpAddr(IpAddr), +} + +impl Literal { + // Format the literal as a safe, typed string for ClickHouse. + pub(crate) fn as_db_safe_string(&self) -> String { + match self { + Literal::Integer(inner) => format!("{inner}"), + Literal::Double(inner) => format!("{inner}"), + Literal::String(inner) => format!("'{inner}'"), + Literal::Boolean(inner) => format!("{inner}"), + Literal::Uuid(inner) => format!("'{inner}'"), + Literal::Duration(inner) => { + let (count, interval) = duration_to_db_interval(inner); + format!("INTERVAL {} {}", count, interval) + } + Literal::Timestamp(inner) => { + format!("'{}'", inner.format(crate::DATABASE_TIMESTAMP_FORMAT)) + } + Literal::IpAddr(inner) => { + // NOTE: We store all IP addresses in ClickHouse as IPv6, with + // IPv4 addresses mapped to that. To run a comparison against a + // literal in Rust, we can use the value directly, since we + // decode it an convert to the right type during + // deserialization. But to compare in the DB itself, we need to + // do that with an IPv4-mapped IPv6 address. + // + // Helpfully, ClickHouse's `toIPv6` function takes a string of + // either family, and maps IPv4 into the IPv6 space, if needed. + format!("toIPv6('{inner}')") + } + } + } + + // Return true if this literal can be compared to a field of the provided + // type. + pub(crate) fn is_compatible_with_field( + &self, + field_type: FieldType, + ) -> bool { + match self { + Literal::Integer(_) => matches!( + field_type, + FieldType::U8 + | FieldType::I8 + | FieldType::U16 + | FieldType::I16 + | FieldType::U32 + | FieldType::I32 + | FieldType::U64 + | FieldType::I64 + ), + Literal::Double(_) => false, + Literal::String(_) => matches!(field_type, FieldType::String), + Literal::Boolean(_) => matches!(field_type, FieldType::Bool), + Literal::Uuid(_) => matches!(field_type, FieldType::Uuid), + Literal::Duration(_) => false, + Literal::Timestamp(_) => false, + Literal::IpAddr(_) => matches!(field_type, FieldType::IpAddr), + } + } + + /// Apply the comparison op between self and the provided field. 
+ /// + /// Return None if the comparison cannot be applied, either because the type + /// is not compatible or the comparison doesn't make sense. + pub(crate) fn compare_field( + &self, + value: &FieldValue, + cmp: Comparison, + ) -> Result, Error> { + anyhow::ensure!( + self.is_compatible_with_field(value.field_type()), + "Field value of type {} is cannot be compared to \ + the value in this filter", + value.field_type(), + ); + macro_rules! generate_cmp_match { + ($lhs:ident, $rhs:ident) => { + match cmp { + Comparison::Eq => Ok(Some($lhs == $rhs)), + Comparison::Ne => Ok(Some($lhs != $rhs)), + Comparison::Gt => Ok(Some($lhs > $rhs)), + Comparison::Ge => Ok(Some($lhs >= $rhs)), + Comparison::Lt => Ok(Some($lhs < $rhs)), + Comparison::Le => Ok(Some($lhs <= $rhs)), + Comparison::Like => Ok(None), + } + }; + } + // Filter expressions are currently written as ` + // `. That means the literal stored in `self` is the RHS of + // the comparison, and the field value passed in is the LHS. + match (value, self) { + (FieldValue::Bool(lhs), Literal::Boolean(rhs)) => { + generate_cmp_match!(rhs, lhs) + } + (FieldValue::String(lhs), Literal::String(rhs)) => match cmp { + Comparison::Eq => Ok(Some(lhs == rhs)), + Comparison::Ne => Ok(Some(lhs != rhs)), + Comparison::Gt => Ok(Some(lhs > rhs)), + Comparison::Ge => Ok(Some(lhs >= rhs)), + Comparison::Lt => Ok(Some(lhs < rhs)), + Comparison::Le => Ok(Some(lhs <= rhs)), + Comparison::Like => { + let re = Regex::new(rhs).context( + "failed to create regex for string matching", + )?; + Ok(Some(re.is_match(lhs))) + } + }, + (FieldValue::IpAddr(lhs), Literal::IpAddr(rhs)) => { + generate_cmp_match!(rhs, lhs) + } + (FieldValue::Uuid(lhs), Literal::Uuid(rhs)) => { + generate_cmp_match!(rhs, lhs) + } + (FieldValue::U8(lhs), Literal::Integer(rhs)) => { + let lhs = i128::from(*lhs); + let rhs = *rhs; + generate_cmp_match!(lhs, rhs) + } + (FieldValue::I8(lhs), Literal::Integer(rhs)) => { + let lhs = i128::from(*lhs); + let rhs = *rhs; + generate_cmp_match!(lhs, rhs) + } + (FieldValue::U16(lhs), Literal::Integer(rhs)) => { + let lhs = i128::from(*lhs); + let rhs = *rhs; + generate_cmp_match!(lhs, rhs) + } + (FieldValue::I16(lhs), Literal::Integer(rhs)) => { + let lhs = i128::from(*lhs); + let rhs = *rhs; + generate_cmp_match!(lhs, rhs) + } + (FieldValue::U32(lhs), Literal::Integer(rhs)) => { + let lhs = i128::from(*lhs); + let rhs = *rhs; + generate_cmp_match!(lhs, rhs) + } + (FieldValue::I32(lhs), Literal::Integer(rhs)) => { + let lhs = i128::from(*lhs); + let rhs = *rhs; + generate_cmp_match!(lhs, rhs) + } + (FieldValue::U64(lhs), Literal::Integer(rhs)) => { + let lhs = i128::from(*lhs); + let rhs = *rhs; + generate_cmp_match!(lhs, rhs) + } + (FieldValue::I64(lhs), Literal::Integer(rhs)) => { + let lhs = i128::from(*lhs); + let rhs = *rhs; + generate_cmp_match!(lhs, rhs) + } + (_, _) => unreachable!(), + } + } +} + +/// Duration constants used for interpreting duration literals. +/// +/// Many of the values here are **approximate**. For example, a "year" is always +/// 365 24-hour periods, regardless of leap years, the current time, or any +/// other context. +pub(crate) mod duration_consts { + use std::time::Duration; + + /// Approximately 1 year, 365 24-hour periods. + pub const YEAR: Duration = Duration::from_secs(60 * 60 * 24 * 365); + + /// Approximately 1 month, 30 24-hour periods. + pub const MONTH: Duration = Duration::from_secs(60 * 60 * 24 * 30); + + /// Approximately 1 week, 7 24-hour periods. 
+ pub const WEEK: Duration = Duration::from_secs(60 * 60 * 24 * 7); + + /// One day, equal to 24 hours. + pub const DAY: Duration = Duration::from_secs(60 * 60 * 24); + + /// An hour, exactly 3600 seconds. + pub const HOUR: Duration = Duration::from_secs(60 * 60); + + /// A minute, exactly 60 seconds. + pub const MINUTE: Duration = Duration::from_secs(60); + + /// One second. + pub const SECOND: Duration = Duration::from_secs(1); + + /// One millisecond, a thousandth of a second. + pub const MILLISECOND: Duration = Duration::from_millis(1); + + /// One microsecond, a millionth of a second. + pub const MICROSECOND: Duration = Duration::from_micros(1); + + /// One nanosecond, a billionth of a second. + pub const NANOSECOND: Duration = Duration::from_nanos(1); +} + +// Convert a duration into an appropriate interval for a database query. +// +// This converts the provided duration into the largest interval type for which +// the value is an integer. For example: +// +// `1us` -> (1, "MICROSECOND"), +// `3.4s` -> (3400, "MILLISECOND") +fn duration_to_db_interval(dur: &Duration) -> (u64, &'static str) { + fn as_whole_multiple(dur: &Duration, base: &Duration) -> Option { + let d = dur.as_nanos(); + let base = base.as_nanos(); + if d % base == 0 { + Some(u64::try_from(d / base).unwrap()) + } else { + None + } + } + use duration_consts::*; + const INTERVALS: [(Duration, &str); 10] = [ + (YEAR, "YEAR"), + (MONTH, "MONTH"), + (WEEK, "WEEK"), + (DAY, "DAY"), + (HOUR, "HOUR"), + (MINUTE, "MINUTE"), + (SECOND, "SECOND"), + (MILLISECOND, "MILLISECOND"), + (MICROSECOND, "MICROSECOND"), + (NANOSECOND, "NANOSECOND"), + ]; + for (base, interval) in &INTERVALS { + if let Some(count) = as_whole_multiple(dur, base) { + return (count, interval); + } + } + + // Durations must be a whole number of nanoseconds, so we will never fall + // past the last interval in the array above. 
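+    //
+    // For instance (illustration only): 1.5 minutes is 90_000_000_000 ns; it
+    // is not a whole multiple of MINUTE, but it is a whole multiple of SECOND,
+    // so the loop above returns (90, "SECOND") before reaching this point.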
+ unreachable!(); +} + +impl fmt::Display for Literal { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Literal::Integer(inner) => write!(f, "{inner}"), + Literal::Double(inner) => write!(f, "{inner}"), + Literal::String(inner) => write!(f, "{inner:?}"), + Literal::Boolean(inner) => write!(f, "{inner}"), + Literal::Uuid(inner) => write!(f, "\"{inner}\""), + Literal::Duration(inner) => write!(f, "{inner:?}"), + Literal::Timestamp(inner) => write!(f, "@{inner}"), + Literal::IpAddr(inner) => write!(f, "{inner}"), + } + } +} + +#[cfg(test)] +mod tests { + use super::duration_consts::*; + use super::duration_to_db_interval; + use super::Literal; + use crate::oxql::ast::cmp::Comparison; + use oximeter::FieldValue; + + #[test] + fn test_duration_to_db_interval() { + for base in [1_u32, 2, 3] { + let b = u64::from(base); + assert_eq!(duration_to_db_interval(&(base * YEAR)), (b, "YEAR")); + assert_eq!(duration_to_db_interval(&(base * MONTH)), (b, "MONTH")); + assert_eq!(duration_to_db_interval(&(base * WEEK)), (b, "WEEK")); + assert_eq!(duration_to_db_interval(&(base * DAY)), (b, "DAY")); + assert_eq!(duration_to_db_interval(&(base * HOUR)), (b, "HOUR")); + assert_eq!( + duration_to_db_interval(&(base * MINUTE)), + (b, "MINUTE") + ); + assert_eq!( + duration_to_db_interval(&(base * SECOND)), + (b, "SECOND") + ); + assert_eq!( + duration_to_db_interval(&(base * MILLISECOND)), + (b, "MILLISECOND") + ); + assert_eq!( + duration_to_db_interval(&(base * MICROSECOND)), + (b, "MICROSECOND") + ); + assert_eq!( + duration_to_db_interval(&(base * NANOSECOND)), + (b, "NANOSECOND") + ); + } + assert_eq!(duration_to_db_interval(&(YEAR / 2)), (4380, "HOUR")); + assert_eq!(duration_to_db_interval(&(HOUR / 60)), (1, "MINUTE")); + assert_eq!(duration_to_db_interval(&(HOUR / 10)), (6, "MINUTE")); + assert_eq!(duration_to_db_interval(&(HOUR / 12)), (5, "MINUTE")); + assert_eq!(duration_to_db_interval(&(HOUR / 120)), (30, "SECOND")); + assert_eq!(duration_to_db_interval(&(MINUTE / 2)), (30, "SECOND")); + assert_eq!(duration_to_db_interval(&(MINUTE / 10)), (6, "SECOND")); + assert_eq!( + duration_to_db_interval(&MINUTE.mul_f64(1.5)), + (90, "SECOND") + ); + assert_eq!( + duration_to_db_interval(&MICROSECOND.mul_f64(1.5)), + (1500, "NANOSECOND") + ); + assert_eq!( + duration_to_db_interval(&(YEAR + NANOSECOND)), + (31536000000000001, "NANOSECOND") + ); + } + + #[test] + fn test_literal_compare_field() { + let value = FieldValue::I64(3); + let lit = Literal::Integer(4); + + // The literal comparison would be written like: `field >= 4` where + // `field` has a value of 3 here. So the comparison is false. + assert_eq!( + lit.compare_field(&value, Comparison::Ge).unwrap(), + Some(false) + ); + + // Reversing this, we should have true. + assert_eq!( + lit.compare_field(&value, Comparison::Lt).unwrap(), + Some(true) + ); + + // It should not be equal. 
+ assert_eq!( + lit.compare_field(&value, Comparison::Eq).unwrap(), + Some(false) + ); + assert_eq!( + lit.compare_field(&value, Comparison::Ne).unwrap(), + Some(true) + ); + } + + #[test] + fn test_literal_compare_field_wrong_type() { + let value = FieldValue::String(String::from("foo")); + let lit = Literal::Integer(4); + assert!(lit.compare_field(&value, Comparison::Eq).is_err()); + } +} diff --git a/oximeter/db/src/oxql/ast/logical_op.rs b/oximeter/db/src/oxql/ast/logical_op.rs new file mode 100644 index 0000000000..60fc5d134f --- /dev/null +++ b/oximeter/db/src/oxql/ast/logical_op.rs @@ -0,0 +1,41 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! An AST node describing logical operators. + +// Copyright 2024 Oxide Computer Company + +use std::fmt; + +/// Logical operators. +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum LogicalOp { + And, + Or, + Xor, +} + +impl LogicalOp { + pub(crate) fn as_db_function_name(&self) -> &'static str { + match self { + LogicalOp::And => "and", + LogicalOp::Or => "or", + LogicalOp::Xor => "xor", + } + } +} + +impl fmt::Display for LogicalOp { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{}", + match self { + LogicalOp::And => "&&", + LogicalOp::Or => "||", + LogicalOp::Xor => "^", + } + ) + } +} diff --git a/oximeter/db/src/oxql/ast/mod.rs b/oximeter/db/src/oxql/ast/mod.rs new file mode 100644 index 0000000000..7037b74a7f --- /dev/null +++ b/oximeter/db/src/oxql/ast/mod.rs @@ -0,0 +1,152 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! AST for the Oximeter Query Language. + +// Copyright 2024 Oxide Computer Company + +use chrono::DateTime; +use chrono::Utc; +use oximeter::TimeseriesName; + +use self::table_ops::BasicTableOp; +use self::table_ops::GroupedTableOp; +use self::table_ops::TableOp; +pub mod cmp; +pub(super) mod grammar; +pub mod ident; +pub mod literal; +pub mod logical_op; +pub mod table_ops; + +/// An OxQL query. +#[derive(Clone, Debug, PartialEq)] +pub struct Query { + ops: Vec, +} + +impl Query { + // Return the first operation in the query, which is always a form of `get`. + fn first_op(&self) -> &TableOp { + self.ops.first().expect("Should have parsed at least 1 operation") + } + + pub(crate) fn timeseries_name(&self) -> &TimeseriesName { + match self.first_op() { + TableOp::Basic(BasicTableOp::Get(n)) => n, + TableOp::Basic(_) => unreachable!(), + TableOp::Grouped(GroupedTableOp { ops }) => { + ops.first().unwrap().timeseries_name() + } + } + } + + // Check that this query (and any subqueries) start with a get table op, and + // that there are no following get operations. I.e., we have: + // + // get ... | + // { get .. } | + // { get .. ; get .. } | + pub(crate) fn all_gets_at_query_start(&self) -> bool { + fn all_gets_at_query_start(ops: &[TableOp]) -> bool { + let (head, tail) = ops.split_at(1); + match &head[0] { + // If the head is a get, check that there are no following get + // operations. + TableOp::Basic(BasicTableOp::Get(_)) => { + !tail.iter().any(|op| { + matches!(op, TableOp::Basic(BasicTableOp::Get(_))) + }) + } + // Cannot start with any other basic op. + TableOp::Basic(_) => false, + // Recurse for grouped ops. 
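+                // For example, `{ { get a:b ; get a:b } | join; get c:d } | join`
+                // (a query from the parser tests) is validated by recursing
+                // into each level of nesting.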
+ TableOp::Grouped(GroupedTableOp { ops }) => { + ops.iter().all(Query::all_gets_at_query_start) + } + } + } + all_gets_at_query_start(&self.ops) + } + + // Return the non-get table transformations. + pub(crate) fn transformations(&self) -> &[TableOp] { + &self.ops[1..] + } + + // Split the query into either: + // + // - a list of nested queries and the remaining table ops in self, or + // - the flat query contained in self. + pub(crate) fn split(&self, query_end_time: DateTime) -> SplitQuery { + match &self.ops[0] { + TableOp::Basic(BasicTableOp::Get(_)) => { + SplitQuery::Flat(crate::oxql::Query { + parsed: self.clone(), + end_time: query_end_time, + }) + } + TableOp::Basic(_) => unreachable!(), + TableOp::Grouped(GroupedTableOp { ops }) => SplitQuery::Nested { + subqueries: ops + .iter() + .cloned() + .map(|parsed| crate::oxql::Query { + parsed, + end_time: query_end_time, + }) + .collect(), + transformations: self.ops[1..].to_vec(), + }, + } + } + + // Return the last referenced timestamp in the query, if any. + pub(crate) fn query_end_time(&self) -> Option> { + match &self.ops[0] { + TableOp::Basic(BasicTableOp::Get(_)) => self + .transformations() + .iter() + .filter_map(|op| { + let TableOp::Basic(BasicTableOp::Filter(filter)) = op + else { + return None; + }; + filter.last_timestamp() + }) + .max(), + TableOp::Basic(_) => unreachable!(), + TableOp::Grouped(GroupedTableOp { ops }) => { + let grouped_max = + ops.iter().filter_map(Self::query_end_time).max(); + let op_max = self + .transformations() + .iter() + .filter_map(|op| { + let TableOp::Basic(BasicTableOp::Filter(filter)) = op + else { + return None; + }; + filter.last_timestamp() + }) + .max(); + grouped_max.max(op_max) + } + } + } +} + +// Either a flat query or one with nested subqueries. +// +// OxQL supports subqueries. Though they can be nested, they must always be at +// the front of a query. This represents either a query that is flat, _or_ that +// prefix of subqueries and the following transformations. +#[derive(Clone, Debug, PartialEq)] +pub(crate) enum SplitQuery { + Flat(crate::oxql::Query), + Nested { + subqueries: Vec, + transformations: Vec, + }, +} diff --git a/oximeter/db/src/oxql/ast/table_ops/align.rs b/oximeter/db/src/oxql/ast/table_ops/align.rs new file mode 100644 index 0000000000..cf54ebc312 --- /dev/null +++ b/oximeter/db/src/oxql/ast/table_ops/align.rs @@ -0,0 +1,753 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! An AST node describing timeseries alignment operations. + +// Copyright 2024 Oxide Computer Company + +use crate::oxql::point::DataType; +use crate::oxql::point::MetricType; +use crate::oxql::point::Points; +use crate::oxql::point::ValueArray; +use crate::oxql::point::Values; +use crate::oxql::query::Alignment; +use crate::oxql::Error; +use crate::oxql::Table; +use crate::oxql::Timeseries; +use anyhow::Context; +use chrono::DateTime; +use chrono::TimeDelta; +use chrono::Utc; +use std::time::Duration; + +// The maximum factor by which an alignment operation may upsample data. +// +// This is a crude way to limit the size of a query result. We do not currently +// paginate the results of OxQL queries, so we need to find other ways to avoid +// DOS attacks due to large query results. +// +// While we also apply limits on the total number of samples fetched from the +// ClickHouse database, this alone is insufficient. 
For example, suppose we have +// two samples, spaced 1 second apart, which are then passed to an alignment +// table operation with a period of 1 nanosecond. Now you have a billion points! +// +// To prevent this, we restrict the total amount by which any alignment +// operation can upsample the data. Another way to think of it is that this +// limits the ratio between the requested period and the largest interval +// between timestamps in the data. +const MAX_UPSAMPLING_RATIO: u128 = 10; + +fn verify_max_upsampling_ratio( + timestamps: &[DateTime], + period: &Duration, +) -> Result<(), Error> { + let period = period.as_nanos(); + let max = MAX_UPSAMPLING_RATIO * period; + for (t1, t0) in timestamps.iter().skip(1).zip(timestamps.iter()) { + let Some(nanos) = t1.signed_duration_since(t0).num_nanoseconds() else { + anyhow::bail!("Overflow computing timestamp delta"); + }; + assert!(nanos > 0, "Timestamps should be sorted"); + let nanos = nanos as u128; + anyhow::ensure!( + nanos <= max, + "A table alignment operation may not upsample data by \ + more than a factor of {MAX_UPSAMPLING_RATIO}" + ); + } + Ok(()) +} + +/// An `align` table operation, used to produce data at well-defined periods. +/// +/// Alignment is important for any kind of aggregation. Data is actually +/// produced at variable intervals, under the control of the producer itself. +/// This means that in general, two timeseries that are related (say, the same +/// schema) may have data samples at slightly different timestamps. +/// +/// Alignment is used to produce data at the defined timestamps, so that samples +/// from multiple timeseries may be combined or correlated in meaningful ways. +#[derive(Clone, Debug, PartialEq)] +pub struct Align { + /// The alignment method, used to describe how data over the input period + /// is used to generate an output sample. + pub method: AlignmentMethod, + // TODO-completeness. We'd like to separate the concept of the period, the + // interval on which data is produced by this alignment, and the input + // window, the range of time in the past over which data is considered to + // produce the output values. + // + // For example, we might want to produce a moving average, by considering + // the last 1h of data, and produce an output value every 10m. Each of those + // output values would share 50m of data with the points on either side. + // + // For now, we'll enforce that the output period and input window are the + // same. + pub period: Duration, +} + +impl Align { + // Apply the alignment function to the set of tables. + pub(crate) fn apply( + &self, + tables: &[Table], + query_end: &DateTime, + ) -> Result, Error> { + match self.method { + AlignmentMethod::Interpolate => tables + .iter() + .map(|table| align_interpolate(table, query_end, &self.period)) + .collect(), + AlignmentMethod::MeanWithin => tables + .iter() + .map(|table| align_mean_within(table, query_end, &self.period)) + .collect(), + } + } +} + +/// An alignment method. +#[derive(Clone, Debug, PartialEq)] +pub enum AlignmentMethod { + /// Alignment is done by interpolating the output data at the specified + /// period. + Interpolate, + /// Alignment is done by computing the mean of the output data within the + /// specified period. + MeanWithin, +} + +// Align the timeseries in a table by computing the average within each output +// period. 
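+//
+// As a rough sketch for a gauge timeseries (numbers are illustrative only):
+// with a 10s period ending at the query end time, samples taken 1s, 4s, and 8s
+// before that end all land in the final window, and the output point for that
+// window is their mean, stamped with the window's end time.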
+fn align_mean_within( + table: &Table, + query_end: &DateTime, + period: &Duration, +) -> Result { + let mut output_table = Table::new(table.name()); + for timeseries in table.iter() { + let points = ×eries.points; + anyhow::ensure!( + points.dimensionality() == 1, + "Aligning multidimensional timeseries is not yet supported" + ); + let data_type = points.data_types().next().unwrap(); + anyhow::ensure!( + data_type.is_numeric(), + "Alignment by mean requires numeric data type, not {}", + data_type + ); + let metric_type = points.metric_type().unwrap(); + anyhow::ensure!( + matches!(metric_type, MetricType::Gauge | MetricType::Delta), + "Alignment by mean requires a gauge or delta metric, not {}", + metric_type, + ); + verify_max_upsampling_ratio(&points.timestamps, &period)?; + + // Always convert the output to doubles, when computing the mean. The + // output is always a gauge, so we do not need the start times of the + // input either. + // + // IMPORTANT: We compute the mean in the loop below from the back of the + // array (latest timestamp) to the front (earliest timestamp). They are + // appended to these arrays here in that _reversed_ order. These arrays + // are flipped before pushing them onto the timeseries at the end of the + // loop below. + let mut output_values = Vec::with_capacity(points.len()); + let mut output_timestamps = Vec::with_capacity(points.len()); + + // Convert the input to doubles now, so the tight loop below does less + // conversion / matching inside. + let input_points = match points.values(0).unwrap() { + ValueArray::Integer(values) => values + .iter() + .map(|maybe_int| maybe_int.map(|int| int as f64)) + .collect(), + ValueArray::Double(values) => values.clone(), + _ => unreachable!(), + }; + + // Alignment works as follows: + // + // - Start at the end of the timestamp array, working our way backwards + // in time. + // - Create the output timestamp from the current step. + // - Find all points in the input array that are within the alignment + // period. + // - Compute the mean of those. + let period_ = + TimeDelta::from_std(*period).context("time delta out of range")?; + let first_timestamp = points.timestamps[0]; + let mut ix: u32 = 0; + loop { + // Compute the next output timestamp, by shifting the query end time + // by the period and the index. + let time_offset = TimeDelta::from_std(ix * *period) + .context("time delta out of range")?; + let output_time = query_end + .checked_sub_signed(time_offset) + .context("overflow computing next output timestamp")?; + let window_start = output_time + .checked_sub_signed(period_) + .context("overflow computing next output window start")?; + + // The output time is before any of the data in the input array, + // we're done. It's OK for the _start time_ to be before any input + // timestamps. + if output_time < first_timestamp { + break; + } + + // Aggregate all values within this time window. + // + // This works a bit differently for gauge timeseries and deltas. + // Gauges are simpler, so let's consider them first. A point is + // "within" the window if the timestamp is within the window. Every + // point is either completely within or completely without the + // window, so we just add the values. + // + // Deltas have a start time, which makes things a bit more + // complicated. In that case, a point can overlap _partially_ with + // the output time window, and we'd like to take that partial + // overlap into account. 
To do that, we find relevant values which + // have either a start time or timestamp within the output window. + // We compute the fraction of overlap with the window, which is in + // [0.0, 1.0], and multiply the value by that fraction. One can + // think of this as a dot-product between the interval-overlap array + // and the value array, divided by the 1-norm, or number of nonzero + // entries. + let output_value = if matches!(metric_type, MetricType::Gauge) { + mean_gauge_value_in_window( + &points.timestamps, + &input_points, + window_start, + output_time, + ) + } else { + mean_delta_value_in_window( + points.start_times.as_ref().unwrap(), + &points.timestamps, + &input_points, + window_start, + output_time, + ) + }; + output_values.push(output_value); + + // In any case, we push the window's end time and increment to the + // next period. + output_timestamps.push(output_time); + ix += 1; + } + + // We've accumulated our input values into the output arrays, but in + // reverse order. Flip them and push onto the existing table, as a gauge + // timeseries. + let mut new_timeseries = Timeseries::new( + timeseries.fields.clone().into_iter(), + DataType::Double, + MetricType::Gauge, + ) + .unwrap(); + let values = + ValueArray::Double(output_values.into_iter().rev().collect()); + let timestamps = output_timestamps.into_iter().rev().collect(); + let values = Values { values, metric_type: MetricType::Gauge }; + new_timeseries.points = + Points { start_times: None, timestamps, values: vec![values] }; + new_timeseries.alignment = + Some(Alignment { end_time: *query_end, period: *period }); + output_table.insert(new_timeseries).unwrap(); + } + Ok(output_table) +} + +// Given an interval start and end, and a window start and end, compute the +// fraction of the _interval_ that the time window represents. +fn fraction_overlap_with_window( + interval_start: DateTime, + interval_end: DateTime, + window_start: DateTime, + window_end: DateTime, +) -> f64 { + assert!(interval_start < interval_end); + assert!(window_start < window_end); + let end = window_end.min(interval_end); + let start = window_start.max(interval_start); + let contained_size = (end - start).num_nanoseconds().unwrap() as f64; + if contained_size < 0.0 { + return 0.0; + } + let interval_size = + (interval_end - interval_start).num_nanoseconds().unwrap() as f64; + let fraction = contained_size / interval_size; + assert!(fraction >= 0.0); + assert!(fraction <= 1.0); + fraction +} + +// For a delta metric, compute the mean of points falling within the provided +// window. +// +// This uses both the start and end times when considering each point. Each +// point's value is weighted by the faction of overlap with the window. +fn mean_delta_value_in_window( + start_times: &[DateTime], + timestamps: &[DateTime], + input_points: &[Option], + window_start: DateTime, + window_end: DateTime, +) -> Option { + // We can find the indices where the timestamp and start times separately + // overlap the window of interest. Then any interval is potentially of + // interest if _either_ its start time or timestamp is within the window. + // + // Since the start times are <= the timestamps, we can take the min of those + // two to get the first point that overlaps at all, and the max to get the + // last. 
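+    //
+    // For instance (mirroring the unit test at the bottom of this file): a
+    // single delta interval [now - 1s, now] with value 3.0, viewed through a
+    // window covering the last 0.5s, overlaps that window by half, so it
+    // contributes 0.5 * 3.0 = 1.5 to the mean.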
+ let first_timestamp = timestamps.partition_point(|t| t <= &window_start); + let last_timestamp = timestamps.partition_point(|t| t <= &window_end); + let first_start_time = start_times.partition_point(|t| t <= &window_start); + let last_start_time = start_times.partition_point(|t| t <= &window_end); + let first_index = first_timestamp.min(first_start_time); + let last_index = last_timestamp.max(last_start_time); + + // Detect the possible case where the interval is entirely before or + // entirely after the window. + if first_index == last_index { + let t = *timestamps.get(first_timestamp)?; + let s = *start_times.get(first_timestamp)?; + if t < window_start || s > window_end { + return None; + } + let Some(val) = input_points[first_timestamp] else { + return None; + }; + let fraction = fraction_overlap_with_window( + start_times[first_start_time], + timestamps[first_timestamp], + window_start, + window_end, + ); + return Some(fraction * val); + } + + // Compute the overlap for all points which have some overlap. + let starts = &start_times[first_index..last_index]; + let times = ×tamps[first_index..last_index]; + let vals = &input_points[first_index..last_index]; + let iter = starts + .into_iter() + .copied() + .zip(times.into_iter().copied()) + .zip(vals.into_iter().copied()); + let count = (last_timestamp - first_timestamp).max(1) as f64; + let mut maybe_sum = None; + for it in iter.filter_map(|((start, time), maybe_val)| { + let Some(val) = maybe_val else { + return None; + }; + let fraction = + fraction_overlap_with_window(start, time, window_start, window_end); + Some(fraction * val) + }) { + *maybe_sum.get_or_insert(0.0) += it; + } + maybe_sum.map(|sum| sum / count) +} + +// For a gauge metric, compute the mean of points falling within the provided +// window. +fn mean_gauge_value_in_window( + timestamps: &[DateTime], + input_points: &[Option], + window_start: DateTime, + window_end: DateTime, +) -> Option { + // Find the position of the window start and end in the sorted + // array of input timestamps. The `partition_point()` method accepts + // a closure, which partitions the input into a prefix where the + // closure evaluates to true, and a suffix where it's false. It + // returns the first element in the suffix. + // + // So the first closure returns true for all timestamps we want to + // exclude, which are those up to and including the window start time. + // So we get the index of the first point strictly later than the + // window start. + // + // The second closure returns true for all points up to and + // including the output time as well. + let start_index = timestamps.partition_point(|t| t <= &window_start); + let output_index = timestamps.partition_point(|t| t <= &window_end); + assert!(output_index >= start_index); + + // Accumulate the values over this set of indices. + // + // If there are really zero points in this time interval, we add + // a missing value. 
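+    //
+    // For example (hypothetical values): timestamps [t0, t1, t2] with a window
+    // of (t0, t2] give start_index = 1 and output_index = 3, so the mean is
+    // taken over the points at t1 and t2.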
+ if start_index != output_index { + let mut maybe_sum = None; + for it in input_points[start_index..output_index] + .iter() + .filter_map(|x| x.as_ref().copied()) + { + *maybe_sum.get_or_insert(0.0) += it; + } + maybe_sum.map(|output_value| { + output_value / (output_index - start_index) as f64 + }) + } else { + None + } +} + +fn align_interpolate( + _table: &Table, + _query_end: &DateTime, + _period: &Duration, +) -> Result { + anyhow::bail!("Alignment with interpolation not yet implemented") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_fraction_overlap_with_window() { + let now = Utc::now(); + let window_start = now - Duration::from_secs(1); + let window_end = now; + let interval_start = window_start; + let interval_end = window_end; + assert_eq!( + fraction_overlap_with_window( + interval_start, + interval_end, + window_start, + window_end, + ), + 1.0 + ); + + let window_start = now - Duration::from_secs(1); + let window_end = now; + let interval_start = window_start; + let interval_end = now - Duration::from_secs_f64(0.5); + assert_eq!( + fraction_overlap_with_window( + interval_start, + interval_end, + window_start, + window_end, + ), + 1.0, + "This interval is aligned with the start time \ + of the window, and contained entirely within it, \ + so the fraction should be 1.0", + ); + + // If we reverse the window and interval, then the interval entirely + // contains the window, which is 50% of the interval. + let (window_start, window_end, interval_start, interval_end) = + (interval_start, interval_end, window_start, window_end); + assert_eq!( + fraction_overlap_with_window( + interval_start, + interval_end, + window_start, + window_end, + ), + 0.5, + "The window is entirely contained within the interval, \ + and covers 50% of it", + ); + + // If the interval is entirely contained in the window, we should have + // the entire interval as our fraction. + let window_start = now - Duration::from_secs(1); + let window_end = now; + let interval_start = window_start + Duration::from_secs_f64(0.25); + let interval_end = window_start + Duration::from_secs_f64(0.5); + assert_eq!( + fraction_overlap_with_window( + interval_start, + interval_end, + window_start, + window_end, + ), + 1.0, + "The interval is entirely contained within the window", + ); + + // This is aligned at the right with the window end. + let window_start = now - Duration::from_secs(1); + let window_end = now; + let interval_start = window_start + Duration::from_secs_f64(0.25); + let interval_end = window_end; + assert_eq!( + fraction_overlap_with_window( + interval_start, + interval_end, + window_start, + window_end, + ), + 1.0, + "The interval is aligned at right with the window, and \ + entirely contained within it, so the fraction should still \ + be 1.0", + ); + + // But if we reverse it again, the fraction should reveal itself. + let (window_start, window_end, interval_start, interval_end) = + (interval_start, interval_end, window_start, window_end); + assert_eq!( + fraction_overlap_with_window( + interval_start, + interval_end, + window_start, + window_end, + ), + 0.75, + "The window represents 75% of the interval", + ); + + // This interval does not overlap at all, to the left. 
+ let window_start = now - Duration::from_secs(1); + let window_end = now; + let interval_start = window_start - Duration::from_secs(2); + let interval_end = window_start - Duration::from_secs(1); + assert_eq!( + fraction_overlap_with_window( + interval_start, + interval_end, + window_start, + window_end, + ), + 0.0, + ); + + // This interval does not overlap at all, to the right. + let window_start = now - Duration::from_secs(1); + let window_end = now; + let interval_start = window_start + Duration::from_secs(1); + let interval_end = window_start + Duration::from_secs(2); + assert_eq!( + fraction_overlap_with_window( + interval_start, + interval_end, + window_start, + window_end, + ), + 0.0, + ); + } + + #[test] + fn test_mean_delta_value_in_window() { + let now = Utc::now(); + let start_times = &[ + now - Duration::from_secs(4), + now - Duration::from_secs(3), + now - Duration::from_secs(2), + now - Duration::from_secs(1), + ]; + let timestamps = &[ + now - Duration::from_secs(3), + now - Duration::from_secs(2), + now - Duration::from_secs(1), + now, + ]; + let input_points = &[Some(0.0), Some(1.0), Some(2.0), Some(3.0)]; + + let window_start = now - Duration::from_secs_f64(0.5); + let window_end = now; + let mean = mean_delta_value_in_window( + start_times, + timestamps, + input_points, + window_start, + window_end, + ) + .expect("This should overlap the last interval"); + assert_eq!( + mean, + input_points.last().unwrap().unwrap() / 2.0, + "This overlaps the last interval by half", + ); + } + + #[test] + fn test_mean_gauge_value_in_window() { + let now = Utc::now(); + let timestamps = &[ + now - Duration::from_secs(3), + now - Duration::from_secs(2), + now - Duration::from_secs(1), + now, + ]; + let input_points = &[Some(0.0), Some(1.0), Some(2.0), Some(3.0)]; + + let window_start = now - Duration::from_secs(4); + let window_end = now - Duration::from_secs(3); + let mean = mean_gauge_value_in_window( + timestamps, + input_points, + window_start, + window_end, + ) + .expect("This window should overlap the first timestamp"); + assert_eq!( + mean, 0.0, + "This window should overlap the first timestamp, so the \ + mean value should be the mean of the first point only" + ); + + let window_start = now - Duration::from_secs(4); + let window_end = now - Duration::from_secs(2); + let mean = mean_gauge_value_in_window( + timestamps, + input_points, + window_start, + window_end, + ) + .expect("This window should overlap the first two timestamps"); + assert_eq!( + mean, 0.5, + "This window should overlap the first two timestamps, so the \ + mean value should be the mean of the first two points" + ); + + let window_start = now - Duration::from_secs(3); + let window_end = now - Duration::from_secs(2); + let mean = mean_gauge_value_in_window( + timestamps, + input_points, + window_start, + window_end, + ) + .expect("This window should overlap the second timestamps"); + assert_eq!( + mean, 1.0, + "This window should overlap the second timestamp, so the \ + mean value should be the mean of the second point only." 
+ ); + + let window_start = now - Duration::from_secs(4); + let window_end = *timestamps.last().unwrap(); + let mean = mean_gauge_value_in_window( + timestamps, + input_points, + window_start, + window_end, + ) + .expect("This window should overlap the all timestamps"); + assert_eq!( + mean, + input_points.iter().map(|x| x.unwrap()).sum::() + / input_points.len() as f64, + "This window should overlap the all timestamps, so the \ + mean value should be the mean of all points", + ); + + let window_start = now - Duration::from_secs(3); + let window_end = now - Duration::from_secs_f64(2.5); + assert!( + mean_gauge_value_in_window( + timestamps, + input_points, + window_start, + window_end, + ) + .is_none(), + "This window should overlap none of the points" + ); + } + + #[test] + fn test_verify_max_upsampling_ratio() { + // We'll use a 1 second period, and ensure that we allow downsampling, + // and upsampling up to the max factor. That's 1/10th of a second, + // currently. + let now = Utc::now(); + let timestamps = &[now - Duration::from_secs(1), now]; + + // All values within the threshold. + for period in [ + Duration::from_secs_f64(0.5), + Duration::from_secs(10), + Duration::from_millis(100), + ] { + assert!(verify_max_upsampling_ratio(timestamps, &period).is_ok()); + } + + // Just below the threshold. + assert!(verify_max_upsampling_ratio( + timestamps, + &Duration::from_millis(99), + ) + .is_err()); + + // Sanity check for way below the threshold. + assert!(verify_max_upsampling_ratio( + timestamps, + &Duration::from_nanos(1), + ) + .is_err()); + + // Arrays where we can't compute an interval are fine. + assert!(verify_max_upsampling_ratio( + ×tamps[..1], + &Duration::from_nanos(1), + ) + .is_ok()); + assert!( + verify_max_upsampling_ratio(&[], &Duration::from_nanos(1),).is_ok() + ); + } + + #[test] + fn test_mean_delta_does_not_modify_missing_values() { + let now = Utc::now(); + let start_times = + &[now - Duration::from_secs(2), now - Duration::from_secs(1)]; + let timestamps = &[now - Duration::from_secs(1), now]; + let input_points = &[Some(1.0), None]; + let window_start = now - Duration::from_secs(1); + let window_end = now; + let mean = mean_delta_value_in_window( + start_times, + timestamps, + input_points, + window_start, + window_end, + ); + assert!( + mean.is_none(), + "This time window contains only a None value, which should not be \ + included in the sum" + ); + } + + #[test] + fn test_mean_gauge_does_not_modify_missing_values() { + let now = Utc::now(); + let timestamps = &[now - Duration::from_secs(1), now]; + let input_points = &[Some(1.0), None]; + let window_start = now - Duration::from_secs(1); + let window_end = now; + let mean = mean_gauge_value_in_window( + timestamps, + input_points, + window_start, + window_end, + ); + assert!( + mean.is_none(), + "This time window contains only a None value, which should not be \ + included in the sum" + ); + } +} diff --git a/oximeter/db/src/oxql/ast/table_ops/filter.rs b/oximeter/db/src/oxql/ast/table_ops/filter.rs new file mode 100644 index 0000000000..e97673c8f8 --- /dev/null +++ b/oximeter/db/src/oxql/ast/table_ops/filter.rs @@ -0,0 +1,1283 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! An AST node describing filtering table operations. 
+ +// Copyright 2024 Oxide Computer Company + +use crate::oxql::ast::cmp::Comparison; +use crate::oxql::ast::ident::Ident; +use crate::oxql::ast::literal::Literal; +use crate::oxql::ast::logical_op::LogicalOp; +use crate::oxql::point::DataType; +use crate::oxql::point::MetricType; +use crate::oxql::point::Points; +use crate::oxql::point::ValueArray; +use crate::oxql::query::special_idents; +use crate::oxql::Error; +use crate::oxql::Table; +use crate::oxql::Timeseries; +use anyhow::Context; +use chrono::DateTime; +use chrono::Utc; +use oximeter::FieldType; +use oximeter::FieldValue; +use regex::Regex; +use std::collections::BTreeSet; +use std::fmt; + +/// An AST node for the `filter` table operation. +/// +/// This can be a simple operation like `foo == "bar"` or a more complex +/// expression, such as: `filter hostname == "foo" || (hostname == "bar" +/// && id == "baz")`. +#[derive(Clone, Debug, PartialEq)] +pub struct Filter { + /// True if the whole expression is negated. + pub negated: bool, + /// The contained filtering expression, which may contain many expressions + /// joined by logical operators. + pub expr: FilterExpr, +} + +impl fmt::Display for Filter { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}({})", if self.negated { "!" } else { "" }, self.expr,) + } +} + +impl core::str::FromStr for Filter { + type Err = Error; + fn from_str(s: &str) -> Result { + crate::oxql::ast::grammar::query_parser::filter_expr(s) + .map_err(|e| anyhow::anyhow!("invalid filter expression: {e}")) + } +} + +// A crude limit on expression complexity, governing how many times we +// iteratively apply a DNF simplification before bailing out. +const EXPR_COMPLEXITY_ITERATIVE_LIMIT: usize = 32; + +// A crude limit on expression complexity, governing how many times we +// recurisvely apply a DNF simplification before bailing out. +const EXPR_COMPLEXITY_RECURSIVE_LIMIT: usize = 32; + +impl Filter { + /// Return the negation of this filter. + pub fn negate(&self) -> Filter { + Self { negated: !self.negated, ..self.clone() } + } + + /// Split the filter at top-level disjunctions. + /// + /// This is likely only useful after simplifying to DNF with + /// `simplify_to_dnf()`. + pub fn flatten_disjunctions(&self) -> Vec { + let mut out = vec![]; + self.flatten_disjunctions_inner(&mut out); + out + } + + fn flatten_disjunctions_inner(&self, dis: &mut Vec) { + // Recursion is only needed if this is an OR expression. In that case, + // we split the left and push it, and then recurse on the right. + // + // Note that we don't need left-recursion because the parser is strictly + // non-left-recursive. + if let FilterExpr::Compound(CompoundFilter { + left, + op: LogicalOp::Or, + right, + }) = &self.expr + { + dis.push(*left.clone()); + right.flatten_disjunctions_inner(dis); + } else { + // It's not an OR expression, or it is a simple filter expression. + // In either case, just push it directly, withouth recursing. + dis.push(self.clone()); + } + } + + /// Simplfy a filter expression to disjunctive normal form (DNF). + /// + /// Disjunctive normal form is one of a few canonical ways of writing a + /// boolean expression. It simplifies to a disjunction of conjunctions, + /// i.e., only has terms like `(a && b) || (c && d) || ...`. + /// + /// This method exists for the purposes of creating _independent_ pieces of + /// a filtering expression, each of which can be used to generate a new SQL + /// query run against ClickHouse. This is critical to support complicated + /// OxQL queries. 
Consider: + /// + /// ```ignore + /// get some_timeseries + /// | filter (foo == "bar") || (timestamp > @now() - 1m && foo == "baz") + /// ``` + /// + /// This requires fetching part of one timeseries, and all of another. One + /// cannot run this as a conjunction on the fields and then a query on the + /// measurements. It must be run in such a way to get the sets of keys + /// consistent with each term in the disjunction _independently_, so that + /// one can apply the timestamp filter to only the correct one. + /// + /// We use this method to generate the DNF, a form with only disjunctions of + /// conjunctions. That is, it's not possible to further distribute + /// conjunctions over disjunctions. + /// + /// Each disjunction is then a separate query against the fields table, where + /// we keep track of the keys in each. Each set of predicates and consistent + /// keys is then used later to fetch the measurements. + /// + /// # Notes + /// + /// There is a huge academic literature on this topic, part of the study of + /// formal languages and other areas theoretical computer science. These + /// references are mostly pretty dense and formal, though a few are really + /// useful. This [paper](https://www.researchgate.net/publication/220154187_A_Survey_of_Strategies_in_Program_Transformation_Systems) + /// is a good and accessible survey to the idea of translation systems -- + /// it's mostly focused on programming languages and compilers, but Figures + /// 7-9 in particular are about DNF. + /// + /// As usual, the Wikipedia page is a reasonable overview as well, + /// [here](https://en.wikipedia.org/wiki/Disjunctive_normal_form). We're + /// using the "syntactic" DNF conversion algorithm, essentially. This + /// involves a recursive application of + /// [de Morgan's rules](https://en.wikipedia.org/wiki/De_Morgan%27s_laws), + /// [involution / double-negation](https://en.wikipedia.org/wiki/Involution_(mathematics)), + /// distributivity of [Boolean operators](https://en.wikipedia.org/wiki/Boolean_algebra#Monotone_laws), + /// etc. + pub fn simplify_to_dnf(&self) -> Result { + self.simplify_to_dnf_impl(0) + } + + fn simplify_to_dnf_impl(&self, level: usize) -> Result { + anyhow::ensure!( + level < EXPR_COMPLEXITY_RECURSIVE_LIMIT, + "Maximum recursion level exceeded trying to simplify \ + logical expression to disjunctive normal form" + ); + let mut out = self.simplify_to_dnf_inner(level)?; + if &out == self { + return Ok(out); + } + // Continually apply simplifications as long as able. + // + // This makes me really nervous, so I'm adding an escape hatch that we + // only allow a few iterations. If we've not simplified within that, + // we'll just declare the expression too complicated to handle. + for _ in 0..EXPR_COMPLEXITY_ITERATIVE_LIMIT { + let out_ = out.simplify_to_dnf_inner(level)?; + if out_ == out { + return Ok(out_); + } + out = out_; + } + anyhow::bail!("Logical expression is too complicated to simplify") + } + + fn simplify_to_dnf_inner(&self, level: usize) -> Result { + let new = self.expr.simplify_to_dnf(level)?; + + // This matches the rule: + // + // !!x -> x + if self.negated && new.negated && new.is_simple() { + return Ok(new.negate()); + } + + // These two blocks match de Morgan's rules, which distribute a negation + // down and swap the logical operator. 
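+        //
+        // For example, one pass rewrites `!(a == 0 && b == 0)` into
+        // `!(a == 0) || !(b == 0)`, and a negated disjunction is handled
+        // symmetrically.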
+        if self.negated {
+            // This matches one of de Morgan's rules:
+            //
+            // !(x && y) -> !x || !y
+            if let FilterExpr::Compound(CompoundFilter {
+                left: x,
+                op: LogicalOp::And,
+                right: y,
+            }) = &new.expr
+            {
+                let expr = FilterExpr::Compound(CompoundFilter {
+                    left: Box::new(x.negate()),
+                    op: LogicalOp::Or,
+                    right: Box::new(y.negate()),
+                });
+                return Ok(Filter { negated: false, expr });
+            }
+
+            // This matches the other of de Morgan's rules:
+            //
+            // !(x || y) -> !x && !y
+            if let FilterExpr::Compound(CompoundFilter {
+                left: x,
+                op: LogicalOp::Or,
+                right: y,
+            }) = &new.expr
+            {
+                let expr = FilterExpr::Compound(CompoundFilter {
+                    left: Box::new(x.negate()),
+                    op: LogicalOp::And,
+                    right: Box::new(y.negate()),
+                });
+                return Ok(Filter { negated: false, expr });
+            }
+        }
+
+        // Nothing else to do, just return ourselves, though we do need to make
+        // sure we copy the negation from self as well.
+        Ok(Self { negated: self.negated, ..new })
+    }
+
+    // Merge this filter with another one, using the provided operator.
+    pub(crate) fn merge(&self, other: &Filter, op: LogicalOp) -> Self {
+        Self {
+            negated: false,
+            expr: FilterExpr::Compound(CompoundFilter {
+                left: Box::new(self.clone()),
+                op,
+                right: Box::new(other.clone()),
+            }),
+        }
+    }
+
+    // Apply the filter to the provided field.
+    //
+    // This returns `Ok(None)` if the filter doesn't apply. It returns `Ok(x)`
+    // if the filter does apply, where `x` is the logical application of the
+    // filter to the field. `true` means "keep this field", which is analogous
+    // to the `Iterator::filter()` method's signature.
+    //
+    // If the filter does apply, but is incompatible or incomparable, return an
+    // error.
+    fn filter_field(
+        &self,
+        name: &str,
+        value: &FieldValue,
+    ) -> Result<Option<bool>, Error> {
+        let result = match &self.expr {
+            FilterExpr::Simple(inner) => inner.filter_field(name, value),
+            FilterExpr::Compound(inner) => inner.filter_field(name, value),
+        };
+        result.map(|maybe_keep| maybe_keep.map(|keep| self.negated ^ keep))
+    }
+
+    // Apply the filter to the provided points.
+    fn filter_points(&self, points: &Points) -> Result<Points, Error> {
+        let to_keep = self.filter_points_inner(points)?;
+        points.filter(to_keep)
+    }
+
+    // Inner implementation of filtering points.
+    //
+    // Returns an array of bools, where true indicates the point should be
+    // kept.
+    fn filter_points_inner(&self, points: &Points) -> Result<Vec<bool>, Error> {
+        match &self.expr {
+            FilterExpr::Simple(inner) => {
+                inner.filter_points(self.negated, points)
+            }
+            FilterExpr::Compound(inner) => {
+                inner.filter_points(self.negated, points)
+            }
+        }
+    }
+
+    // Apply the filtering table operation.
+    pub(crate) fn apply(&self, tables: &[Table]) -> Result<Vec<Table>, Error> {
+        anyhow::ensure!(
+            !tables.is_empty(),
+            "Filtering operations require at least one table",
+        );
+        let mut output_tables = Vec::with_capacity(tables.len());
+        // Ensure that all the identifiers in this filter apply to the
+        // input timeseries. We can do this once at the beginning, because all
+        // the timeseries in a table have the same set of fields.
+        let first_timeseries = tables[0]
+            .iter()
+            .next()
+            .context("Table contains no timeseries to filter")?;
+        let ident_names = self.ident_names();
+
+        // There are extra, implied names that depend on the data type of the
+        // timeseries itself; check those as well.
+ let extras = implicit_field_names(first_timeseries); + let not_valid = ident_names + .iter() + .filter(|&&name| { + !(first_timeseries.fields.contains_key(name) + || extras.contains(name)) + }) + .collect::>(); + anyhow::ensure!( + not_valid.is_empty(), + "The filter expression contains identifiers that are not \ + valid for its input timeseries. Invalid identifiers: {:?}, \ + timeseries fields: {:?}", + not_valid, + ident_names.union(&extras), + ); + + // Filter each input table in succession. + for table in tables.iter() { + let mut timeseries = Vec::with_capacity(table.len()); + 'timeseries: for input in table.iter() { + // If the filter restricts any of the fields, remove this + // timeseries altogether. + for (name, value) in input.fields.iter() { + if let Some(false) = self.filter_field(name, value)? { + continue 'timeseries; + } + } + + // Apply the filter to the data points as well. + let points = self.filter_points(&input.points)?; + + // Similar to above, if the filter removes all data points in + // the timeseries, let's remove the timeseries altogether. + if points.is_empty() { + continue; + } + timeseries.push(Timeseries { + fields: input.fields.clone(), + points, + alignment: input.alignment, + }) + } + output_tables.push(Table::from_timeseries( + table.name(), + timeseries.into_iter(), + )?); + } + Ok(output_tables) + } + + // Return the last referenced timestamp by this filter, if any. + // + // This is the maximum timestamp, before which any filtered point must lie. + // This is used to determine the query end time. + pub(crate) fn last_timestamp(&self) -> Option> { + match &self.expr { + FilterExpr::Simple(inner) => inner.last_timestamp(), + FilterExpr::Compound(inner) => inner.last_timestamp(), + } + } + + // Return the name of all identifiers listed in this filter. + fn ident_names(&self) -> BTreeSet<&str> { + match &self.expr { + FilterExpr::Simple(inner) => { + let mut out = BTreeSet::new(); + out.insert(inner.ident.as_str()); + out + } + FilterExpr::Compound(inner) => { + let mut all = inner.left.ident_names(); + all.extend(inner.right.ident_names()); + all + } + } + } + + fn is_xor(&self) -> bool { + self.is_op(LogicalOp::Xor) + } + + fn is_op(&self, expected_op: LogicalOp) -> bool { + let FilterExpr::Compound(CompoundFilter { op, .. }) = &self.expr else { + return false; + }; + op == &expected_op + } + + // If this is an XOR, rewrite it to a disjunction of conjunctions. + // + // If it is not, return a clone of self. + fn rewrite_xor_to_disjunction(&self) -> Self { + let self_ = self.clone(); + if !self.is_xor() { + return self_; + } + let Filter { + negated, + expr: FilterExpr::Compound(CompoundFilter { left, right, .. }), + } = self_ + else { + unreachable!(); + }; + let left_ = CompoundFilter { + left: left.clone(), + op: LogicalOp::And, + right: Box::new(right.negate()), + }; + let right_ = CompoundFilter { + left: Box::new(left.negate()), + op: LogicalOp::And, + right, + }; + let expr = CompoundFilter { + left: Box::new(left_.to_filter()), + op: LogicalOp::Or, + right: Box::new(right_.to_filter()), + }; + Filter { negated, expr: FilterExpr::Compound(expr) } + } + + fn is_simple(&self) -> bool { + matches!(self.expr, FilterExpr::Simple(_)) + } +} + +/// Return the names of the implicit fields / columns that a filter can apply +/// to, based on the metric types of the contained data points. +fn implicit_field_names( + first_timeseries: &Timeseries, +) -> BTreeSet<&'static str> { + let mut out = BTreeSet::new(); + + // Everything has a timestamp! 
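+    //
+    // The remaining special identifiers (`datum`, `bins`, `counts`, and
+    // `start_time`) are only available for the metric / data type
+    // combinations matched below.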
+ out.insert(special_idents::TIMESTAMP); + let type_info = first_timeseries + .points + .metric_types() + .zip(first_timeseries.points.data_types()); + for (metric_type, data_type) in type_info { + match (metric_type, data_type) { + // Scalar gauges. + ( + MetricType::Gauge, + DataType::Integer + | DataType::Boolean + | DataType::Double + | DataType::String, + ) => { + out.insert(special_idents::DATUM); + } + // Histogram gauges. + ( + MetricType::Gauge, + DataType::IntegerDistribution | DataType::DoubleDistribution, + ) => { + out.insert(special_idents::BINS); + out.insert(special_idents::COUNTS); + } + // Scalars, either delta or cumulatives. + ( + MetricType::Delta | MetricType::Cumulative, + DataType::Integer | DataType::Double, + ) => { + out.insert(special_idents::DATUM); + out.insert(special_idents::START_TIME); + } + // Histograms, either delta or cumulative. + ( + MetricType::Delta | MetricType::Cumulative, + DataType::IntegerDistribution | DataType::DoubleDistribution, + ) => { + out.insert(special_idents::BINS); + out.insert(special_idents::COUNTS); + out.insert(special_idents::START_TIME); + } + // Impossible combinations + ( + MetricType::Delta | MetricType::Cumulative, + DataType::Boolean | DataType::String, + ) => unreachable!(), + } + } + out +} + +/// A filtering expression, used in the `filter` table operation. +#[derive(Clone, Debug, PartialEq)] +pub enum FilterExpr { + /// A single logical expression, e.g., `foo == "bar"`. + Simple(SimpleFilter), + /// Two logical expressions, e.g., `foo == "bar" || yes == false` + Compound(CompoundFilter), +} + +impl FilterExpr { + fn to_filter(&self) -> Filter { + Filter { negated: false, expr: self.clone() } + } + + fn simplify_to_dnf(&self, level: usize) -> Result { + match self { + FilterExpr::Simple(_) => Ok(self.to_filter()), + FilterExpr::Compound(CompoundFilter { left, op, right }) => { + // Apply recursively first. 
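+                //
+                // The rewrite rules below then distribute conjunctions over
+                // disjunctions. For example, an (illustrative) expression like
+                // `(a == 0 || b == 1) && c == 2` is rewritten to
+                // `(a == 0 && c == 2) || (b == 1 && c == 2)`.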
+ let left = left.simplify_to_dnf_impl(level + 1)?; + let right = right.simplify_to_dnf_impl(level + 1)?; + + // This matches the rule: + // + // (x || y) && z -> (x && z) || (y && z) + if let ( + FilterExpr::Compound(CompoundFilter { + left: x, + op: LogicalOp::Or, + right: y, + }), + LogicalOp::And, + FilterExpr::Simple(z), + ) = (&left.expr, op, &right.expr) + { + let left_ = Filter { + negated: false, + expr: FilterExpr::Compound(CompoundFilter { + left: x.clone(), + op: LogicalOp::And, + right: Box::new(z.to_filter()), + }), + }; + let right_ = Filter { + negated: false, + expr: FilterExpr::Compound(CompoundFilter { + left: y.clone(), + op: LogicalOp::And, + right: Box::new(z.to_filter()), + }), + }; + return Ok(Filter { + negated: false, + expr: FilterExpr::Compound(CompoundFilter { + left: Box::new(left_), + op: LogicalOp::Or, + right: Box::new(right_), + }), + }); + } + + // This matches the rule: + // + // z && (x || y) -> (z && x) || (z && y) + if let ( + FilterExpr::Simple(z), + LogicalOp::And, + FilterExpr::Compound(CompoundFilter { + left: x, + op: LogicalOp::Or, + right: y, + }), + ) = (&left.expr, op, &right.expr) + { + let left_ = Filter { + negated: false, + expr: FilterExpr::Compound(CompoundFilter { + left: Box::new(z.to_filter()), + op: LogicalOp::And, + right: x.clone(), + }), + }; + let right_ = Filter { + negated: false, + expr: FilterExpr::Compound(CompoundFilter { + left: Box::new(z.to_filter()), + op: LogicalOp::And, + right: y.clone(), + }), + }; + return Ok(Filter { + negated: false, + expr: FilterExpr::Compound(CompoundFilter { + left: Box::new(left_), + op: LogicalOp::Or, + right: Box::new(right_), + }), + }); + } + + // Lastly, simplify an XOR to its logical equivalent, which is + // in DNF. + let out = Filter { + negated: false, + expr: FilterExpr::Compound(CompoundFilter { + left: Box::new(left), + op: *op, + right: Box::new(right), + }), + }; + Ok(out.rewrite_xor_to_disjunction()) + } + } + } +} + +impl fmt::Display for FilterExpr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + FilterExpr::Simple(inner) => write!(f, "{inner}"), + FilterExpr::Compound(inner) => write!(f, "{inner}"), + } + } +} + +/// Two filter expressions joined by a logical operator. +#[derive(Clone, Debug, PartialEq)] +pub struct CompoundFilter { + /// The left subexpression. + pub left: Box, + /// The logical operator joining the two expressions. + pub op: LogicalOp, + /// The right subexpression. + pub right: Box, +} + +impl fmt::Display for CompoundFilter { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{} {} {}", self.left, self.op, self.right,) + } +} + +impl CompoundFilter { + fn to_filter(&self) -> Filter { + Filter { negated: false, expr: FilterExpr::Compound(self.clone()) } + } + + // Apply the filter to the provided field. + fn filter_field( + &self, + name: &str, + value: &FieldValue, + ) -> Result, Error> { + let left = self.left.filter_field(name, value)?; + let right = self.right.filter_field(name, value)?; + match (left, right) { + (None, None) => Ok(None), + (Some(x), None) | (None, Some(x)) => Ok(Some(x)), + (Some(left), Some(right)) => match self.op { + LogicalOp::And => Ok(Some(left && right)), + LogicalOp::Or => Ok(Some(left || right)), + LogicalOp::Xor => Ok(Some(left ^ right)), + }, + } + } + + // Apply the filter to the provided points. 
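+    //
+    // The left and right subexpressions are applied to the points
+    // independently, and the resulting boolean arrays are combined
+    // element-wise with this filter's logical operator; `negated` is then
+    // XOR-ed onto each element.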
+ fn filter_points( + &self, + negated: bool, + points: &Points, + ) -> Result, Error> { + let mut left = self.left.filter_points_inner(points)?; + let right = self.right.filter_points_inner(points)?; + match self.op { + LogicalOp::And => { + for i in 0..left.len() { + left[i] = negated ^ (left[i] & right[i]); + } + } + LogicalOp::Or => { + for i in 0..left.len() { + left[i] = negated ^ (left[i] | right[i]); + } + } + LogicalOp::Xor => { + for i in 0..left.len() { + left[i] = negated ^ (left[i] ^ right[i]); + } + } + } + Ok(left) + } + + fn last_timestamp(&self) -> Option> { + let left = self.left.last_timestamp(); + let right = self.right.last_timestamp(); + match (left, right) { + (None, None) => None, + (Some(single), None) | (None, Some(single)) => Some(single), + (Some(left), Some(right)) => Some(left.max(right)), + } + } +} + +/// A simple filter expression, comparing an identifier to a value. +#[derive(Clone, Debug, PartialEq)] +pub struct SimpleFilter { + /// The identifier being compared. + pub ident: Ident, + /// The comparison operator. + pub cmp: Comparison, + /// The value to compare the identifier against. + pub value: Literal, +} + +impl fmt::Display for SimpleFilter { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{} {} {}", self.ident, self.cmp, self.value,) + } +} + +impl SimpleFilter { + fn to_filter(&self) -> Filter { + Filter { negated: false, expr: FilterExpr::Simple(self.clone()) } + } + + // Apply this filter to the provided field. + // + // If the field name does not match the identifier in `self`, return + // `Ok(None)`, since this filter does not apply to the provided field. + // + // If the name matches and the type of `self` is compatible, return `Ok(x)` + // where `x` is the logical application of the filter to the field. + // + // If the field matches the name, but the type is not compatible, return an + // error. + fn filter_field( + &self, + name: &str, + value: &FieldValue, + ) -> Result, Error> { + // If the name does not match, this filter does not apply, and so we do not + // filter the field. + if self.ident.as_str() != name { + return Ok(None); + } + self.value.compare_field(value, self.cmp) + } + + pub(crate) fn value_type_is_compatible_with_field( + &self, + field_type: FieldType, + ) -> bool { + self.value.is_compatible_with_field(field_type) + } + + /// Return the expression as a string that can be applied safely in the + /// database. + pub(crate) fn as_db_safe_string(&self) -> String { + let expr = self.value.as_db_safe_string(); + let fn_name = self.cmp.as_db_function_name(); + format!("{}({}, {})", fn_name, self.ident, expr) + } + + // Returns an array of bools, where true indicates the point should be kept. 
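+    //
+    // Note that only the special identifiers `timestamp` and `datum`
+    // constrain points here; a filter naming any other identifier keeps every
+    // point (modulo negation), since field values are checked separately via
+    // `filter_field`.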
+ fn filter_points( + &self, + negated: bool, + points: &Points, + ) -> Result, Error> { + let ident = self.ident.as_str(); + if ident == "timestamp" { + self.filter_points_by_timestamp(negated, &points.timestamps) + } else if ident == "datum" { + anyhow::ensure!( + points.dimensionality() == 1, + "Filtering multidimensional values by datum is not yet supported" + ); + self.filter_points_by_datum(negated, points.values(0).unwrap()) + } else { + Ok(vec![!negated; points.len()]) + } + } + + fn filter_points_by_timestamp( + &self, + negated: bool, + timestamps: &[DateTime], + ) -> Result, Error> { + let Literal::Timestamp(timestamp) = &self.value else { + anyhow::bail!( + "Cannot compare non-timestamp filter against a timestamp" + ); + }; + match self.cmp { + Comparison::Eq => Ok(timestamps + .iter() + .map(|t| negated ^ (t == timestamp)) + .collect()), + Comparison::Ne => Ok(timestamps + .iter() + .map(|t| negated ^ (t != timestamp)) + .collect()), + Comparison::Gt => Ok(timestamps + .iter() + .map(|t| negated ^ (t > timestamp)) + .collect()), + Comparison::Ge => Ok(timestamps + .iter() + .map(|t| negated ^ (t >= timestamp)) + .collect()), + Comparison::Lt => Ok(timestamps + .iter() + .map(|t| negated ^ (t < timestamp)) + .collect()), + Comparison::Le => Ok(timestamps + .iter() + .map(|t| negated ^ (t <= timestamp)) + .collect()), + Comparison::Like => unreachable!(), + } + } + + fn filter_points_by_datum( + &self, + negated: bool, + values: &ValueArray, + ) -> Result, Error> { + match (&self.value, values) { + (Literal::Integer(int), ValueArray::Integer(ints)) => { + match self.cmp { + Comparison::Eq => Ok(ints + .iter() + .map(|maybe_int| { + maybe_int + .map(|i| negated ^ (i128::from(i) == *int)) + .unwrap_or(false) + }) + .collect()), + Comparison::Ne => Ok(ints + .iter() + .map(|maybe_int| { + maybe_int + .map(|i| negated ^ (i128::from(i) != *int)) + .unwrap_or(false) + }) + .collect()), + Comparison::Gt => Ok(ints + .iter() + .map(|maybe_int| { + maybe_int + .map(|i| negated ^ (i128::from(i) > *int)) + .unwrap_or(false) + }) + .collect()), + Comparison::Ge => Ok(ints + .iter() + .map(|maybe_int| { + maybe_int + .map(|i| negated ^ (i128::from(i) >= *int)) + .unwrap_or(false) + }) + .collect()), + Comparison::Lt => Ok(ints + .iter() + .map(|maybe_int| { + maybe_int + .map(|i| negated ^ (i128::from(i) < *int)) + .unwrap_or(false) + }) + .collect()), + Comparison::Le => Ok(ints + .iter() + .map(|maybe_int| { + maybe_int + .map(|i| negated ^ (i128::from(i) <= *int)) + .unwrap_or(false) + }) + .collect()), + Comparison::Like => unreachable!(), + } + } + (Literal::Double(double), ValueArray::Double(doubles)) => { + match self.cmp { + Comparison::Eq => Ok(doubles + .iter() + .map(|maybe_double| { + maybe_double + .map(|d| negated ^ (d == *double)) + .unwrap_or(false) + }) + .collect()), + Comparison::Ne => Ok(doubles + .iter() + .map(|maybe_double| { + maybe_double + .map(|d| negated ^ (d != *double)) + .unwrap_or(false) + }) + .collect()), + Comparison::Gt => Ok(doubles + .iter() + .map(|maybe_double| { + maybe_double + .map(|d| negated ^ (d > *double)) + .unwrap_or(false) + }) + .collect()), + Comparison::Ge => Ok(doubles + .iter() + .map(|maybe_double| { + maybe_double + .map(|d| negated ^ (d >= *double)) + .unwrap_or(false) + }) + .collect()), + Comparison::Lt => Ok(doubles + .iter() + .map(|maybe_double| { + maybe_double + .map(|d| negated ^ (d < *double)) + .unwrap_or(false) + }) + .collect()), + Comparison::Le => Ok(doubles + .iter() + .map(|maybe_double| { + maybe_double + .map(|d| 
negated ^ (d <= *double)) + .unwrap_or(false) + }) + .collect()), + Comparison::Like => unreachable!(), + } + } + (Literal::String(string), ValueArray::String(strings)) => { + let string = string.as_str(); + match self.cmp { + Comparison::Eq => Ok(strings + .iter() + .map(|maybe_string| { + maybe_string + .as_deref() + .map(|s| negated ^ (s == string)) + .unwrap_or(false) + }) + .collect()), + Comparison::Ne => Ok(strings + .iter() + .map(|maybe_string| { + maybe_string + .as_deref() + .map(|s| negated ^ (s != string)) + .unwrap_or(false) + }) + .collect()), + Comparison::Gt => Ok(strings + .iter() + .map(|maybe_string| { + maybe_string + .as_deref() + .map(|s| negated ^ (s > string)) + .unwrap_or(false) + }) + .collect()), + Comparison::Ge => Ok(strings + .iter() + .map(|maybe_string| { + maybe_string + .as_deref() + .map(|s| negated ^ (s >= string)) + .unwrap_or(false) + }) + .collect()), + Comparison::Lt => Ok(strings + .iter() + .map(|maybe_string| { + maybe_string + .as_deref() + .map(|s| negated ^ (s < string)) + .unwrap_or(false) + }) + .collect()), + Comparison::Le => Ok(strings + .iter() + .map(|maybe_string| { + maybe_string + .as_deref() + .map(|s| negated ^ (s <= string)) + .unwrap_or(false) + }) + .collect()), + Comparison::Like => { + let re = Regex::new(string)?; + Ok(strings + .iter() + .map(|maybe_string| { + maybe_string + .as_deref() + .map(|s| negated ^ re.is_match(s)) + .unwrap_or(false) + }) + .collect()) + } + } + } + (Literal::Boolean(boolean), ValueArray::Boolean(booleans)) => { + match self.cmp { + Comparison::Eq => Ok(booleans + .iter() + .map(|maybe_boolean| { + maybe_boolean + .map(|b| negated ^ (b == *boolean)) + .unwrap_or(false) + }) + .collect()), + Comparison::Ne => Ok(booleans + .iter() + .map(|maybe_boolean| { + maybe_boolean + .map(|b| negated ^ (b != *boolean)) + .unwrap_or(false) + }) + .collect()), + Comparison::Gt => Ok(booleans + .iter() + .map(|maybe_boolean| { + maybe_boolean + .map(|b| negated ^ (b & !(*boolean))) + .unwrap_or(false) + }) + .collect()), + Comparison::Ge => Ok(booleans + .iter() + .map(|maybe_boolean| { + maybe_boolean + .map(|b| negated ^ (b >= *boolean)) + .unwrap_or(false) + }) + .collect()), + Comparison::Lt => Ok(booleans + .iter() + .map(|maybe_boolean| { + maybe_boolean + .map(|b| negated ^ (!b & *boolean)) + .unwrap_or(false) + }) + .collect()), + Comparison::Le => Ok(booleans + .iter() + .map(|maybe_boolean| { + maybe_boolean + .map(|b| negated ^ (b <= *boolean)) + .unwrap_or(false) + }) + .collect()), + Comparison::Like => unreachable!(), + } + } + (_, _) => { + let lit_type = match &self.value { + Literal::Uuid(_) => "UUID", + Literal::Duration(_) => "duration", + Literal::Timestamp(_) => "timestamp", + Literal::IpAddr(_) => "IP address", + Literal::Integer(_) => "integer", + Literal::Double(_) => "double", + Literal::String(_) => "string", + Literal::Boolean(_) => "boolean", + }; + anyhow::bail!( + "Cannot compare {} literal against values of type {}", + lit_type, + values.data_type(), + ) + } + } + } + + fn last_timestamp(&self) -> Option> { + if self.ident.as_str() == "timestamp" + && matches!( + self.cmp, + Comparison::Lt | Comparison::Le | Comparison::Eq + ) + { + let Literal::Timestamp(t) = self.value else { + return None; + }; + Some(t) + } else { + None + } + } +} + +#[cfg(test)] +mod tests { + use crate::oxql::ast::grammar::query_parser; + use crate::oxql::ast::logical_op::LogicalOp; + use crate::oxql::point::MetricType; + use crate::oxql::point::Points; + use crate::oxql::point::ValueArray; + use 
crate::oxql::point::Values; + use chrono::Utc; + use oximeter::FieldValue; + use std::time::Duration; + use uuid::Uuid; + + #[test] + fn test_atom_filter_double_points() { + let start_times = None; + let timestamps = + vec![Utc::now(), Utc::now() + Duration::from_secs(1000)]; + let values = vec![Values { + values: ValueArray::Double(vec![Some(0.0), Some(2.0)]), + metric_type: MetricType::Gauge, + }]; + let points = Points { start_times, timestamps, values }; + + // This filter should remove the first point based on its timestamp. + let t = Utc::now() + Duration::from_secs(10); + let q = + format!("filter timestamp > @{}", t.format("%Y-%m-%dT%H:%M:%S")); + let filter = query_parser::filter(q.as_str()).unwrap(); + let out = filter.filter_points(&points).unwrap(); + assert!(out.len() == 1); + assert_eq!( + out.values(0).unwrap().as_double().unwrap()[0], + points.values(0).unwrap().as_double().unwrap()[1], + ); + + // And this one the second point based on the datum + let filter = query_parser::filter("filter datum < 1.0").unwrap(); + let out = filter.filter_points(&points).unwrap(); + assert!(out.len() == 1); + assert_eq!( + out.values(0).unwrap().as_double().unwrap()[0], + points.values(0).unwrap().as_double().unwrap()[0], + ); + } + + #[test] + fn test_atom_filter_points_wrong_type() { + let start_times = None; + let timestamps = + vec![Utc::now(), Utc::now() + Duration::from_secs(1000)]; + let values = vec![Values { + values: ValueArray::Double(vec![Some(0.0), Some(2.0)]), + metric_type: MetricType::Gauge, + }]; + let points = Points { start_times, timestamps, values }; + + let filter = + query_parser::filter("filter datum < \"something\"").unwrap(); + assert!(filter.filter_points(&points).is_err()); + } + + #[test] + fn test_all_ident_names() { + let f = query_parser::filter("filter timestamp > @now() && datum < 1") + .unwrap(); + assert_eq!( + f.ident_names(), + ["datum", "timestamp"].into_iter().collect() + ); + + let f = query_parser::filter( + "filter timestamp > @now() - 1m && timestamp < @now()", + ) + .unwrap(); + let idents = f.ident_names(); + assert_eq!(idents.len(), 1); + assert_eq!(idents.iter().next().unwrap(), &"timestamp"); + } + + #[test] + #[allow(clippy::impossible_comparisons)] + fn test_filter_field_logic() { + for op in [LogicalOp::And, LogicalOp::Or, LogicalOp::Xor] { + let s = format!("filter (x > 10) {op} (x < 0)"); + let filter = query_parser::filter(&s).unwrap(); + let cases = &[11, 10, 5, 0, -1]; + for &val in cases.iter() { + let pass = match op { + LogicalOp::And => (val > 10) && (val < 0), + LogicalOp::Or => (val > 10) || (val < 0), + LogicalOp::Xor => (val > 10) ^ (val < 0), + }; + let result = filter + .filter_field("x", &FieldValue::I32(val)) + .expect("Filter should be considered comparable") + .expect("Filter should apply to field of the same name"); + assert_eq!( + result, + pass, + "Filter '{}' should {} the value {}", + filter, + if pass { "pass" } else { "not pass" }, + val, + ); + } + + // This names a different field, so should not apply. + assert_eq!( + filter + .filter_field("y", &FieldValue::I32(11)) + .expect("Filter should be considered comparable"), + None, + "Filter should not apply, since it names a different field" + ); + + // These values should not be comparable at all, so we'll return an + // error. 
+ let incomparable = &[ + FieldValue::String("foo".into()), + FieldValue::Uuid(Uuid::new_v4()), + FieldValue::IpAddr("127.0.0.1".parse().unwrap()), + FieldValue::Bool(false), + ]; + for na in incomparable.iter() { + filter + .filter_field("x", na) + .expect_err("These should not be comparable at all"); + } + } + } + + #[test] + fn test_simplify_to_dnf() { + let cases = &[ + // Simple cases that should not be changed + ("a == 0", "a == 0"), + ("!(a == 0)", "!(a == 0)"), + ("a == 0 || b == 1", "a == 0 || b == 1"), + ("a == 0 && b == 1", "a == 0 && b == 1"), + + // Rewrite of XOR + ("a == 0 ^ b == 1", "(a == 0 && !(b == 1)) || (!(a == 0) && (b == 1))"), + + // Simple applications of distribution rules. + // + // Distribute conjunction over disjunction. + ("a == 0 && (b == 1 || c == 2)", "(a == 0 && b == 1) || (a == 0 && c == 2)"), + ("a == 0 && (b == 1 || c == 2 || d == 3)", "(a == 0 && b == 1) || (a == 0 && c == 2) || (a == 0 && d == 3)"), + ("a == 0 && (b == 1 || c == 2 || d == 3 || e == 4)", "(a == 0 && b == 1) || (a == 0 && c == 2) || (a == 0 && d == 3) || (a == 0 && e == 4)"), + ]; + for (input, expected) in cases.iter() { + let parsed_input = query_parser::filter_expr(input).unwrap(); + let simplified = parsed_input.simplify_to_dnf().unwrap(); + let parsed_expected = query_parser::filter_expr(expected).unwrap(); + assert_eq!( + simplified, + parsed_expected, + "\ninput expression: {}\nparsed to: {}\nsimplifed to: {}\nexpected: {}\n", + input, + parsed_input, + simplified, + expected, + ); + } + } + + #[test] + fn test_dnf_conversion_fails_on_extremely_long_expressions() { + let atom = "a == 0"; + let or_chain = std::iter::repeat(atom) + .take(super::EXPR_COMPLEXITY_ITERATIVE_LIMIT + 1) + .collect::>() + .join(" || "); + let expr = format!("{atom} && ({or_chain})"); + let parsed = query_parser::filter_expr(&expr).unwrap(); + assert!( + parsed.simplify_to_dnf().is_err(), + "Should fail for extremely long logical expressions" + ); + } + + #[test] + fn test_dnf_conversion_fails_on_extremely_deep_expressions() { + let atom = "a == 0"; + let mut expr = atom.to_string(); + for _ in 0..super::EXPR_COMPLEXITY_RECURSIVE_LIMIT + 1 { + expr = format!("{atom} && ({expr})"); + } + let parsed = query_parser::filter_expr(&expr).unwrap(); + assert!( + parsed.simplify_to_dnf().is_err(), + "Should fail for extremely deep logical expressions" + ); + } +} diff --git a/oximeter/db/src/oxql/ast/table_ops/get.rs b/oximeter/db/src/oxql/ast/table_ops/get.rs new file mode 100644 index 0000000000..f0ef22c2f6 --- /dev/null +++ b/oximeter/db/src/oxql/ast/table_ops/get.rs @@ -0,0 +1,15 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! AST node for the `get` table operation. + +// Copyright 2024 Oxide Computer Company + +use oximeter::TimeseriesName; + +/// An AST node like: `get foo:bar` +#[derive(Clone, Debug, PartialEq)] +pub struct Get { + pub timeseries_name: TimeseriesName, +} diff --git a/oximeter/db/src/oxql/ast/table_ops/group_by.rs b/oximeter/db/src/oxql/ast/table_ops/group_by.rs new file mode 100644 index 0000000000..da2b1413db --- /dev/null +++ b/oximeter/db/src/oxql/ast/table_ops/group_by.rs @@ -0,0 +1,746 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
AST node for the `group_by` operation. + +// Copyright 2024 Oxide Computer Company + +use chrono::DateTime; +use chrono::Utc; + +use crate::oxql::ast::ident::Ident; +use crate::oxql::point::DataType; +use crate::oxql::point::MetricType; +use crate::oxql::point::ValueArray; +use crate::oxql::Error; +use crate::oxql::Table; +use crate::oxql::Timeseries; +use crate::TimeseriesKey; +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; + +/// A table operation for grouping data by fields, apply a reducer to the +/// remaining. +#[derive(Clone, Debug, PartialEq)] +pub struct GroupBy { + pub identifiers: Vec, + pub reducer: Reducer, +} + +impl GroupBy { + // Apply the group_by table operation. + pub(crate) fn apply(&self, tables: &[Table]) -> Result, Error> { + anyhow::ensure!( + tables.len() == 1, + "Group by operations require exactly one table", + ); + let table = &tables[0]; + anyhow::ensure!( + table.is_aligned(), + "Input tables to a `group_by` must be aligned" + ); + + match self.reducer { + Reducer::Mean => self.reduce_mean(table), + Reducer::Sum => self.reduce_sum(table), + } + } + + fn check_input_timeseries(input: &Timeseries) -> Result<(), Error> { + anyhow::ensure!(input.points.len() > 0, "Timeseries cannot be empty"); + + // For now, we can only apply this to 1-D timeseries. + anyhow::ensure!( + input.points.dimensionality() == 1, + "Group-by with multi-dimensional timeseries is not yet supported" + ); + let data_type = input.points.data_types().next().unwrap(); + anyhow::ensure!( + data_type.is_numeric(), + "Only numeric data types can be grouped, not {}", + data_type, + ); + let metric_type = input.points.metric_types().next().unwrap(); + anyhow::ensure!( + !matches!(metric_type, MetricType::Cumulative), + "Cumulative metric types cannot be grouped", + ); + Ok(()) + } + + // Reduce points in each group by summing. + fn reduce_sum(&self, table: &Table) -> Result, Error> { + assert_eq!(self.reducer, Reducer::Sum); + let mut output_table = Table::new(table.name()); + let kept_fields: Vec<_> = + self.identifiers.iter().map(Ident::as_str).collect(); + + for input in table.iter() { + Self::check_input_timeseries(input)?; + + // Throw away the fields in this timeseries that are not in the + // group_by list. + let dropped = input.copy_with_fields(&kept_fields)?; + let key = dropped.key(); + + // Fetch the existing timeseries, if one exists. If one does _not_ exist, + // we'll insert it as is, without converting. That's because we're + // just summing, not averaging. + match output_table.get_mut(key) { + Some(existing) => { + // No casting is done here, we're simply adding T + + // T -> T. + let new_values = dropped.points.values(0).unwrap(); + let existing_values = existing.points.values(0).unwrap(); + match (new_values, existing_values) { + ( + ValueArray::Double(new_values), + ValueArray::Double(existing_values), + ) => { + let new_timestamps = &dropped.points.timestamps; + + // We will be merging the new data with the + // existing, but borrow-checking limits the degree + // to which we can easily do this on the `existing` + // entry in the output table. Instead, aggregate + // everything into a copy of the expected data. + let mut timestamps = + existing.points.timestamps.clone(); + let mut values = existing_values.clone(); + + // Merge in the new values, so long as they actually + // exist. That is, we can just skip missing points + // in this round, since they do not contribute to + // the reduced value. 
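+                            //
+                            // As an illustrative example: if the existing
+                            // points are [(t0, 1.0), (t2, 3.0)] and the new
+                            // points are [(t1, 2.0), (t2, 4.0)], the merged
+                            // output is [(t0, 1.0), (t1, 2.0), (t2, 7.0)].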
+ for (new_timestamp, new_value) in new_timestamps + .iter() + .zip(new_values) + .filter_map(|(timestamp, value)| { + if let Some(val) = value { + Some((*timestamp, *val)) + } else { + None + } + }) + { + // We're really doing binary search, on both the + // sample count map and the data array. They + // both must exist, or both not, or we've done + // our accounting incorrectly. + let maybe_index = + timestamps.binary_search(&new_timestamp); + match maybe_index { + Err(insert_at) => { + // This is a new timestamp. Insert it + // into the output timeseries. + timestamps + .insert(insert_at, new_timestamp); + values + .insert(insert_at, Some(new_value)); + } + Ok(ix) => { + // This is an existing + // timestamp, so we only need to + // add the new value. If the value + // didn't exist before, replace it. + *values[ix].get_or_insert(0.0) += + new_value; + } + } + } + + // Replace the existing output timeseries's + // timestamps and data arrays. + std::mem::swap( + &mut existing.points.timestamps, + &mut timestamps, + ); + existing + .points + .values_mut(0) + .unwrap() + .swap(ValueArray::Double(values)); + } + ( + ValueArray::Integer(new_values), + ValueArray::Integer(existing_values), + ) => { + let new_timestamps = &dropped.points.timestamps; + + // We will be merging the new data with the + // existing, but borrow-checking limits the degree + // to which we can easily do this on the `existing` + // entry in the output table. Instead, aggregate + // everything into a copy of the expected data. + let mut timestamps = + existing.points.timestamps.clone(); + let mut values = existing_values.clone(); + + // Merge in the new values, so long as they actually + // exist. That is, we can just skip missing points + // in this round, since they do not contribute to + // the reduced value. + for (new_timestamp, new_value) in new_timestamps + .iter() + .zip(new_values) + .filter_map(|(timestamp, value)| { + if let Some(val) = value { + Some((*timestamp, *val)) + } else { + None + } + }) + { + // We're really doing binary search, on both the + // sample count map and the data array. They + // both must exist, or both not, or we've done + // our accounting incorrectly. + let maybe_index = + timestamps.binary_search(&new_timestamp); + match maybe_index { + Err(insert_at) => { + // This is a new timestamp. Insert it + // into the output timeseries. + timestamps + .insert(insert_at, new_timestamp); + values + .insert(insert_at, Some(new_value)); + } + Ok(ix) => { + // This is an existing + // timestamp, so we only need to + // add the new value. If the value + // didn't exist before, replace it. + *values[ix].get_or_insert(0) += + new_value; + } + } + } + + // Replace the existing output timeseries's + // timestamps and data arrays. + std::mem::swap( + &mut existing.points.timestamps, + &mut timestamps, + ); + existing + .points + .values_mut(0) + .unwrap() + .swap(ValueArray::Integer(values)); + } + _ => unreachable!(), + } + } + None => output_table.insert(dropped)?, + } + } + Ok(vec![output_table]) + } + + // Reduce points in each group by averaging. + fn reduce_mean(&self, table: &Table) -> Result, Error> { + assert_eq!(self.reducer, Reducer::Mean); + let mut output_table = Table::new(table.name()); + let kept_fields: Vec<_> = + self.identifiers.iter().map(Ident::as_str).collect(); + + // Keep track of the number of values at each output timestamp, within + // each group. + // + // As we iterate through timeseries, we reduce in-group points, so long + // as they occur at the same timestamp. 
And while timeseries must all be + // aligned the same way, they need not actually have identical + // timestamps. So what we're producing on the output is data at the + // union of all the input timestamps. + // + // These arrays keeps the count of values at each time, and may be either + // expanded or have its values incremented. Note that they're all + // doubles because we will be reducing at the end by dividing the sum at + // each point by the counts. + let mut sample_counts_by_group: BTreeMap< + TimeseriesKey, + BTreeMap, f64>, + > = BTreeMap::new(); + + for input in table.iter() { + Self::check_input_timeseries(input)?; + + // Throw away the fields in this timeseries that are not in the + // group_by list. + let dropped = input.copy_with_fields(&kept_fields)?; + let key = dropped.key(); + + // Fetch the existing timeseries, if one exists. If one does _not_ exist, + // we'll insert the table with the data type converted to a double, + // since we're always averaging. + match output_table.get_mut(key) { + Some(existing) => { + // Cast the new points to doubles, since we'll be + // aggregating. + let new_points = + dropped.points.cast(&[DataType::Double])?; + let ValueArray::Double(new_values) = + new_points.values(0).unwrap() + else { + unreachable!(); + }; + let new_timestamps = &new_points.timestamps; + + // We will be merging the new data with the + // existing, but borrow-checking limits the degree + // to which we can easily do this on the `existing` + // entry in the output table. Instead, aggregate + // everything into a copy of the expected data. + let mut timestamps = existing.points.timestamps.clone(); + let mut values = existing + .points + .values(0) + .unwrap() + .as_double() + .unwrap() + .clone(); + + // Also fetch a reference to the existing counts by + // timestamp for this group. This should exist. + let counts = sample_counts_by_group.get_mut(&key).expect( + "Should already have some sample counts for this group", + ); + + // Merge in the new values, so long as they actually + // exist. That is, we can just skip missing points + // in this round, since they do not contribute to + // the reduced value. + for (new_timestamp, new_value) in new_timestamps + .iter() + .zip(new_values) + .filter_map(|(timestamp, value)| { + if let Some(val) = value { + Some((*timestamp, *val)) + } else { + None + } + }) + { + // We're really doing binary search, on both the + // sample count map and the data array. They + // both must exist, or both not, or we've done + // our accounting incorrectly. + let maybe_index = + timestamps.binary_search(&new_timestamp); + let count = counts.entry(new_timestamp); + match (count, maybe_index) { + (Entry::Vacant(entry), Err(insert_at)) => { + // This is a new timestamp. Insert it + // into the output timeseries, and count + // it. + timestamps.insert(insert_at, new_timestamp); + values.insert(insert_at, Some(new_value)); + entry.insert(1.0); + } + (Entry::Occupied(mut entry), Ok(ix)) => { + // This is an existing timestamp. _Add_ + // it into the output timeseries, and + // count it. Its timestamp already + // exists. If the value was previously None, + // replace it now. + *values[ix].get_or_insert(0.0) += new_value; + *entry.get_mut() += 1.0; + } + (_, _) => { + panic!( + "In-group counts and output \ + values must both exist or \ + both be missing" + ); + } + } + } + + // Replace the existing output timeseries's + // timestamps and data arrays. 
+ std::mem::swap( + &mut existing.points.timestamps, + &mut timestamps, + ); + existing + .points + .values_mut(0) + .unwrap() + .swap(ValueArray::Double(values)); + } + None => { + // There were no previous points for this group. + // + // We'll cast to doubles, but _keep_ any missing samples + // (None) that were in there. Those will have a "count" of + // 0, so that we don't incorrectly over-divide in the case + // where there are both missing and non-missing samples. + let new_timeseries = dropped.cast(&[DataType::Double])?; + let values = new_timeseries + .points + .values(0) + .unwrap() + .as_double() + .unwrap(); + // Insert a count of 1.0 for each timestamp remaining, and + // _zero_ for any where the values are none. + let counts = new_timeseries + .points + .timestamps + .iter() + .zip(values) + .map(|(timestamp, maybe_value)| { + let count = f64::from(maybe_value.is_some()); + (*timestamp, count) + }) + .collect(); + let old = sample_counts_by_group.insert(key, counts); + assert!(old.is_none(), "Should not have counts entry for first timeseries in the group"); + output_table.insert(new_timeseries)?; + } + } + } + + // Since we're computing the mean, we need to divide each output value + // by the number of values that went into it. + for each in output_table.iter_mut() { + let counts = sample_counts_by_group + .get(&each.key()) + .expect("key should have been inserted earlier"); + let ValueArray::Double(values) = each.points.values_mut(0).unwrap() + else { + unreachable!(); + }; + for (val, count) in values.iter_mut().zip(counts.values()) { + if let Some(x) = val.as_mut() { + *x /= *count; + } + } + } + Ok(vec![output_table]) + } +} + +/// A reduction operation applied to unnamed columns during a group by. +#[derive(Clone, Copy, Debug, Default, PartialEq)] +pub enum Reducer { + #[default] + Mean, + Sum, +} + +#[cfg(test)] +mod tests { + use super::{GroupBy, Reducer}; + use crate::oxql::{ + ast::{ + ident::Ident, + table_ops::align::{Align, AlignmentMethod}, + }, + point::{DataType, MetricType, ValueArray}, + Table, Timeseries, + }; + use chrono::{DateTime, Utc}; + use oximeter::FieldValue; + use std::{collections::BTreeMap, time::Duration}; + + // Which timeseries the second data point is missing from. + #[derive(Clone, Copy, Debug)] + enum MissingValue { + Neither, + First, + Both, + } + + #[derive(Clone, Copy, Debug)] + struct TestConfig { + missing_value: MissingValue, + overlapping_times: bool, + reducer: Reducer, + } + + #[derive(Clone, Debug)] + #[allow(dead_code)] + struct TestTable { + aligned_table: Table, + grouped_table: Table, + query_end: DateTime, + timestamps: Vec>, + } + + impl TestTable { + fn new(cfg: TestConfig) -> Self { + let query_end = Utc::now(); + let mut timestamps = vec![ + query_end - Duration::from_secs(2), + query_end - Duration::from_secs(1), + query_end, + ]; + + // Create the first timeseries. + // + // This has two fields, one of which we'll group by. There are three + // timepoints of double values. 
+ let mut fields = BTreeMap::new(); + fields.insert("int".to_string(), FieldValue::U8(0)); + fields.insert( + "name".to_string(), + FieldValue::String("whodat".into()), + ); + let mut ts0 = Timeseries::new( + fields.into_iter(), + DataType::Double, + MetricType::Gauge, + ) + .unwrap(); + ts0.points.start_times = None; + ts0.points.timestamps = timestamps.clone(); + *ts0.points.values_mut(0).unwrap() = ValueArray::Double(vec![ + Some(1.0), + if matches!( + cfg.missing_value, + MissingValue::First | MissingValue::Both + ) { + None + } else { + Some(2.0) + }, + Some(3.0), + ]); + + // Create the second timeseries. + // + // This is nearly the same, and shares the same field value for the + // "int" field. When we group, we should reduce these two timeseries + // together. + let mut fields = BTreeMap::new(); + fields.insert("int".to_string(), FieldValue::U8(0)); + fields.insert( + "name".to_string(), + FieldValue::String("whodis".into()), + ); + let mut ts1 = Timeseries::new( + fields.into_iter(), + DataType::Double, + MetricType::Gauge, + ) + .unwrap(); + ts1.points.start_times = None; + + // Non-overlapping in this test setup means that we just shift one + // value from this array backward in time by one additional second. + // So we should have timestamps like: + // + // ts0: [ _, t0, t1, t2 ] + // ts1: [ t0, _, t1, t2 ] + // + // When reducing, t0 is never changed, and t1-t2 are always reduced + // together, if the values are present. + ts1.points.timestamps = if cfg.overlapping_times { + timestamps.clone() + } else { + let mut new_timestamps = timestamps.clone(); + new_timestamps[0] = new_timestamps[0] - Duration::from_secs(1); + timestamps.insert(0, new_timestamps[0]); + new_timestamps + }; + *ts1.points.values_mut(0).unwrap() = ValueArray::Double(vec![ + Some(2.0), + if matches!(cfg.missing_value, MissingValue::Both) { + None + } else { + Some(3.0) + }, + Some(4.0), + ]); + + let mut table = Table::new("foo"); + table.insert(ts0).unwrap(); + table.insert(ts1).unwrap(); + + // Align the actual table, based on the input, and apply the right + // group-by + let align = Align { + method: AlignmentMethod::MeanWithin, + period: Duration::from_secs(1), + }; + let aligned_tables = align.apply(&[table], &query_end).unwrap(); + let group_by = GroupBy { + identifiers: vec![Ident("int".into())], + reducer: cfg.reducer, + }; + let grouped_tables = group_by.apply(&aligned_tables).unwrap(); + assert_eq!( + grouped_tables.len(), + 1, + "Group by should produce exaclty 1 table" + ); + let grouped_table = grouped_tables.into_iter().next().unwrap(); + let aligned_table = aligned_tables.into_iter().next().unwrap(); + + let test = + Self { timestamps, aligned_table, grouped_table, query_end }; + + // These checks are all valid for grouping in general, independent + // of the exact missing values or reducer. 
+ assert_eq!( + test.grouped_table.len(), + 1, + "Should have grouped both timeseries down to 1" + ); + let grouped_timeseries = test.grouped_table.iter().next().unwrap(); + assert_eq!( + grouped_timeseries.fields.len(), + 1, + "Should have only one grouped-by field" + ); + assert_eq!( + grouped_timeseries.fields.get("int").unwrap(), + &FieldValue::U8(0), + "Grouped-by field was not maintained correctly" + ); + let points = &grouped_timeseries.points; + assert_eq!(points.dimensionality(), 1, "Points should still be 1D"); + assert_eq!( + points.start_times, None, + "Points should not have start times" + ); + assert_eq!( + points.timestamps, test.timestamps, + "Points do not have correct timestamps" + ); + + test + } + } + + #[test] + fn test_group_by() { + const TEST_CASES: &[(TestConfig, &[Option])] = &[ + ( + TestConfig { + missing_value: MissingValue::Neither, + overlapping_times: true, + reducer: Reducer::Mean, + }, + // This is the most basic case, where we simply average all the + // values together. They exactly line up and none are missing. + &[Some(1.5), Some(2.5), Some(3.5)], + ), + ( + TestConfig { + missing_value: MissingValue::Neither, + overlapping_times: true, + reducer: Reducer::Sum, + }, + // This is the next-simplest case, where we simply sum all the + // values together. They exactly line up and none are missing. + &[Some(3.0), Some(5.0), Some(7.0)], + ), + ( + TestConfig { + missing_value: MissingValue::Neither, + overlapping_times: false, + reducer: Reducer::Mean, + }, + // In this case, the timestamps don't all overlap, though some + // of them do. In particular, the arrays are shifted by one + // timestamp relative to each other, so there are 2 extra + // values. The one value that does overlap is averaged, and the + // other two are unchanged. + &[Some(2.0), Some(1.0), Some(2.5), Some(3.5)], + ), + ( + TestConfig { + missing_value: MissingValue::Neither, + overlapping_times: false, + reducer: Reducer::Sum, + }, + // Here, we should have 4 output samples because the timestamps + // don't overlap. The second input timeseries has its first + // point shifted back by one second. That means the first two + // values are just from one array (no reduction), while the next + // two are reduced as usual. + &[Some(2.0), Some(1.0), Some(5.0), Some(7.0)], + ), + ( + TestConfig { + missing_value: MissingValue::First, + overlapping_times: true, + reducer: Reducer::Mean, + }, + // In this case, we have a missing value for the middle + // timestamp of the first input timeseries. That means we should + // still have 3 output samples, but the second point isn't an + // aggregation, it's just the input value, from the second + // timeseries. + &[Some(1.5), Some(3.0), Some(3.5)], + ), + ( + TestConfig { + missing_value: MissingValue::First, + overlapping_times: true, + reducer: Reducer::Sum, + }, + // Same as above, but we're summing, not averaging. + &[Some(3.0), Some(3.0), Some(7.0)], + ), + ( + TestConfig { + missing_value: MissingValue::First, + overlapping_times: false, + reducer: Reducer::Mean, + }, + // We need 4 output points again here, but we also have a + // missing value. So we'll take the first value from the second + // timeseries; the second from the first; the second from the + // second directly, since its corresponding point is missing in + // the first, and then the average of both in the last point. 
+ &[Some(2.0), Some(1.0), Some(3.0), Some(3.5)], + ), + ( + TestConfig { + missing_value: MissingValue::First, + overlapping_times: false, + reducer: Reducer::Sum, + }, + // Same as above, but summing, instead of averaging. + &[Some(2.0), Some(1.0), Some(3.0), Some(7.0)], + ), + ( + TestConfig { + missing_value: MissingValue::Both, + overlapping_times: true, + reducer: Reducer::Mean, + }, + // In this case, the 2nd timepoint is missing from both + // timeseries. We should preserve that as a missing value in the + // output. + &[Some(1.5), None, Some(3.5)], + ), + ( + TestConfig { + missing_value: MissingValue::Both, + overlapping_times: true, + reducer: Reducer::Sum, + }, + // Same as above, but summing instead of averaging. + &[Some(3.0), None, Some(7.0)], + ), + ]; + for (test_config, expected_data) in TEST_CASES.iter() { + let test_table = TestTable::new(*test_config); + let grouped_timeseries = + test_table.grouped_table.iter().next().unwrap(); + let points = &grouped_timeseries.points; + let values = points.values(0).unwrap().as_double().unwrap(); + assert_eq!( + values, expected_data, + "Timeseries values were not grouped correctly, \ + test_config = {test_config:?}" + ); + } + } +} diff --git a/oximeter/db/src/oxql/ast/table_ops/join.rs b/oximeter/db/src/oxql/ast/table_ops/join.rs new file mode 100644 index 0000000000..3c150a4acf --- /dev/null +++ b/oximeter/db/src/oxql/ast/table_ops/join.rs @@ -0,0 +1,385 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! An AST node describing join table operations. + +// Copyright 2024 Oxide Computer Company + +use crate::oxql::point::MetricType; +use crate::oxql::point::Points; +use crate::oxql::point::Values; +use crate::oxql::Error; +use crate::oxql::Table; +use anyhow::Context; + +/// An AST node for a natural inner join. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct Join; +impl Join { + // Apply the group_by table operation. + pub(crate) fn apply(&self, tables: &[Table]) -> Result, Error> { + anyhow::ensure!( + tables.len() > 1, + "Join operations require more than one table", + ); + let mut tables = tables.iter().cloned().enumerate(); + let (_, mut out) = tables.next().unwrap(); + anyhow::ensure!( + out.is_aligned(), + "Input tables for a join operation must be aligned" + ); + let metric_types = out + .iter() + .next() + .context("Input tables for a join operation may not be empty")? + .points + .metric_types() + .collect::>(); + ensure_all_metric_types(metric_types.iter().copied())?; + let alignment = out.alignment(); + assert!(alignment.is_some()); + + for (i, next_table) in tables { + anyhow::ensure!( + next_table.alignment() == alignment, + "All tables to a join operator must have the same \ + alignment. 
Expected alignment: {:?}, found a table \ + aligned with: {:?}", + alignment.unwrap(), + next_table.alignment(), + ); + let name = next_table.name().to_string(); + for next_timeseries in next_table.into_iter() { + let new_types = + next_timeseries.points.metric_types().collect::>(); + ensure_all_metric_types(new_types.iter().copied())?; + anyhow::ensure!( + metric_types == new_types, + "Input tables do not all share the same metric types" + ); + + let key = next_timeseries.key(); + let Some(timeseries) = out.iter_mut().find(|t| t.key() == key) + else { + anyhow::bail!( + "Join failed, input table {} does not \ + contain a timeseries with key {}", + i, + key, + ); + }; + + // Joining the timeseries is done by stacking together the + // values that have the same timestamp. + // + // If two value arrays have different timestamps, which is + // possible if they're derived from two separately-aligned + // tables, then we need to correctly ensure that: + // + // 1. They have the same alignment, and + // 2. We merge the timepoints rather than simply creating a + // ragged array of points. + timeseries.points = inner_join_point_arrays( + ×eries.points, + &next_timeseries.points, + )?; + } + // We'll also update the name, to indicate the joined data. + out.name.push(','); + out.name.push_str(&name); + } + Ok(vec![out]) + } +} + +// Given two arrays of points, stack them together at matching timepoints. +// +// For time points in either which do not have a corresponding point in the +// other, the entire time point is elided. +fn inner_join_point_arrays( + left: &Points, + right: &Points, +) -> Result { + // Create an output array with roughly the right capacity, and double the + // number of dimensions. We're trying to stack output value arrays together + // along the dimension axis. + let data_types = + left.data_types().chain(right.data_types()).collect::>(); + let metric_types = + left.metric_types().chain(right.metric_types()).collect::>(); + let mut out = Points::with_capacity( + left.len().max(right.len()), + data_types.iter().copied(), + metric_types.iter().copied(), + )?; + + // Iterate through each array until one is exhausted. We're only inserting + // values from both arrays where the timestamps actually match, since this + // is an inner join. We may want to insert missing values where timestamps + // do not match on either side, when we support an outer join of some kind. + let n_left_dim = left.values.len(); + let mut left_ix = 0; + let mut right_ix = 0; + while left_ix < left.len() && right_ix < right.len() { + let left_timestamp = left.timestamps[left_ix]; + let right_timestamp = right.timestamps[right_ix]; + if left_timestamp == right_timestamp { + out.timestamps.push(left_timestamp); + push_concrete_values( + &mut out.values[..n_left_dim], + &left.values, + left_ix, + ); + push_concrete_values( + &mut out.values[n_left_dim..], + &right.values, + right_ix, + ); + left_ix += 1; + right_ix += 1; + } else if left_timestamp < right_timestamp { + left_ix += 1; + } else { + right_ix += 1; + } + } + Ok(out) +} + +// Push the `i`th value from each dimension of `from` onto `to`. 
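+//
+// Both slices must have the same length and matching data types in each
+// dimension; each branch below appends a single sample from the input array
+// onto the end of the corresponding output array.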
+fn push_concrete_values(to: &mut [Values], from: &[Values], i: usize) { + assert_eq!(to.len(), from.len()); + for (output, input) in to.iter_mut().zip(from.iter()) { + let input_array = &input.values; + let output_array = &mut output.values; + assert_eq!(input_array.data_type(), output_array.data_type()); + if let Ok(ints) = input_array.as_integer() { + output_array.as_integer_mut().unwrap().push(ints[i]); + continue; + } + if let Ok(doubles) = input_array.as_double() { + output_array.as_double_mut().unwrap().push(doubles[i]); + continue; + } + if let Ok(bools) = input_array.as_boolean() { + output_array.as_boolean_mut().unwrap().push(bools[i]); + continue; + } + if let Ok(strings) = input_array.as_string() { + output_array.as_string_mut().unwrap().push(strings[i].clone()); + continue; + } + if let Ok(dists) = input_array.as_integer_distribution() { + output_array + .as_integer_distribution_mut() + .unwrap() + .push(dists[i].clone()); + continue; + } + if let Ok(dists) = input_array.as_double_distribution() { + output_array + .as_double_distribution_mut() + .unwrap() + .push(dists[i].clone()); + continue; + } + unreachable!(); + } +} + +// Return an error if any metric types are not suitable for joining. +fn ensure_all_metric_types( + mut metric_types: impl ExactSizeIterator, +) -> Result<(), Error> { + anyhow::ensure!( + metric_types + .all(|mt| matches!(mt, MetricType::Gauge | MetricType::Delta)), + "Join operation requires timeseries with gauge or \ + delta metric types", + ); + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::oxql::point::DataType; + use crate::oxql::point::Datum; + use crate::oxql::point::ValueArray; + use chrono::Utc; + use std::time::Duration; + + #[test] + fn test_push_concrete_values() { + let mut points = Points::with_capacity( + 2, + [DataType::Integer, DataType::Double].into_iter(), + [MetricType::Gauge, MetricType::Gauge].into_iter(), + ) + .unwrap(); + + // Push a concrete value for the integer dimension + let from_ints = vec![Values { + values: ValueArray::Integer(vec![Some(1)]), + metric_type: MetricType::Gauge, + }]; + push_concrete_values(&mut points.values[..1], &from_ints, 0); + + // And another for the double dimension. + let from_doubles = vec![Values { + values: ValueArray::Double(vec![Some(2.0)]), + metric_type: MetricType::Gauge, + }]; + push_concrete_values(&mut points.values[1..], &from_doubles, 0); + + assert_eq!( + points.dimensionality(), + 2, + "Points should have 2 dimensions", + ); + let ints = points.values[0].values.as_integer().unwrap(); + assert_eq!( + ints.len(), + 1, + "Should have pushed one point in the first dimension" + ); + assert_eq!( + ints[0], + Some(1), + "Should have pushed 1 onto the first dimension" + ); + let doubles = points.values[1].values.as_double().unwrap(); + assert_eq!( + doubles.len(), + 1, + "Should have pushed one point in the second dimension" + ); + assert_eq!( + doubles[0], + Some(2.0), + "Should have pushed 2.0 onto the second dimension" + ); + } + + #[test] + fn test_join_point_arrays() { + let now = Utc::now(); + + // Create a set of integer points to join with. + // + // This will have two timestamps, one of which will match the points + // below that are merged in. 
+ let int_points = Points { + start_times: None, + timestamps: vec![ + now - Duration::from_secs(3), + now - Duration::from_secs(2), + now, + ], + values: vec![Values { + values: ValueArray::Integer(vec![Some(1), Some(2), Some(3)]), + metric_type: MetricType::Gauge, + }], + }; + + // Create an additional set of double points. + // + // This also has two timepoints, one of which matches with the above, + // and one of which does not. + let double_points = Points { + start_times: None, + timestamps: vec![ + now - Duration::from_secs(3), + now - Duration::from_secs(1), + now, + ], + values: vec![Values { + values: ValueArray::Double(vec![ + Some(4.0), + Some(5.0), + Some(6.0), + ]), + metric_type: MetricType::Gauge, + }], + }; + + // Merge the arrays. + let merged = + inner_join_point_arrays(&int_points, &double_points).unwrap(); + + // Basic checks that we merged in the right values and have the right + // types and dimensions. + assert_eq!( + merged.dimensionality(), + 2, + "Should have appended the dimensions from each input array" + ); + assert_eq!(merged.len(), 2, "Should have merged two common points",); + assert_eq!( + merged.data_types().collect::>(), + &[DataType::Integer, DataType::Double], + "Should have combined the data types of the input arrays" + ); + assert_eq!( + merged.metric_types().collect::>(), + &[MetricType::Gauge, MetricType::Gauge], + "Should have combined the metric types of the input arrays" + ); + + // Check the actual values of the array. + let mut points = merged.iter_points(); + + // The first and last timepoint overlapped between the two arrays, so we + // should have both of them as concrete samples. + let pt = points.next().unwrap(); + assert_eq!(pt.start_time, None, "Gauges don't have a start time"); + assert_eq!( + *pt.timestamp, int_points.timestamps[0], + "Should have taken the first input timestamp from both arrays", + ); + assert_eq!( + *pt.timestamp, double_points.timestamps[0], + "Should have taken the first input timestamp from both arrays", + ); + let values = pt.values; + assert_eq!(values.len(), 2, "Should have 2 dimensions"); + assert_eq!( + &values[0], + &(Datum::Integer(Some(&1)), MetricType::Gauge), + "Should have pulled value from first integer array." + ); + assert_eq!( + &values[1], + &(Datum::Double(Some(&4.0)), MetricType::Gauge), + "Should have pulled value from second double array." + ); + + // And the next point + let pt = points.next().unwrap(); + assert_eq!(pt.start_time, None, "Gauges don't have a start time"); + assert_eq!( + *pt.timestamp, int_points.timestamps[2], + "Should have taken the input timestamp from both arrays", + ); + assert_eq!( + *pt.timestamp, double_points.timestamps[2], + "Should have taken the input timestamp from both arrays", + ); + let values = pt.values; + assert_eq!(values.len(), 2, "Should have 2 dimensions"); + assert_eq!( + &values[0], + &(Datum::Integer(Some(&3)), MetricType::Gauge), + "Should have pulled value from first integer array." + ); + assert_eq!( + &values[1], + &(Datum::Double(Some(&6.0)), MetricType::Gauge), + "Should have pulled value from second double array." + ); + + // And there should be no other values. 
+ assert!(points.next().is_none(), "There should be no more points"); + } +} diff --git a/oximeter/db/src/oxql/ast/table_ops/mod.rs b/oximeter/db/src/oxql/ast/table_ops/mod.rs new file mode 100644 index 0000000000..d9930962f8 --- /dev/null +++ b/oximeter/db/src/oxql/ast/table_ops/mod.rs @@ -0,0 +1,76 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! AST nodes for table operations. + +// Copyright 2024 Oxide Computer Company + +pub mod align; +pub mod filter; +pub mod get; +pub mod group_by; +pub mod join; + +use self::align::Align; +use self::filter::Filter; +use self::group_by::GroupBy; +use self::join::Join; +use crate::oxql::ast::Query; +use crate::oxql::Error; +use crate::oxql::Table; +use chrono::DateTime; +use chrono::Utc; +use oximeter::TimeseriesName; + +/// A basic table operation, the atoms of an OxQL query. +#[derive(Clone, Debug, PartialEq)] +pub enum BasicTableOp { + Get(TimeseriesName), + Filter(Filter), + GroupBy(GroupBy), + Join(Join), + Align(Align), +} + +impl BasicTableOp { + pub(crate) fn apply( + &self, + tables: &[Table], + query_end: &DateTime, + ) -> Result, Error> { + match self { + BasicTableOp::Get(_) => panic!("Should not apply get table ops"), + BasicTableOp::Filter(f) => f.apply(tables), + BasicTableOp::GroupBy(g) => g.apply(tables), + BasicTableOp::Join(j) => j.apply(tables), + BasicTableOp::Align(a) => a.apply(tables, query_end), + } + } +} + +/// A grouped table operation is a subquery in OxQL. +#[derive(Clone, Debug, PartialEq)] +pub struct GroupedTableOp { + pub ops: Vec, +} + +/// Any kind of OxQL table operation. +#[derive(Clone, Debug, PartialEq)] +pub enum TableOp { + Basic(BasicTableOp), + Grouped(GroupedTableOp), +} + +impl TableOp { + pub(crate) fn apply( + &self, + tables: &[Table], + query_end: &DateTime, + ) -> Result, Error> { + let TableOp::Basic(basic) = self else { + panic!("Should not apply grouped table ops"); + }; + basic.apply(tables, query_end) + } +} diff --git a/oximeter/db/src/oxql/mod.rs b/oximeter/db/src/oxql/mod.rs new file mode 100644 index 0000000000..b93d75b859 --- /dev/null +++ b/oximeter/db/src/oxql/mod.rs @@ -0,0 +1,39 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! The Oximeter Query Language, OxQL. + +// Copyright 2024 Oxide Computer Company + +use peg::error::ParseError as PegError; +use peg::str::LineCol; + +pub mod ast; +pub mod point; +pub mod query; +pub mod table; + +pub use self::query::Query; +pub use self::table::Table; +pub use self::table::Timeseries; +pub use anyhow::Error; + +// Format a PEG parsing error into a nice anyhow error. +fn fmt_parse_error(source: &str, err: PegError) -> Error { + use std::fmt::Write; + let mut out = + format!("Error at {}:{}", err.location.line, err.location.column); + const CONTEXT: usize = 24; + let start = err.location.offset.saturating_sub(CONTEXT); + let end = err.location.offset.saturating_add(CONTEXT).min(source.len()); + if let Some(context) = source.get(start..end) { + let prefix_len = out.len() + 2; + writeln!(out, ": .. 
{context} ..").unwrap(); + let left_pad = err.location.offset - start + 3 + prefix_len; + let right_pad = end - err.location.offset + 3 + prefix_len; + writeln!(out, "{:right_pad$}", ' ', ' ').unwrap(); + } + writeln!(out, "Expected: {}", err).unwrap(); + anyhow::anyhow!(out) +} diff --git a/oximeter/db/src/oxql/point.rs b/oximeter/db/src/oxql/point.rs new file mode 100644 index 0000000000..e12214aaf0 --- /dev/null +++ b/oximeter/db/src/oxql/point.rs @@ -0,0 +1,2040 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Definition of data points for OxQL. + +// Copyright 2024 Oxide Computer Company + +use super::Error; +use anyhow::Context; +use chrono::DateTime; +use chrono::Utc; +use num::ToPrimitive; +use oximeter::DatumType; +use oximeter::Measurement; +use schemars::JsonSchema; +use serde::Deserialize; +use serde::Serialize; +use std::fmt; + +/// The type of each individual data point's value in a timeseries. +#[derive( + Clone, Copy, Debug, Deserialize, Hash, JsonSchema, PartialEq, Serialize, +)] +#[serde(rename_all = "snake_case")] +pub enum DataType { + /// A 64-bit integer. + Integer, + /// A 64-bit float. + Double, + /// A boolean. + Boolean, + /// A string. + String, + /// A distribution, a sequence of integer bins and counts. + IntegerDistribution, + /// A distribution, a sequence of double bins and integer counts. + DoubleDistribution, +} + +impl DataType { + /// True if this is a numeric scalar type. + pub fn is_numeric(&self) -> bool { + matches!(self, DataType::Integer | DataType::Double) + } +} + +impl TryFrom for DataType { + type Error = Error; + + fn try_from(datum_type: DatumType) -> Result { + let data_type = match datum_type { + DatumType::Bool => DataType::Boolean, + DatumType::I8 + | DatumType::U8 + | DatumType::I16 + | DatumType::U16 + | DatumType::I32 + | DatumType::U32 + | DatumType::I64 + | DatumType::U64 + | DatumType::CumulativeI64 + | DatumType::CumulativeU64 => DataType::Integer, + DatumType::F32 + | DatumType::F64 + | DatumType::CumulativeF32 + | DatumType::CumulativeF64 => DataType::Double, + DatumType::String => DataType::String, + DatumType::HistogramI8 + | DatumType::HistogramU8 + | DatumType::HistogramI16 + | DatumType::HistogramU16 + | DatumType::HistogramI32 + | DatumType::HistogramU32 + | DatumType::HistogramI64 + | DatumType::HistogramU64 => DataType::IntegerDistribution, + DatumType::HistogramF32 | DatumType::HistogramF64 => { + DataType::DoubleDistribution + } + DatumType::Bytes => { + anyhow::bail!("Unsupported datum type: {}", datum_type) + } + }; + Ok(data_type) + } +} + +impl fmt::Display for DataType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +/// The type of the metric itself, indicating what its values represent. +#[derive( + Clone, Copy, Debug, Deserialize, Hash, JsonSchema, PartialEq, Serialize, +)] +#[serde(rename_all = "snake_case")] +pub enum MetricType { + /// The value represents an instantaneous measurement in time. + Gauge, + /// The value represents a difference between two points in time. + Delta, + /// The value represents an accumulation between two points in time. + Cumulative, +} + +impl fmt::Display for MetricType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +// A converted oximeter datum, used internally. 
+// +// This is used when computing deltas between cumulative measurements, and so +// only represents the possible cumulative types. +#[derive(Clone, Debug, PartialEq)] +enum CumulativeDatum { + Integer(i64), + Double(f64), + IntegerDistribution(Distribution), + DoubleDistribution(Distribution), +} + +impl CumulativeDatum { + // Construct a datum from a cumulative type, failing if the measurement is + // not cumulative. + fn from_cumulative(meas: &Measurement) -> Result { + let datum = match meas.datum() { + oximeter::Datum::CumulativeI64(val) => { + CumulativeDatum::Integer(val.value()) + } + oximeter::Datum::CumulativeU64(val) => { + let int = val + .value() + .try_into() + .context("Overflow converting u64 to i64")?; + CumulativeDatum::Integer(int) + } + oximeter::Datum::CumulativeF32(val) => { + CumulativeDatum::Double(val.value().into()) + } + oximeter::Datum::CumulativeF64(val) => { + CumulativeDatum::Double(val.value()) + } + oximeter::Datum::HistogramI8(hist) => hist.into(), + oximeter::Datum::HistogramU8(hist) => hist.into(), + oximeter::Datum::HistogramI16(hist) => hist.into(), + oximeter::Datum::HistogramU16(hist) => hist.into(), + oximeter::Datum::HistogramI32(hist) => hist.into(), + oximeter::Datum::HistogramU32(hist) => hist.into(), + oximeter::Datum::HistogramI64(hist) => hist.into(), + oximeter::Datum::HistogramU64(hist) => hist.try_into()?, + oximeter::Datum::HistogramF32(hist) => hist.into(), + oximeter::Datum::HistogramF64(hist) => hist.into(), + other => anyhow::bail!( + "Input datum of type {} is not cumulative", + other.datum_type(), + ), + }; + Ok(datum) + } +} + +/// A single list of values, for one dimension of a timeseries. +#[derive(Clone, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] +pub struct Values { + // The data values. + pub(super) values: ValueArray, + // The type of this metric. + pub(super) metric_type: MetricType, +} + +impl Values { + // Construct an empty array of values to hold the provided types. + fn with_capacity( + size: usize, + data_type: DataType, + metric_type: MetricType, + ) -> Self { + Self { values: ValueArray::with_capacity(size, data_type), metric_type } + } + + fn len(&self) -> usize { + self.values.len() + } +} + +/// Reference type describing a single point in a `Points` array. +/// +/// The `Points` type is column-major, in that the timestamps and each data +/// value (one for each dimension) are stored in separate arrays, of the same +/// length. This type holds references to the relevant items in each array that +/// constitutes a single point. +#[derive(Clone, Debug, PartialEq)] +pub struct Point<'a> { + /// The start time of this point, if any. + pub start_time: Option<&'a DateTime>, + /// The timestamp for this point. + pub timestamp: &'a DateTime, + /// One datum and its metric type, for each dimension in the point. + /// + /// The datum itself is optional, and will be `None` if the point is missing + /// a value at the corresponding point and dimension. 
+ pub values: Vec<(Datum<'a>, MetricType)>, +} + +impl<'a> fmt::Display for Point<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + const TIMESTAMP_FMT: &str = "%Y-%m-%d %H:%M:%S.%f"; + match &self.start_time { + Some(start_time) => write!( + f, + "[{}, {}]: ", + start_time.format(TIMESTAMP_FMT), + self.timestamp.format(TIMESTAMP_FMT) + )?, + None => write!(f, "{}: ", self.timestamp.format(TIMESTAMP_FMT))?, + } + let values = self + .values + .iter() + .map(|(datum, _)| datum.to_string()) + .collect::>() + .join(","); + write!(f, "[{}]", values) + } +} + +impl<'a> Point<'a> { + /// Return the dimensionality of this point. + pub fn dimensionality(&self) -> usize { + self.values.len() + } +} + +/// A reference to a single datum of a multidimensional value. +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum Datum<'a> { + Boolean(Option), + Integer(Option<&'a i64>), + Double(Option<&'a f64>), + String(Option<&'a str>), + IntegerDistribution(Option<&'a Distribution>), + DoubleDistribution(Option<&'a Distribution>), +} + +impl<'a> fmt::Display for Datum<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Datum::Boolean(Some(inner)) => write!(f, "{}", inner), + Datum::Integer(Some(inner)) => write!(f, "{}", inner), + Datum::Double(Some(inner)) => write!(f, "{}", inner), + Datum::String(Some(inner)) => write!(f, "{}", inner), + Datum::IntegerDistribution(Some(inner)) => write!(f, "{}", inner), + Datum::DoubleDistribution(Some(inner)) => write!(f, "{}", inner), + Datum::Boolean(None) + | Datum::Integer(None) + | Datum::Double(None) + | Datum::String(None) + | Datum::IntegerDistribution(None) + | Datum::DoubleDistribution(None) => { + write!(f, "-") + } + } + } +} + +/// Timepoints and values for one timeseries. +// +// Invariants: +// +// The start_time and timestamp arrays must be the same length, or start_times +// must be None. +// +// The length of timestamps (and possibly start_times) must be the same as the +// length of _each element_ of the `values` array. That is, there are as many +// timestamps as data values. +// +// The length of `values` is the number of dimensions, and is always at least 1. +#[derive(Clone, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] +pub struct Points { + // The start time points for cumulative or delta metrics. + pub(super) start_times: Option>>, + // The timestamp of each value. + pub(super) timestamps: Vec>, + // The array of data values, one for each dimension. + pub(super) values: Vec, +} + +impl Points { + /// Construct an empty array of points to hold data of the provided type. + pub fn empty(data_type: DataType, metric_type: MetricType) -> Self { + Self::with_capacity( + 0, + std::iter::once(data_type), + std::iter::once(metric_type), + ) + .unwrap() + } + + // Return a mutable reference to the value array of the specified dimension, if any. + pub(super) fn values_mut(&mut self, dim: usize) -> Option<&mut ValueArray> { + self.values.get_mut(dim).map(|val| &mut val.values) + } + + /// Return a reference to the value array of the specified dimension, if any. + pub fn values(&self, dim: usize) -> Option<&ValueArray> { + self.values.get(dim).map(|val| &val.values) + } + + /// Return the dimensionality of the data points, i.e., the number of values + /// at each timestamp. + pub fn dimensionality(&self) -> usize { + self.values.len() + } + + /// Return the number of points in self. 
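+ ///
+ /// Every value dimension holds exactly as many entries as the timestamp
+ /// array, so the length of any one dimension is the number of points.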
+ pub fn len(&self) -> usize { + self.values[0].len() + } + + /// Construct an empty array of points to hold size data points of the + /// provided types. + /// + /// The type information may have length > 1 to reserve space for + /// multi-dimensional values. + pub fn with_capacity( + size: usize, + data_types: D, + metric_types: M, + ) -> Result + where + D: ExactSizeIterator, + M: ExactSizeIterator, + { + anyhow::ensure!( + data_types.len() == metric_types.len(), + "Data and metric type iterators must have the same length", + ); + let timestamps = Vec::with_capacity(size); + let mut start_times = None; + let mut values = Vec::with_capacity(data_types.len()); + for (data_type, metric_type) in data_types.zip(metric_types) { + if matches!(metric_type, MetricType::Delta | MetricType::Cumulative) + && start_times.is_none() + { + start_times.replace(Vec::with_capacity(size)); + } + values.push(Values::with_capacity(size, data_type, metric_type)); + } + Ok(Self { start_times, timestamps, values }) + } + + /// Return the data types of self. + pub fn data_types(&self) -> impl ExactSizeIterator + '_ { + self.values.iter().map(|val| val.values.data_type()) + } + + /// Return the metric types of self. + pub fn metric_types( + &self, + ) -> impl ExactSizeIterator + '_ { + self.values.iter().map(|val| val.metric_type) + } + + /// Return the single metric type of all values in self, it they are all the + /// same. + pub fn metric_type(&self) -> Option { + let mut types = self.metric_types(); + let Some(first_type) = types.next() else { + unreachable!(); + }; + if types.all(|ty| ty == first_type) { + Some(first_type) + } else { + None + } + } + + /// Construct a list of gauge points from a list of gauge measurements. + /// + /// An error is returned if the provided input measurements are not gauges, + /// or do not all have the same datum type. + pub fn gauge_from_gauge( + measurements: &[Measurement], + ) -> Result { + let Some(first) = measurements.first() else { + anyhow::bail!( + "Cannot construct points from empty measurements array" + ); + }; + let datum_type = first.datum_type(); + anyhow::ensure!( + !datum_type.is_cumulative(), + "Measurements are not gauges" + ); + let data_type = DataType::try_from(datum_type)?; + let mut self_ = Self::with_capacity( + measurements.len(), + std::iter::once(data_type), + std::iter::once(MetricType::Gauge), + )?; + + // Since we're directly pushing gauges, each measurement is independent + // of the others. Simply translate types and push the data. + for measurement in measurements.iter() { + anyhow::ensure!( + measurement.datum_type() == datum_type, + "Measurements must all have the same datum type", + ); + self_ + .values_mut(0) + .unwrap() + .push_value_from_datum(measurement.datum())?; + self_.timestamps.push(measurement.timestamp()); + } + Ok(self_) + } + + /// Construct a list of delta points from a list of cumulative measurements. + /// + /// An error is returned if the provided measurements are not of the same + /// type or not cumulative. 
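+ ///
+ /// Each output point is the difference between a measurement and the
+ /// previous non-missing measurement within the same epoch, i.e. with the
+ /// same start time. A measurement that begins a new epoch is pushed as-is
+ /// rather than differenced, and samples with overlapping time ranges
+ /// produce an error.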
+ pub fn delta_from_cumulative( + measurements: &[Measurement], + ) -> Result { + let mut iter = measurements.iter(); + let Some(first) = iter.next() else { + anyhow::bail!( + "Cannot construct points from empty measurements array" + ); + }; + let datum_type = first.datum_type(); + anyhow::ensure!( + datum_type.is_cumulative(), + "Measurements are not cumulative", + ); + let data_type = DataType::try_from(datum_type)?; + let mut self_ = Self::with_capacity( + measurements.len(), + std::iter::once(data_type), + std::iter::once(MetricType::Delta), + )?; + + // Construct the first point, which directly uses the start / end time + // of the first measurement itself. + self_.values_mut(0).unwrap().push_value_from_datum(first.datum())?; + self_.start_times.as_mut().unwrap().push(first.start_time().unwrap()); + self_.timestamps.push(first.timestamp()); + + // We need to keep track of the last cumulative measurement that's not + // _missing_, to compute successive differences between neighboring + // points. Note that we only need the datum from the measurement, + // because even missing samples have valid timestamp information. So we + // can always generate the timestamp for each delta, even if the datum + // is missing. + let mut last_datum = if first.is_missing() { + None + } else { + // Safety: We're confirming above the measurement is cumulative, and + // in this block if the datum is missing. So we know this conversion + // should succeed. + Some(CumulativeDatum::from_cumulative(first).unwrap()) + }; + + // We also need to keep track of the start time of this "epoch", periods + // where the cumulative data has the same start time. If there are jumps + // forward in this, and thus gaps in the records, we need to update the + // start_time of the epoch and also the last datum. + let mut epoch_start_time = first.start_time().unwrap(); + + // Push the remaining values. + for measurement in iter { + anyhow::ensure!( + measurement.datum_type() == datum_type, + "Measurements must all have the same datum type" + ); + + // For the time ranges we must have either: + // + // 1. Either the start time of the _first_ and new points must be + // equal, with the timestamp of the new strictly later than the + // timestamp of the last, OR + // 2. Both the start time and timestamp of the new point must be + // strictly later than the timestamp (and thus start time) of the + // last point. In this case, we effectively have a _gap_ in the + // timeseries, and so we need to update `first_start_time` to + // reflect this new epoch. + let last_start_time = + *self_.start_times.as_ref().unwrap().last().unwrap(); + let last_timestamp = *self_.timestamps.last().unwrap(); + let new_start_time = measurement.start_time().unwrap(); + let new_timestamp = measurement.timestamp(); + + if epoch_start_time == new_start_time + && last_timestamp < new_timestamp + { + // Push the timestamps to reflect this interval, from the end of + // the last sample to the end of this one. + self_.start_times.as_mut().unwrap().push(last_timestamp); + self_.timestamps.push(new_timestamp); + + // The data value is the difference between the last non-missing + // datum and the new datum. + self_.values_mut(0).unwrap().push_diff_from_last_to_datum( + &last_datum, + measurement.datum(), + data_type, + )?; + } else if new_start_time > last_timestamp + && new_timestamp > last_timestamp + { + // Push the new start time directly, since it begins a new + // epoch. 
+ self_.start_times.as_mut().unwrap().push(new_start_time); + self_.timestamps.push(new_timestamp); + + // Update the epoch start time, and also simply push the datum + // directly. The difference with the previous is not meaningful, + // since we've begun a new epoch. + epoch_start_time = new_start_time; + self_ + .values_mut(0) + .unwrap() + .push_value_from_datum(measurement.datum())?; + } else { + // Print as useful a message as we can here. + anyhow::bail!( + "Cannot compute a delta, the timestamp of the next \ + sample has a new start time, or overlaps with the \ + last processed sample. \n \ + epoch start time = {epoch_start_time}\n \ + last timestamp = [{last_start_time}, {last_timestamp}]\n \ + new timestamp = [{new_start_time}, {new_timestamp}]" + ); + } + + // If the new datum is _not_ missing, we'll update the last one. + if !measurement.is_missing() { + last_datum.replace( + CumulativeDatum::from_cumulative(measurement).unwrap(), + ); + } + } + Ok(self_) + } + + /// Iterate over each point in self. + pub fn iter_points(&self) -> impl Iterator> + '_ { + (0..self.len()).map(|i| Point { + start_time: self.start_times.as_ref().map(|s| &s[i]), + timestamp: &self.timestamps[i], + values: self + .values + .iter() + .map(|val| (val.values.get(i), val.metric_type)) + .collect(), + }) + } + + // Filter points in self to those where `to_keep` is true. + pub(crate) fn filter(&self, to_keep: Vec) -> Result { + anyhow::ensure!( + to_keep.len() == self.len(), + "Filter array must be the same length as self", + ); + + // Compute the indices of values we're keeping. + let indices: Vec<_> = to_keep + .iter() + .enumerate() + .filter(|(_ix, to_keep)| **to_keep) + .map(|(ix, _)| ix) + .collect(); + let n_true = indices.len(); + let mut out = Self::with_capacity( + n_true, + self.data_types(), + self.metric_types(), + )?; + + // Push the compressed start times, if any. + if let Some(start_times) = self.start_times.as_ref() { + let Some(new_start_times) = out.start_times.as_mut() else { + unreachable!(); + }; + for ix in indices.iter().copied() { + new_start_times.push(start_times[ix]); + } + } + + // Push the compressed timestamps. + for ix in indices.iter().copied() { + out.timestamps.push(self.timestamps[ix]); + } + + // Push each dimension of the data values themselves. + for (new_values, existing_values) in + out.values.iter_mut().zip(self.values.iter()) + { + match (&mut new_values.values, &existing_values.values) { + (ValueArray::Integer(new), ValueArray::Integer(existing)) => { + for ix in indices.iter().copied() { + new.push(existing[ix]); + } + } + (ValueArray::Double(new), ValueArray::Double(existing)) => { + for ix in indices.iter().copied() { + new.push(existing[ix]); + } + } + (ValueArray::Boolean(new), ValueArray::Boolean(existing)) => { + for ix in indices.iter().copied() { + new.push(existing[ix]); + } + } + (ValueArray::String(new), ValueArray::String(existing)) => { + for ix in indices.iter().copied() { + new.push(existing[ix].clone()); + } + } + ( + ValueArray::IntegerDistribution(new), + ValueArray::IntegerDistribution(existing), + ) => { + for ix in indices.iter().copied() { + new.push(existing[ix].clone()); + } + } + ( + ValueArray::DoubleDistribution(new), + ValueArray::DoubleDistribution(existing), + ) => { + for ix in indices.iter().copied() { + new.push(existing[ix].clone()); + } + } + (_, _) => unreachable!(), + } + } + Ok(out) + } + + // Return a new set of points, with the values casted to the provided types. 
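+ //
+ // Scalar types can generally be converted among one another: numeric casts
+ // go through `ToPrimitive` and fail on out-of-range values, strings are
+ // parsed, and booleans follow the truthiness rules used below (non-zero
+ // numbers and non-empty strings are true). Distributions can only be
+ // "cast" to their own type, and missing values stay missing. `types` must
+ // contain one entry per dimension. For example (hypothetical data):
+ //
+ //   // Turn a boolean gauge into 0/1 integers.
+ //   let as_ints = points.cast(&[DataType::Integer])?;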
+ pub(crate) fn cast(&self, types: &[DataType]) -> Result { + anyhow::ensure!( + types.len() == self.dimensionality(), + "Cannot cast to {} types, the data has dimensionality {}", + types.len(), + self.dimensionality(), + ); + let start_times = self.start_times.clone(); + let timestamps = self.timestamps.clone(); + let mut new_values = Vec::with_capacity(self.dimensionality()); + for (new_type, existing_values) in types.iter().zip(self.values.iter()) + { + let values = match (new_type, &existing_values.values) { + // "Cast" from i64 -> i64 + (DataType::Integer, ValueArray::Integer(vals)) => { + ValueArray::Integer(vals.clone()) + } + + // Cast f64 -> i64 + (DataType::Integer, ValueArray::Double(doubles)) => { + let mut new = Vec::with_capacity(doubles.len()); + for maybe_double in doubles.iter().copied() { + if let Some(d) = maybe_double { + let as_int = d + .to_i64() + .context("Cannot cast double {d} to i64")?; + new.push(Some(as_int)); + } else { + new.push(None); + } + } + ValueArray::Integer(new) + } + + // Cast bool -> i64 + (DataType::Integer, ValueArray::Boolean(bools)) => { + ValueArray::Integer( + bools + .iter() + .copied() + .map(|b| b.map(i64::from)) + .collect(), + ) + } + + // Cast string -> i64, by parsing. + (DataType::Integer, ValueArray::String(strings)) => { + let mut new = Vec::with_capacity(strings.len()); + for maybe_str in strings.iter() { + if let Some(s) = maybe_str { + let as_int = s + .parse() + .context("Cannot cast string '{s}' to i64")?; + new.push(Some(as_int)); + } else { + new.push(None); + } + } + ValueArray::Integer(new) + } + + // Cast i64 -> f64 + (DataType::Double, ValueArray::Integer(ints)) => { + let mut new = Vec::with_capacity(ints.len()); + for maybe_int in ints.iter().copied() { + if let Some(int) = maybe_int { + let as_double = int.to_f64().context( + "Cannot cast integer {int} as double", + )?; + new.push(Some(as_double)); + } else { + new.push(None); + } + } + ValueArray::Double(new) + } + + // "Cast" f64 -> f64 + (DataType::Double, ValueArray::Double(vals)) => { + ValueArray::Double(vals.clone()) + } + + // Cast bool -> f64 + (DataType::Double, ValueArray::Boolean(bools)) => { + ValueArray::Double( + bools + .iter() + .copied() + .map(|b| b.map(f64::from)) + .collect(), + ) + } + + // Cast string -> f64, by parsing. + (DataType::Double, ValueArray::String(strings)) => { + let mut new = Vec::with_capacity(strings.len()); + for maybe_str in strings.iter() { + if let Some(s) = maybe_str { + let as_double = s + .parse() + .context("Cannot cast string '{s}' to f64")?; + new.push(Some(as_double)); + } else { + new.push(None); + } + } + ValueArray::Double(new) + } + + // Cast i64 -> bool + // + // Any non-zero value is considered truthy. + (DataType::Boolean, ValueArray::Integer(ints)) => { + let mut new = Vec::with_capacity(ints.len()); + for maybe_int in ints.iter().copied() { + match maybe_int { + Some(0) => new.push(Some(false)), + Some(_) => new.push(Some(true)), + None => new.push(None), + } + } + ValueArray::Boolean(new) + } + + // Cast f64 -> bool + // + // Any non-zero value is considered truthy. 
+ (DataType::Boolean, ValueArray::Double(doubles)) => { + let mut new = Vec::with_capacity(doubles.len()); + for maybe_double in doubles.iter().copied() { + match maybe_double { + Some(d) if d == 0.0 => new.push(Some(false)), + Some(_) => new.push(Some(true)), + None => new.push(None), + } + } + ValueArray::Boolean(new) + } + + // "Cast" bool -> bool + (DataType::Boolean, ValueArray::Boolean(vals)) => { + ValueArray::Boolean(vals.clone()) + } + + // Cast string -> bool. + // + // Any non-empty string is considered truthy + (DataType::Boolean, ValueArray::String(strings)) => { + let mut new = Vec::with_capacity(strings.len()); + for maybe_str in strings.iter() { + match maybe_str { + Some(s) if s.is_empty() => new.push(Some(false)), + Some(_) => new.push(Some(true)), + None => new.push(None), + } + } + ValueArray::Boolean(new) + } + + // Cast i64 -> string + (DataType::String, ValueArray::Integer(ints)) => { + ValueArray::String( + ints.iter().map(|x| x.map(|x| x.to_string())).collect(), + ) + } + + // Cast f64 -> string + (DataType::String, ValueArray::Double(doubles)) => { + ValueArray::String( + doubles + .iter() + .map(|x| x.map(|x| x.to_string())) + .collect(), + ) + } + + // Cast bool -> string + (DataType::String, ValueArray::Boolean(bools)) => { + ValueArray::String( + bools + .iter() + .map(|x| x.map(|x| x.to_string())) + .collect(), + ) + } + + // "Cast" string -> string + (DataType::String, ValueArray::String(vals)) => { + ValueArray::String(vals.clone()) + } + + // "Cast" distributions to the same type of distribution + ( + DataType::IntegerDistribution, + ValueArray::IntegerDistribution(vals), + ) => ValueArray::IntegerDistribution(vals.clone()), + ( + DataType::DoubleDistribution, + ValueArray::DoubleDistribution(vals), + ) => ValueArray::DoubleDistribution(vals.clone()), + + // All other casts are invalid + (_, vals) => anyhow::bail!( + "Cannot cast {} -> {}", + new_type, + vals.data_type(), + ), + }; + new_values.push(Values { + values, + metric_type: existing_values.metric_type, + }); + } + Ok(Self { start_times, timestamps, values: new_values }) + } + + /// Return true if self contains no data points. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +/// List of data values for one timeseries. +/// +/// Each element is an option, where `None` represents a missing sample. +#[derive(Clone, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "snake_case", tag = "type", content = "values")] +pub enum ValueArray { + Integer(Vec>), + Double(Vec>), + Boolean(Vec>), + String(Vec>), + IntegerDistribution(Vec>>), + DoubleDistribution(Vec>>), +} + +impl ValueArray { + // Create an empty array with capacity `size` of the provided data type. + fn with_capacity(size: usize, data_type: DataType) -> Self { + match data_type { + DataType::Integer => Self::Integer(Vec::with_capacity(size)), + DataType::Double => Self::Double(Vec::with_capacity(size)), + DataType::Boolean => Self::Boolean(Vec::with_capacity(size)), + DataType::String => Self::String(Vec::with_capacity(size)), + DataType::IntegerDistribution => { + Self::IntegerDistribution(Vec::with_capacity(size)) + } + DataType::DoubleDistribution => { + Self::DoubleDistribution(Vec::with_capacity(size)) + } + } + } + + // Return the data type in self. 
+ pub(super) fn data_type(&self) -> DataType { + match self { + ValueArray::Integer(_) => DataType::Integer, + ValueArray::Double(_) => DataType::Double, + ValueArray::Boolean(_) => DataType::Boolean, + ValueArray::String(_) => DataType::String, + ValueArray::IntegerDistribution(_) => DataType::IntegerDistribution, + ValueArray::DoubleDistribution(_) => DataType::DoubleDistribution, + } + } + + // Access the inner array of booleans, if possible. + pub(super) fn as_boolean_mut( + &mut self, + ) -> Result<&mut Vec>, Error> { + let ValueArray::Boolean(inner) = self else { + anyhow::bail!( + "Cannot access value array as boolean type, it has type {}", + self.data_type(), + ); + }; + Ok(inner) + } + + /// Access the values as an array of bools, if they have that type. + pub fn as_boolean(&self) -> Result<&Vec>, Error> { + let ValueArray::Boolean(inner) = self else { + anyhow::bail!( + "Cannot access value array as boolean type, it has type {}", + self.data_type(), + ); + }; + Ok(inner) + } + + /// Access the values as an array of integers, if they have that type. + pub fn as_integer(&self) -> Result<&Vec>, Error> { + let ValueArray::Integer(inner) = self else { + anyhow::bail!( + "Cannot access value array as integer type, it has type {}", + self.data_type(), + ); + }; + Ok(inner) + } + + // Access the inner array of integers, if possible. + pub(super) fn as_integer_mut( + &mut self, + ) -> Result<&mut Vec>, Error> { + let ValueArray::Integer(inner) = self else { + anyhow::bail!( + "Cannot access value array as integer type, it has type {}", + self.data_type(), + ); + }; + Ok(inner) + } + + /// Access the values as an array of doubles, if they have that type. + pub fn as_double(&self) -> Result<&Vec>, Error> { + let ValueArray::Double(inner) = self else { + anyhow::bail!( + "Cannot access value array as double type, it has type {}", + self.data_type(), + ); + }; + Ok(inner) + } + + // Access the inner array of doubles, if possible. + pub(super) fn as_double_mut( + &mut self, + ) -> Result<&mut Vec>, Error> { + let ValueArray::Double(inner) = self else { + anyhow::bail!( + "Cannot access value array as double type, it has type {}", + self.data_type(), + ); + }; + Ok(inner) + } + + /// Access the values as an array of strings, if they have that type. + pub fn as_string(&self) -> Result<&Vec>, Error> { + let ValueArray::String(inner) = self else { + anyhow::bail!( + "Cannot access value array as string type, it has type {}", + self.data_type(), + ); + }; + Ok(inner) + } + + // Access the inner array of strings, if possible. + pub(super) fn as_string_mut( + &mut self, + ) -> Result<&mut Vec>, Error> { + let ValueArray::String(inner) = self else { + anyhow::bail!( + "Cannot access value array as string type, it has type {}", + self.data_type(), + ); + }; + Ok(inner) + } + + /// Access the values as an array of integer distribution, if they have that + /// type. + pub fn as_integer_distribution( + &self, + ) -> Result<&Vec>>, Error> { + let ValueArray::IntegerDistribution(inner) = self else { + anyhow::bail!( + "Cannot access value array as integer \ + distribution type, it has type {}", + self.data_type(), + ); + }; + Ok(inner) + } + + // Access the inner array of integer distribution, if possible. 
+ pub(super) fn as_integer_distribution_mut( + &mut self, + ) -> Result<&mut Vec>>, Error> { + let ValueArray::IntegerDistribution(inner) = self else { + anyhow::bail!( + "Cannot access value array as integer \ + distribution type, it has type {}", + self.data_type(), + ); + }; + Ok(inner) + } + + /// Access the values as an array of double distribution, if they have that + /// type. + pub fn as_double_distribution( + &self, + ) -> Result<&Vec>>, Error> { + let ValueArray::DoubleDistribution(inner) = self else { + anyhow::bail!( + "Cannot access value array as double \ + distribution type, it has type {}", + self.data_type(), + ); + }; + Ok(inner) + } + + // Access the inner array of double distributions, if possible. + pub(super) fn as_double_distribution_mut( + &mut self, + ) -> Result<&mut Vec>>, Error> { + let ValueArray::DoubleDistribution(inner) = self else { + anyhow::bail!( + "Cannot access value array as double \ + distribution type, it has type {}", + self.data_type(), + ); + }; + Ok(inner) + } + + fn push_missing(&mut self, datum_type: DatumType) -> Result<(), Error> { + match datum_type { + DatumType::Bool => self.as_boolean_mut()?.push(None), + DatumType::I8 + | DatumType::U8 + | DatumType::I16 + | DatumType::U16 + | DatumType::I32 + | DatumType::U32 + | DatumType::I64 + | DatumType::U64 + | DatumType::CumulativeI64 + | DatumType::CumulativeU64 => self.as_integer_mut()?.push(None), + DatumType::F32 + | DatumType::F64 + | DatumType::CumulativeF32 + | DatumType::CumulativeF64 => self.as_double_mut()?.push(None), + DatumType::String => self.as_string_mut()?.push(None), + DatumType::Bytes => { + anyhow::bail!("Bytes data types are not yet supported") + } + DatumType::HistogramI8 + | DatumType::HistogramU8 + | DatumType::HistogramI16 + | DatumType::HistogramU16 + | DatumType::HistogramI32 + | DatumType::HistogramU32 + | DatumType::HistogramI64 + | DatumType::HistogramU64 => { + self.as_integer_distribution_mut()?.push(None) + } + DatumType::HistogramF32 | DatumType::HistogramF64 => { + self.as_double_distribution_mut()?.push(None) + } + } + Ok(()) + } + + // Push a value directly from a datum, without modification. 
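+ //
+ // "Without modification" means no differencing is applied; narrower integer
+ // types are still widened to i64, `u64` and `CumulativeU64` values are
+ // checked into i64 (failing on overflow), `f32` is widened to f64, and a
+ // missing datum pushes `None` for the corresponding type.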
+ fn push_value_from_datum( + &mut self, + datum: &oximeter::Datum, + ) -> Result<(), Error> { + match datum { + oximeter::Datum::Bool(b) => self.as_boolean_mut()?.push(Some(*b)), + oximeter::Datum::I8(i) => { + self.as_integer_mut()?.push(Some(i64::from(*i))) + } + oximeter::Datum::U8(i) => { + self.as_integer_mut()?.push(Some(i64::from(*i))) + } + oximeter::Datum::I16(i) => { + self.as_integer_mut()?.push(Some(i64::from(*i))) + } + oximeter::Datum::U16(i) => { + self.as_integer_mut()?.push(Some(i64::from(*i))) + } + oximeter::Datum::I32(i) => { + self.as_integer_mut()?.push(Some(i64::from(*i))) + } + oximeter::Datum::U32(i) => { + self.as_integer_mut()?.push(Some(i64::from(*i))) + } + oximeter::Datum::I64(i) => self.as_integer_mut()?.push(Some(*i)), + oximeter::Datum::U64(i) => { + let i = + i.to_i64().context("Failed to convert u64 datum to i64")?; + self.as_integer_mut()?.push(Some(i)); + } + oximeter::Datum::F32(f) => { + self.as_double_mut()?.push(Some(f64::from(*f))) + } + oximeter::Datum::F64(f) => self.as_double_mut()?.push(Some(*f)), + oximeter::Datum::String(s) => { + self.as_string_mut()?.push(Some(s.clone())) + } + oximeter::Datum::Bytes(_) => { + anyhow::bail!("Bytes data types are not yet supported") + } + oximeter::Datum::CumulativeI64(c) => { + self.as_integer_mut()?.push(Some(c.value())) + } + oximeter::Datum::CumulativeU64(c) => { + let c = c + .value() + .to_i64() + .context("Failed to convert u64 datum to i64")?; + self.as_integer_mut()?.push(Some(c)); + } + oximeter::Datum::CumulativeF32(c) => { + self.as_double_mut()?.push(Some(f64::from(c.value()))) + } + oximeter::Datum::CumulativeF64(c) => { + self.as_double_mut()?.push(Some(c.value())) + } + oximeter::Datum::HistogramI8(h) => self + .as_integer_distribution_mut()? + .push(Some(Distribution::from(h))), + oximeter::Datum::HistogramU8(h) => self + .as_integer_distribution_mut()? + .push(Some(Distribution::from(h))), + oximeter::Datum::HistogramI16(h) => self + .as_integer_distribution_mut()? + .push(Some(Distribution::from(h))), + oximeter::Datum::HistogramU16(h) => self + .as_integer_distribution_mut()? + .push(Some(Distribution::from(h))), + oximeter::Datum::HistogramI32(h) => self + .as_integer_distribution_mut()? + .push(Some(Distribution::from(h))), + oximeter::Datum::HistogramU32(h) => self + .as_integer_distribution_mut()? + .push(Some(Distribution::from(h))), + oximeter::Datum::HistogramI64(h) => self + .as_integer_distribution_mut()? + .push(Some(Distribution::from(h))), + oximeter::Datum::HistogramU64(h) => self + .as_integer_distribution_mut()? + .push(Some(Distribution::try_from(h)?)), + oximeter::Datum::HistogramF32(h) => self + .as_double_distribution_mut()? + .push(Some(Distribution::from(h))), + oximeter::Datum::HistogramF64(h) => self + .as_double_distribution_mut()? + .push(Some(Distribution::from(h))), + oximeter::Datum::Missing(missing) => { + self.push_missing(missing.datum_type())? + } + } + Ok(()) + } + + // Push a delta from the last valid datum and a new one. + // + // This takes the last valid datum, if any, and a new one. It computes the + // delta between the the values of the datum, if possible, and pushes it + // onto the correct value array inside `self`. + // + // If both the last datum and new one exist (are not missing), the normal + // diff is pushed. If the last datum is missing, but the new one exists, + // then the new value is pushed directly. If the last datum exists but the + // new one does not, then a missing datum is pushed. 
If both are missing, + // then a missing one is pushed as well. + // + // In other words, the diff is always between the new datum and the last + // non-None value. If such a last value does not exist, the datum is + // inserted directly. + fn push_diff_from_last_to_datum( + &mut self, + last_datum: &Option, + new_datum: &oximeter::Datum, + data_type: DataType, + ) -> Result<(), Error> { + match (last_datum.as_ref(), new_datum.is_missing()) { + (None, true) | (Some(_), true) => { + // In this case, either both values are missing, or just the new + // one is. In either case, we cannot compute a new value, and + // need to insert None to represent the new missing datum. + match data_type { + DataType::Integer => self.as_integer_mut()?.push(None), + DataType::Double => self.as_double_mut()?.push(None), + DataType::Boolean => self.as_boolean_mut()?.push(None), + DataType::String => self.as_string_mut()?.push(None), + DataType::IntegerDistribution => { + self.as_integer_distribution_mut()?.push(None) + } + DataType::DoubleDistribution => { + self.as_double_distribution_mut()?.push(None) + } + } + } + (None, false) => { + // The last datum was missing, but the new one is not. We cannot + // compute the difference, since we have no previous point. + // However, we can still push some value by inserting the datum + // directly. + self.push_value_from_datum(new_datum)?; + } + (Some(last_datum), false) => { + // Both values exist, so we can compute the difference between + // them and insert that. + // + // Note that we're asserting both are the same _datum_ type, + // which is guaranteed by a check in the caller. + match (last_datum, new_datum) { + ( + CumulativeDatum::Integer(last), + oximeter::Datum::I8(new), + ) => { + let new = i64::from(*new); + self.as_integer_mut()?.push(Some(new - last)); + } + ( + CumulativeDatum::Integer(last), + oximeter::Datum::U8(new), + ) => { + let new = i64::from(*new); + self.as_integer_mut()?.push(Some(new - last)); + } + ( + CumulativeDatum::Integer(last), + oximeter::Datum::I16(new), + ) => { + let new = i64::from(*new); + self.as_integer_mut()?.push(Some(new - last)); + } + ( + CumulativeDatum::Integer(last), + oximeter::Datum::U16(new), + ) => { + let new = i64::from(*new); + self.as_integer_mut()?.push(Some(new - last)); + } + ( + CumulativeDatum::Integer(last), + oximeter::Datum::I32(new), + ) => { + let new = i64::from(*new); + self.as_integer_mut()?.push(Some(new - last)); + } + ( + CumulativeDatum::Integer(last), + oximeter::Datum::U32(new), + ) => { + let new = i64::from(*new); + self.as_integer_mut()?.push(Some(new - last)); + } + ( + CumulativeDatum::Integer(last), + oximeter::Datum::I64(new), + ) => { + let diff = new + .checked_sub(*last) + .context("Overflow computing deltas")?; + self.as_integer_mut()?.push(Some(diff)); + } + ( + CumulativeDatum::Integer(last), + oximeter::Datum::U64(new), + ) => { + let new = new + .to_i64() + .context("Failed to convert u64 datum to i64")?; + let diff = new + .checked_sub(*last) + .context("Overflow computing deltas")?; + self.as_integer_mut()?.push(Some(diff)); + } + ( + CumulativeDatum::Double(last), + oximeter::Datum::F32(new), + ) => { + self.as_double_mut()? 
+ .push(Some(f64::from(*new) - last)); + } + ( + CumulativeDatum::Double(last), + oximeter::Datum::F64(new), + ) => { + self.as_double_mut()?.push(Some(new - last)); + } + ( + CumulativeDatum::Integer(last), + oximeter::Datum::CumulativeI64(new), + ) => { + let new = new.value(); + let diff = new + .checked_sub(*last) + .context("Overflow computing deltas")?; + self.as_integer_mut()?.push(Some(diff)); + } + ( + CumulativeDatum::Integer(last), + oximeter::Datum::CumulativeU64(new), + ) => { + let new = new + .value() + .to_i64() + .context("Failed to convert u64 datum to i64")?; + let diff = new + .checked_sub(*last) + .context("Overflow computing deltas")?; + self.as_integer_mut()?.push(Some(diff)); + } + ( + CumulativeDatum::Double(last), + oximeter::Datum::CumulativeF32(new), + ) => { + self.as_double_mut()? + .push(Some(f64::from(new.value()) - last)); + } + ( + CumulativeDatum::Double(last), + oximeter::Datum::CumulativeF64(new), + ) => { + self.as_double_mut()?.push(Some(new.value() - last)); + } + ( + CumulativeDatum::IntegerDistribution(last), + oximeter::Datum::HistogramI8(new), + ) => { + let new = Distribution::from(new); + self.as_integer_distribution_mut()? + .push(Some(new.checked_sub(&last)?)); + } + ( + CumulativeDatum::IntegerDistribution(last), + oximeter::Datum::HistogramU8(new), + ) => { + let new = Distribution::from(new); + self.as_integer_distribution_mut()? + .push(Some(new.checked_sub(&last)?)); + } + ( + CumulativeDatum::IntegerDistribution(last), + oximeter::Datum::HistogramI16(new), + ) => { + let new = Distribution::from(new); + self.as_integer_distribution_mut()? + .push(Some(new.checked_sub(&last)?)); + } + ( + CumulativeDatum::IntegerDistribution(last), + oximeter::Datum::HistogramU16(new), + ) => { + let new = Distribution::from(new); + self.as_integer_distribution_mut()? + .push(Some(new.checked_sub(&last)?)); + } + ( + CumulativeDatum::IntegerDistribution(last), + oximeter::Datum::HistogramI32(new), + ) => { + let new = Distribution::from(new); + self.as_integer_distribution_mut()? + .push(Some(new.checked_sub(&last)?)); + } + ( + CumulativeDatum::IntegerDistribution(last), + oximeter::Datum::HistogramU32(new), + ) => { + let new = Distribution::from(new); + self.as_integer_distribution_mut()? + .push(Some(new.checked_sub(&last)?)); + } + ( + CumulativeDatum::IntegerDistribution(last), + oximeter::Datum::HistogramI64(new), + ) => { + let new = Distribution::from(new); + self.as_integer_distribution_mut()? + .push(Some(new.checked_sub(&last)?)); + } + ( + CumulativeDatum::IntegerDistribution(last), + oximeter::Datum::HistogramU64(new), + ) => { + let new = Distribution::try_from(new)?; + self.as_integer_distribution_mut()? + .push(Some(new.checked_sub(&last)?)); + } + ( + CumulativeDatum::DoubleDistribution(last), + oximeter::Datum::HistogramF32(new), + ) => { + let new = Distribution::from(new); + self.as_double_distribution_mut()? + .push(Some(new.checked_sub(&last)?)); + } + ( + CumulativeDatum::DoubleDistribution(last), + oximeter::Datum::HistogramF64(new), + ) => { + let new = Distribution::from(new); + self.as_double_distribution_mut()? + .push(Some(new.checked_sub(&last)?)); + } + (_, _) => unreachable!(), + } + } + } + Ok(()) + } + + // Return the number of samples in self. 
+ fn len(&self) -> usize { + match self { + ValueArray::Boolean(inner) => inner.len(), + ValueArray::Integer(inner) => inner.len(), + ValueArray::Double(inner) => inner.len(), + ValueArray::String(inner) => inner.len(), + ValueArray::IntegerDistribution(inner) => inner.len(), + ValueArray::DoubleDistribution(inner) => inner.len(), + } + } + + // Return a reference to the i-th value in the array. + // + // This panics if `i >= self.len()`. + fn get(&self, i: usize) -> Datum<'_> { + match self { + ValueArray::Boolean(inner) => Datum::Boolean(inner[i]), + ValueArray::Integer(inner) => { + Datum::Integer(inner.get(i).unwrap().as_ref()) + } + ValueArray::Double(inner) => { + Datum::Double(inner.get(i).unwrap().as_ref()) + } + ValueArray::String(inner) => { + Datum::String(inner.get(i).unwrap().as_deref()) + } + ValueArray::IntegerDistribution(inner) => { + Datum::IntegerDistribution(inner.get(i).unwrap().as_ref()) + } + ValueArray::DoubleDistribution(inner) => { + Datum::DoubleDistribution(inner.get(i).unwrap().as_ref()) + } + } + } + + // Swap the value in self with other, asserting they're the same type. + pub(crate) fn swap(&mut self, mut values: ValueArray) { + use std::mem::swap; + match (self, &mut values) { + (ValueArray::Integer(x), ValueArray::Integer(y)) => swap(x, y), + (ValueArray::Double(x), ValueArray::Double(y)) => swap(x, y), + (ValueArray::Boolean(x), ValueArray::Boolean(y)) => swap(x, y), + (ValueArray::String(x), ValueArray::String(y)) => swap(x, y), + ( + ValueArray::IntegerDistribution(x), + ValueArray::IntegerDistribution(y), + ) => swap(x, y), + ( + ValueArray::DoubleDistribution(x), + ValueArray::DoubleDistribution(y), + ) => swap(x, y), + (_, _) => panic!("Cannot swap values of different types"), + } + } +} + +mod private { + pub trait Sealed {} + impl Sealed for i64 {} + impl Sealed for f64 {} +} + +pub trait DistributionSupport: + fmt::Display + Clone + Copy + fmt::Debug + PartialEq + private::Sealed +{ +} +impl DistributionSupport for i64 {} +impl DistributionSupport for f64 {} + +/// A distribution is a sequence of bins and counts in those bins. +#[derive(Clone, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] +#[schemars(rename = "Distribution{T}")] +pub struct Distribution { + bins: Vec, + counts: Vec, +} + +impl fmt::Display for Distribution { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let elems = self + .bins + .iter() + .zip(self.counts.iter()) + .map(|(bin, count)| format!("{bin}: {count}")) + .collect::>() + .join(", "); + write!(f, "{}", elems) + } +} + +impl Distribution { + // Subtract two distributions, checking that they have the same bins. + fn checked_sub( + &self, + rhs: &Distribution, + ) -> Result, Error> { + anyhow::ensure!( + self.bins == rhs.bins, + "Cannot subtract distributions with different bins", + ); + let counts = self + .counts + .iter() + .zip(rhs.counts.iter().copied()) + .map(|(x, y)| x.checked_sub(y)) + .collect::>() + .context("Underflow subtracting distributions values")?; + Ok(Self { bins: self.bins.clone(), counts }) + } + + /// Return the slice of bins. + pub fn bins(&self) -> &[T] { + &self.bins + } + + /// Return the slice of counts. + pub fn counts(&self) -> &[u64] { + &self.counts + } + + /// Return an iterator over each bin and count. + pub fn iter(&self) -> impl ExactSizeIterator + '_ { + self.bins.iter().zip(self.counts.iter()) + } +} + +macro_rules! 
i64_dist_from { + ($t:ty) => { + impl From<&oximeter::histogram::Histogram<$t>> for Distribution { + fn from(hist: &oximeter::histogram::Histogram<$t>) -> Self { + let (bins, counts) = hist.to_arrays(); + Self { bins: bins.into_iter().map(i64::from).collect(), counts } + } + } + + impl From<&oximeter::histogram::Histogram<$t>> for CumulativeDatum { + fn from(hist: &oximeter::histogram::Histogram<$t>) -> Self { + CumulativeDatum::IntegerDistribution(hist.into()) + } + } + }; +} + +i64_dist_from!(i8); +i64_dist_from!(u8); +i64_dist_from!(i16); +i64_dist_from!(u16); +i64_dist_from!(i32); +i64_dist_from!(u32); +i64_dist_from!(i64); + +impl TryFrom<&oximeter::histogram::Histogram> for Distribution { + type Error = Error; + fn try_from( + hist: &oximeter::histogram::Histogram, + ) -> Result { + let (bins, counts) = hist.to_arrays(); + let bins = bins + .into_iter() + .map(i64::try_from) + .collect::>() + .context("Overflow converting u64 to i64")?; + Ok(Self { bins, counts }) + } +} + +impl TryFrom<&oximeter::histogram::Histogram> for CumulativeDatum { + type Error = Error; + fn try_from( + hist: &oximeter::histogram::Histogram, + ) -> Result { + hist.try_into().map(CumulativeDatum::IntegerDistribution) + } +} + +macro_rules! f64_dist_from { + ($t:ty) => { + impl From<&oximeter::histogram::Histogram<$t>> for Distribution { + fn from(hist: &oximeter::histogram::Histogram<$t>) -> Self { + let (bins, counts) = hist.to_arrays(); + Self { bins: bins.into_iter().map(f64::from).collect(), counts } + } + } + + impl From<&oximeter::histogram::Histogram<$t>> for CumulativeDatum { + fn from(hist: &oximeter::histogram::Histogram<$t>) -> Self { + CumulativeDatum::DoubleDistribution(hist.into()) + } + } + }; +} + +f64_dist_from!(f32); +f64_dist_from!(f64); + +#[cfg(test)] +mod tests { + use crate::oxql::point::{DataType, ValueArray}; + + use super::{Distribution, MetricType, Points, Values}; + use chrono::{DateTime, Utc}; + use oximeter::types::Cumulative; + use oximeter::Measurement; + use std::time::Duration; + + #[test] + fn test_point_delta_between() { + let mut datum = Cumulative::new(2i64); + let now = Utc::now(); + let meas0 = Measurement::new(now + Duration::from_secs(1), datum); + datum.set(10i64); + let meas1 = Measurement::new(now + Duration::from_secs(2), datum); + let measurements = vec![meas0.clone(), meas1.clone()]; + let points = Points::delta_from_cumulative(&measurements).unwrap(); + + assert_eq!(points.len(), 2); + assert_eq!( + points.values(0).unwrap().as_integer().unwrap(), + &[Some(2i64), Some(8)], + ); + assert_eq!( + Duration::from_secs(1), + (points.timestamps[1] - points.timestamps[0]).to_std().unwrap(), + ); + let expected = vec![now, meas0.timestamp()]; + let actual = points.start_times.as_ref().unwrap(); + assert_eq!(expected.len(), actual.len()); + for (x, y) in expected.into_iter().zip(actual.into_iter()) { + assert!((*y - x).num_nanoseconds().unwrap() <= 1); + } + } + + #[test] + fn test_point_delta_between_with_new_epoch() { + let datum = Cumulative::new(2i64); + let now = Utc::now(); + let meas0 = Measurement::new(now + Duration::from_secs(1), datum); + + // Create a new datum, with a completely new start time, representing a + // new epoch. 
+ let now = Utc::now() + Duration::from_secs(10); + let datum = Cumulative::with_start_time(now, 10i64); + let meas1 = Measurement::new(now + Duration::from_secs(2), datum); + let measurements = vec![meas0.clone(), meas1.clone()]; + let points = Points::delta_from_cumulative(&measurements).unwrap(); + + // The second point should not be referenced to the first, because + // they're in different epochs. + assert_eq!(points.len(), 2); + assert_eq!( + points.values(0).unwrap().as_integer().unwrap(), + &[Some(2i64), Some(10)], + ); + + // The start times should be the start times of the measurements + // themselves as well. Same for timestamps. + assert_eq!( + points.timestamps, + vec![meas0.timestamp(), meas1.timestamp()], + ); + assert_eq!( + points.start_times.as_ref().unwrap(), + &[meas0.start_time().unwrap(), meas1.start_time().unwrap()], + ); + } + + #[test] + fn test_point_delta_between_overlapping_time_ranges() { + // These data points start at `T` and `T + 100ms` respectively, and end + // at those times + 1s. That means their time ranges overlap, and so we + // can't compute a delta from them. + let start_time = Utc::now() - Duration::from_secs(1); + let datum1 = Cumulative::with_start_time(start_time, 1i64); + let datum2 = Cumulative::with_start_time( + start_time + Duration::from_millis(100), + 10i64, + ); + let meas1 = Measurement::new( + datum1.start_time() + Duration::from_secs(1), + datum1, + ); + let meas2 = Measurement::new( + datum2.start_time() + Duration::from_secs(1), + datum2, + ); + + assert!( + Points::delta_from_cumulative(&[meas1.clone(), meas2.clone()]) + .is_err(), + "Should not be able to compute a delta point \ + between two measuremenst with overlapping start \ + times: [{}, {}] and [{}, {}]", + meas1.start_time().unwrap(), + meas1.timestamp(), + meas2.start_time().unwrap(), + meas2.timestamp(), + ); + } + + fn timestamps(n: usize) -> Vec> { + let now = Utc::now(); + let mut out = Vec::with_capacity(n); + for i in 0..n { + out.push(now - Duration::from_secs(i as _)); + } + out.into_iter().rev().collect() + } + + #[test] + fn test_cast_points_from_bool() { + let points = Points { + start_times: None, + timestamps: timestamps(2), + values: vec![Values { + values: ValueArray::Boolean(vec![Some(false), Some(true)]), + metric_type: MetricType::Gauge, + }], + }; + + let as_same = points.cast(&[DataType::Boolean]).unwrap(); + let vals = as_same.values[0].values.as_boolean().unwrap(); + assert_eq!(vals, points.values[0].values.as_boolean().unwrap()); + + let as_int = points.cast(&[DataType::Integer]).unwrap(); + let vals = as_int.values[0].values.as_integer().unwrap(); + assert_eq!(vals, &vec![Some(0), Some(1)]); + + let as_double = points.cast(&[DataType::Double]).unwrap(); + let vals = as_double.values[0].values.as_double().unwrap(); + assert_eq!(vals, &vec![Some(0.0), Some(1.0)]); + + let as_string = points.cast(&[DataType::String]).unwrap(); + let vals = as_string.values[0].values.as_string().unwrap(); + assert_eq!( + vals, + &vec![Some("false".to_string()), Some("true".to_string())] + ); + + for ty in [DataType::IntegerDistribution, DataType::DoubleDistribution] + { + assert!( + points.cast(&[ty]).is_err(), + "Should not be able to cast bool array to distributions" + ); + } + assert!(points.cast(&[]).is_err(), "Should fail to cast with no types"); + assert!( + points.cast(&[DataType::Boolean, DataType::Boolean]).is_err(), + "Should fail to cast to the wrong number of types" + ); + } + + #[test] + fn test_cast_points_from_integer() { + let points = Points { + 
start_times: None, + timestamps: timestamps(2), + values: vec![Values { + values: ValueArray::Integer(vec![Some(0), Some(10)]), + metric_type: MetricType::Gauge, + }], + }; + + let as_same = points.cast(&[DataType::Integer]).unwrap(); + let vals = as_same.values[0].values.as_integer().unwrap(); + assert_eq!(vals, points.values[0].values.as_integer().unwrap()); + + let as_bools = points.cast(&[DataType::Boolean]).unwrap(); + let vals = as_bools.values[0].values.as_boolean().unwrap(); + assert_eq!(vals, &vec![Some(false), Some(true)]); + + let as_double = points.cast(&[DataType::Double]).unwrap(); + let vals = as_double.values[0].values.as_double().unwrap(); + assert_eq!(vals, &vec![Some(0.0), Some(10.0)]); + + let as_string = points.cast(&[DataType::String]).unwrap(); + let vals = as_string.values[0].values.as_string().unwrap(); + assert_eq!(vals, &vec![Some("0".to_string()), Some("10".to_string())]); + + for ty in [DataType::IntegerDistribution, DataType::DoubleDistribution] + { + assert!( + points.cast(&[ty]).is_err(), + "Should not be able to cast int array to distributions" + ); + } + assert!(points.cast(&[]).is_err(), "Should fail to cast with no types"); + assert!( + points.cast(&[DataType::Boolean, DataType::Boolean]).is_err(), + "Should fail to cast to the wrong number of types" + ); + } + + #[test] + fn test_cast_points_from_double() { + let points = Points { + start_times: None, + timestamps: timestamps(2), + values: vec![Values { + values: ValueArray::Double(vec![Some(0.0), Some(10.5)]), + metric_type: MetricType::Gauge, + }], + }; + + let as_same = points.cast(&[DataType::Double]).unwrap(); + let vals = as_same.values[0].values.as_double().unwrap(); + assert_eq!(vals, points.values[0].values.as_double().unwrap()); + + let as_bools = points.cast(&[DataType::Boolean]).unwrap(); + let vals = as_bools.values[0].values.as_boolean().unwrap(); + assert_eq!(vals, &vec![Some(false), Some(true)]); + + let as_ints = points.cast(&[DataType::Integer]).unwrap(); + let vals = as_ints.values[0].values.as_integer().unwrap(); + assert_eq!(vals, &vec![Some(0), Some(10)]); + + let as_string = points.cast(&[DataType::String]).unwrap(); + let vals = as_string.values[0].values.as_string().unwrap(); + assert_eq!( + vals, + &vec![Some("0".to_string()), Some("10.5".to_string())] + ); + + let points = Points { + start_times: None, + timestamps: timestamps(2), + values: vec![Values { + values: ValueArray::Double(vec![Some(0.0), Some(f64::MAX)]), + metric_type: MetricType::Gauge, + }], + }; + assert!( + points.cast(&[DataType::Integer]).is_err(), + "Should fail to cast out-of-range doubles to integer" + ); + + for ty in [DataType::IntegerDistribution, DataType::DoubleDistribution] + { + assert!( + points.cast(&[ty]).is_err(), + "Should not be able to cast double array to distributions" + ); + } + assert!(points.cast(&[]).is_err(), "Should fail to cast with no types"); + assert!( + points.cast(&[DataType::Boolean, DataType::Boolean]).is_err(), + "Should fail to cast to the wrong number of types" + ); + } + + #[test] + fn test_cast_points_from_string() { + fn make_points(strings: &[&str]) -> Points { + Points { + start_times: None, + timestamps: timestamps(strings.len()), + values: vec![Values { + values: ValueArray::String( + strings.iter().map(|&s| Some(s.into())).collect(), + ), + metric_type: MetricType::Gauge, + }], + } + } + + let points = make_points(&["some", "strings"]); + let as_same = points.cast(&[DataType::String]).unwrap(); + assert_eq!(as_same, points); + + // Any non-empty string is truthy, 
even "false". + let points = make_points(&["", "false", "true"]); + let as_bools = points.cast(&[DataType::Boolean]).unwrap(); + let vals = as_bools.values[0].values.as_boolean().unwrap(); + assert_eq!(vals, &vec![Some(false), Some(true), Some(true)]); + + // Conversion to integers happens by parsing. + let points = make_points(&["0", "1"]); + let as_ints = points.cast(&[DataType::Integer]).unwrap(); + let vals = as_ints.values[0].values.as_integer().unwrap(); + assert_eq!(vals, &vec![Some(0), Some(1)]); + for bad in ["1.0", "", "foo", "[]"] { + assert!( + make_points(&[bad]).cast(&[DataType::Integer]).is_err(), + "Should fail to cast non-int string '{}' to integers", + bad, + ); + } + + // Conversion to doubles happens by parsing. + let points = make_points(&["0", "1.1"]); + let as_doubles = points.cast(&[DataType::Double]).unwrap(); + let vals = as_doubles.values[0].values.as_double().unwrap(); + assert_eq!(vals, &vec![Some(0.0), Some(1.1)]); + for bad in ["", "foo", "[]"] { + assert!( + make_points(&[bad]).cast(&[DataType::Double]).is_err(), + "Should fail to cast non-double string '{}' to double", + bad, + ); + } + + // Checks for invalid casts + for ty in [DataType::IntegerDistribution, DataType::DoubleDistribution] + { + assert!( + points.cast(&[ty]).is_err(), + "Should not be able to cast double array to distributions" + ); + } + assert!(points.cast(&[]).is_err(), "Should fail to cast with no types"); + assert!( + points.cast(&[DataType::Boolean, DataType::Boolean]).is_err(), + "Should fail to cast to the wrong number of types" + ); + } + + #[test] + fn test_cast_points_from_int_distribution() { + // We can only "cast" to the same type here. + let points = Points { + start_times: None, + timestamps: timestamps(1), + values: vec![Values { + values: ValueArray::IntegerDistribution(vec![Some( + Distribution { bins: vec![0, 1, 2], counts: vec![0; 3] }, + )]), + metric_type: MetricType::Gauge, + }], + }; + let as_same = points.cast(&[DataType::IntegerDistribution]).unwrap(); + assert_eq!(points, as_same); + + for ty in [ + DataType::Boolean, + DataType::String, + DataType::Integer, + DataType::Double, + DataType::DoubleDistribution, + ] { + assert!( + points.cast(&[ty]).is_err(), + "Should not be able to cast distributions to anything other than itself" + ); + } + assert!(points.cast(&[]).is_err()); + assert!(points + .cast(&[ + DataType::IntegerDistribution, + DataType::IntegerDistribution + ]) + .is_err()); + } + + #[test] + fn test_cast_points_from_double_distribution() { + // We can only "cast" to the same type here. 
+ let points = Points { + start_times: None, + timestamps: timestamps(1), + values: vec![Values { + values: ValueArray::DoubleDistribution(vec![Some( + Distribution { + bins: vec![0.0, 1.0, 2.0], + counts: vec![0; 3], + }, + )]), + metric_type: MetricType::Gauge, + }], + }; + let as_same = points.cast(&[DataType::DoubleDistribution]).unwrap(); + assert_eq!(points, as_same); + + for ty in [ + DataType::Boolean, + DataType::String, + DataType::Integer, + DataType::Double, + DataType::IntegerDistribution, + ] { + assert!( + points.cast(&[ty]).is_err(), + "Should not be able to cast distributions to anything other than itself" + ); + } + assert!(points.cast(&[]).is_err()); + assert!(points + .cast(&[DataType::DoubleDistribution, DataType::DoubleDistribution]) + .is_err()); + } +} diff --git a/oximeter/db/src/oxql/query/mod.rs b/oximeter/db/src/oxql/query/mod.rs new file mode 100644 index 0000000000..bb1c0986fe --- /dev/null +++ b/oximeter/db/src/oxql/query/mod.rs @@ -0,0 +1,837 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! A single OxQL query. + +// Copyright 2024 Oxide Computer Company + +use super::ast::ident::Ident; +use super::ast::logical_op::LogicalOp; +use super::ast::table_ops::filter::CompoundFilter; +use super::ast::table_ops::filter::FilterExpr; +use super::ast::table_ops::group_by::GroupBy; +use super::ast::table_ops::BasicTableOp; +use super::ast::table_ops::TableOp; +use super::ast::SplitQuery; +use crate::oxql::ast::grammar; +use crate::oxql::ast::table_ops::filter::Filter; +use crate::oxql::ast::Query as QueryNode; +use crate::oxql::fmt_parse_error; +use crate::oxql::Error; +use crate::TimeseriesName; +use chrono::DateTime; +use chrono::Utc; +use std::time::Duration; + +/// Special identifiers for column names or other widely-used values. +pub mod special_idents { + use oximeter::DatumType; + + pub const TIMESTAMP: &str = "timestamp"; + pub const START_TIME: &str = "start_time"; + pub const DATUM: &str = "datum"; + pub const BINS: &str = "bins"; + pub const COUNTS: &str = "counts"; + pub const DATETIME64: &str = "DateTime64"; + pub const ARRAYU64: &str = "Array[u64]"; + + pub fn array_type_name_from_histogram_type( + type_: DatumType, + ) -> Option<String> { + if !type_.is_histogram() { + return None; + } + Some(format!( + "Array[{}]", + type_.to_string().strip_prefix("Histogram").unwrap().to_lowercase(), + )) + } +} + +/// A parsed OxQL query. +#[derive(Clone, Debug, PartialEq)] +pub struct Query { + pub(super) parsed: QueryNode, + pub(super) end_time: DateTime<Utc>, +} + +impl Query { + /// Construct a query written in OxQL. + pub fn new(query: impl AsRef<str>) -> Result<Self, Error> { + let raw = query.as_ref().trim(); + const MAX_LEN: usize = 4096; + anyhow::ensure!( + raw.len() <= MAX_LEN, + "Queries must be <= {} characters", + MAX_LEN, + ); + let parsed = grammar::query_parser::query(raw) + .map_err(|e| fmt_parse_error(raw, e))?; + + // Fetch the latest query end time referred to in the parsed query, or + // use now if there isn't one. + let query_end_time = parsed.query_end_time().unwrap_or_else(Utc::now); + Ok(Self { parsed, end_time: query_end_time }) + } + + /// Return the end time of the query. + pub fn end_time(&self) -> &DateTime<Utc> { + &self.end_time + } + + /// Return the next referenced timeseries name.
+ /// + /// Queries always start with either a single `get` operation, which refers + /// to one timeseries; or a subquery, each component of which is a query. So + /// it is always true that there is exactly one next timeseries name, since + /// that comes from the current query, or the next subquery. + pub fn timeseries_name(&self) -> &TimeseriesName { + self.parsed.timeseries_name() + } + + /// Return the transformation table ops, i.e., everything after the initial + /// get operation or subquery. + pub fn transformations(&self) -> &[TableOp] { + self.parsed.transformations() + } + + /// Return the set of all predicates in the query, coalesced. + /// + /// Query optimization is a large topic. There are few rules, and many + /// heuristics. However, one of those is extremely useful for our case: + /// predicate pushdown. This is where one moves predicates as close as + /// possible to the data, filtering out unused data as early as possible in + /// query processing. + /// + /// In our case, _currently_, we can implement this pretty easily. Filtering + /// operations can usually be coalesced into a single item. That means: + /// + /// - successive filtering operations are merged: `filter a | filter b -> + /// `filter (a) && (b)`. + /// - filtering operations are "pushed down", to just after the initial + /// `get` operation in the query. + /// + /// # Group by + /// + /// While filters can be combined and pushed down through many operations, + /// special care is taken for `group_by`. Specifically, the filter must only + /// name columns explicitly named in the `group_by`. If we pushed through + /// filters which named one of the columns _within_ the group (one not + /// named), then that would change the set of data in a group, and thus the + /// result. + /// + /// # Datum filters + /// + /// We currently only push down filters on the timestamps, and that is only + /// because we do _not_ support aggregations across time, only values. If + /// and when we do support that, then filters which reference time also + /// cannot be pushed down. + /// + /// # No predicates + /// + /// Note that this may return `None`, in the case where there are zero + /// predicates of any kind. + // + // Pushing filters through a group by. Consider the following data: + // + // a b timestamp datum + // 0 0 0 0 + // 0 0 1 1 + // 0 1 0 2 + // 0 1 1 3 + // 1 0 0 4 + // 1 0 1 5 + // 1 1 0 6 + // 1 1 1 7 + // + // So there are two groups for a and b columns each with two samples. + // + // Consider `get a:b | group_by [a] | filter a == 0`. + // + // After the group by, the result is: + // + // a timestamp datum + // 0 0 avg([0, 2]) -> 1 + // 0 1 avg([1, 3]) -> 2 + // 1 0 avg([4, 6]) -> 5 + // 1 1 avg([5, 7]) -> 6 + // + // Then after the filter, it becomes: + // + // a timestamp datum + // 0 0 avg([0, 2]) -> 1 + // 0 1 avg([1, 3]) -> 2 + // + // Now, let's do the filter first, as if we pushed that down. + // i.e., `get a:b | filter a == 0 | group_by [a]`. After the filter, we get: + // + // a b timestamp datum + // 0 0 0 0 + // 0 0 1 1 + // 0 1 0 2 + // 0 1 1 3 + // + // Then we apply the group by: + // + // a timestamp datum + // 0 0 avg([0, 2]) -> 1 + // 0 1 avg([1, 3]) -> 2 + // + // So we get the same result. Let's suppose we had a filter on the column + // `b` instead. Doing the group_by first, we get the exact same result as + // the first one above. Or we really get an error, because the resulting + // table does not have a `b` column. 
+ // + // If instead we did the filter first, we'd get a different result. Starting + // from: + // + // a b timestamp datum + // 0 0 0 0 + // 0 0 1 1 + // 0 1 0 2 + // 0 1 1 3 + // 1 0 0 4 + // 1 0 1 5 + // 1 1 0 6 + // 1 1 1 7 + // + // Apply `filter b == 0`: + // + // + // a b timestamp datum + // 0 0 0 0 + // 0 0 1 1 + // 1 0 0 4 + // 1 0 1 5 + // + // Then apply group_by [a] + // + // a timestamp datum + // 0 0 avg([0, 1]) -> 0.5 + // 0 1 avg([4, 5]) -> 4.5 + // + // So we get something very different. + // + // What about filtering by timestamp? Starting from the raw data again: + // + // a b timestamp datum + // 0 0 0 0 + // 0 0 1 1 + // 0 1 0 2 + // 0 1 1 3 + // 1 0 0 4 + // 1 0 1 5 + // 1 1 0 6 + // 1 1 1 7 + // + // Let's add a `filter timestamp >= 1`. After the `group_by [a]`, we get: + // + // a timestamp datum + // 0 0 avg([0, 2]) -> 1 + // 0 1 avg([1, 3]) -> 2 + // 1 0 avg([4, 6]) -> 5 + // 1 1 avg([5, 7]) -> 6 + // + // Then after `filter timestamp >= 1`: + // + // a timestamp datum + // 0 1 avg([1, 3]) -> 2 + // 1 1 avg([5, 7]) -> 6 + // + // Now, filtering the timestamps first, after that we get: + // + // a b timestamp datum + // 0 0 1 1 + // 0 1 1 3 + // 1 0 1 5 + // 1 1 1 7 + // + // Then grouping: + // + // a timestamp datum + // 0 1 avg([1, 3]) -> 2 + // 1 1 avg([5, 7]) -> 6 + // + // So that also works fine. + pub(crate) fn coalesced_predicates( + &self, + mut outer: Option<Filter>, + ) -> Option<Filter> { + let maybe_filter = self.transformations().iter().rev().fold( + None, + |maybe_filter, next_tr| { + // Transformations only return basic ops, since all the + // subqueries must be at the prefix of the query. + let TableOp::Basic(op) = next_tr else { + unreachable!(); + }; + + match op { + BasicTableOp::GroupBy(GroupBy { identifiers, .. }) => { + // We may have been passed predicates from an outer + // query. Those also need to be restricted, if we're + // trying to push them through a group_by operation. + outer = outer.as_ref().and_then(|outer| { + restrict_filter_idents(outer, identifiers) + }); + + // Only push through columns referred to in the group by + // itself, which replaces the current filter. + maybe_filter.as_ref().and_then(|current| { + restrict_filter_idents(current, identifiers) + }) + } + BasicTableOp::Filter(filter) => { + // Merge with any existing filter. + if let Some(left) = maybe_filter { + Some(left.merge(&filter, LogicalOp::And)) + } else { + Some(filter.clone()) + } + } + _ => maybe_filter, + } + }, + ); + + // Merge in any predicates passed from an outer query, which may have + // been restricted as we moved through group_by operations.
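+ // Three cases below: there is no outer predicate, so we return whatever + // we coalesced from this query (possibly nothing); there is only an outer + // predicate, which is passed through unchanged; or there are both, which + // are merged with a logical AND.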
+ match (outer, maybe_filter) { + (None, any) => any, + (Some(outer), None) => Some(outer), + (Some(outer), Some(inner)) => { + Some(outer.merge(&inner, LogicalOp::And)) + } + } + } + + pub(crate) fn split(&self) -> SplitQuery { + self.parsed.split(self.end_time) + } +} + +// Return a new filter containing only parts that refer to either: +// +// - a `timestamp` column +// - a column listed in `identifiers` +fn restrict_filter_idents( + current_filter: &Filter, + identifiers: &[Ident], +) -> Option<Filter> { + match &current_filter.expr { + FilterExpr::Simple(inner) => { + let ident = inner.ident.as_str(); + if ident == "timestamp" + || identifiers.iter().map(Ident::as_str).any(|id| id == ident) + { + Some(current_filter.clone()) + } else { + None + } + } + FilterExpr::Compound(CompoundFilter { left, op, right }) => { + let maybe_left = restrict_filter_idents(left, identifiers); + let maybe_right = restrict_filter_idents(right, identifiers); + match (maybe_left, maybe_right) { + (Some(left), Some(right)) => Some(Filter { + negated: current_filter.negated, + expr: FilterExpr::Compound(CompoundFilter { + left: Box::new(left), + op: *op, + right: Box::new(right), + }), + }), + (Some(single), None) | (None, Some(single)) => Some(single), + (None, None) => None, + } + } + } +} + +/// Describes the time alignment for an OxQL query. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct Alignment { + /// The end time of the query, which is the temporal reference point. + pub end_time: DateTime<Utc>, + /// The alignment period, the interval on which values are produced. + pub period: Duration, +} + +#[cfg(test)] +mod tests { + use super::Filter; + use super::Ident; + use super::Query; + use crate::oxql::ast::cmp::Comparison; + use crate::oxql::ast::literal::Literal; + use crate::oxql::ast::logical_op::LogicalOp; + use crate::oxql::ast::table_ops::filter::CompoundFilter; + use crate::oxql::ast::table_ops::filter::FilterExpr; + use crate::oxql::ast::table_ops::filter::SimpleFilter; + use crate::oxql::ast::table_ops::join::Join; + use crate::oxql::ast::table_ops::BasicTableOp; + use crate::oxql::ast::table_ops::TableOp; + use crate::oxql::ast::SplitQuery; + use crate::oxql::query::restrict_filter_idents; + use chrono::NaiveDateTime; + use chrono::Utc; + use std::time::Duration; + + #[test] + fn test_restrict_filter_idents_single_atom() { + let ident = Ident("foo".into()); + let filter = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: ident.clone(), + cmp: Comparison::Eq, + value: Literal::Boolean(false), + }), + }; + assert_eq!( + restrict_filter_idents(&filter, &[ident.clone()]).unwrap(), + filter + ); + assert_eq!(restrict_filter_idents(&filter, &[]), None); + } + + #[test] + fn test_restrict_filter_idents_single_atom_with_timestamp() { + let filter = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("timestamp".into()), + cmp: Comparison::Eq, + value: Literal::Boolean(false), + }), + }; + assert_eq!(restrict_filter_idents(&filter, &[]).unwrap(), filter); + } + + #[test] + fn test_restrict_filter_idents_expr() { + let idents = [Ident("foo".into()), Ident("bar".into())]; + let left = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: idents[0].clone(), + cmp: Comparison::Eq, + value: Literal::Boolean(false), + }), + }; + let right = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: idents[1].clone(), + cmp: Comparison::Eq, + value: Literal::Boolean(false), + }), + }; + let filter = Filter { + negated:
false, + expr: FilterExpr::Compound(CompoundFilter { + left: Box::new(left.clone()), + op: LogicalOp::And, + right: Box::new(right.clone()), + }), + }; + assert_eq!(restrict_filter_idents(&filter, &idents).unwrap(), filter); + + // This should remove the right filter. + assert_eq!( + restrict_filter_idents(&filter, &idents[..1]).unwrap(), + left + ); + + // And both + assert_eq!(restrict_filter_idents(&filter, &[]), None); + } + + #[test] + fn test_split_query() { + let q = Query::new("get a:b").unwrap(); + let split = q.split(); + assert_eq!(split, SplitQuery::Flat(q)); + + let q = Query::new("get a:b | filter x == 0").unwrap(); + let split = q.split(); + assert_eq!(split, SplitQuery::Flat(q)); + + let q = Query::new("{ get a:b } | join").unwrap(); + let split = q.split(); + let mut inner = Query::new("get a:b").unwrap(); + inner.end_time = q.end_time; + assert_eq!( + split, + SplitQuery::Nested { + subqueries: vec![inner], + transformations: vec![TableOp::Basic(BasicTableOp::Join(Join))], + } + ); + + let q = Query::new("{ get a:b | filter x == 0 } | join").unwrap(); + let split = q.split(); + let mut inner = Query::new("get a:b | filter x == 0").unwrap(); + inner.end_time = q.end_time; + assert_eq!( + split, + SplitQuery::Nested { + subqueries: vec![inner], + transformations: vec![TableOp::Basic(BasicTableOp::Join(Join))], + } + ); + + let q = Query::new("{ get a:b ; get a:b } | join").unwrap(); + let split = q.split(); + let mut inner = Query::new("get a:b").unwrap(); + inner.end_time = q.end_time; + assert_eq!( + split, + SplitQuery::Nested { + subqueries: vec![inner; 2], + transformations: vec![TableOp::Basic(BasicTableOp::Join(Join))], + } + ); + + let q = Query::new("{ { get a:b ; get a:b } | join } | join").unwrap(); + let split = q.split(); + let mut subqueries = + vec![Query::new("{ get a:b; get a:b } | join").unwrap()]; + subqueries[0].end_time = q.end_time; + let expected = SplitQuery::Nested { + subqueries: subqueries.clone(), + transformations: vec![TableOp::Basic(BasicTableOp::Join(Join))], + }; + assert_eq!(split, expected); + let split = subqueries[0].split(); + let mut inner = Query::new("get a:b").unwrap(); + inner.end_time = q.end_time; + assert_eq!( + split, + SplitQuery::Nested { + subqueries: vec![inner; 2], + transformations: vec![TableOp::Basic(BasicTableOp::Join(Join))], + } + ); + } + + #[test] + fn test_coalesce_predicates() { + // Passed through group-by unchanged. + let q = Query::new("get a:b | group_by [a] | filter a == 0").unwrap(); + let preds = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("a".to_string()), + cmp: Comparison::Eq, + value: Literal::Integer(0), + }), + }; + assert_eq!(q.coalesced_predicates(None), Some(preds)); + + // Merge the first two, then pass through group by. + let q = Query::new( + "get a:b | group_by [a] | filter a == 0 | filter a == 0", + ) + .unwrap(); + let atom = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("a".to_string()), + cmp: Comparison::Eq, + value: Literal::Integer(0), + }), + }; + let preds = Filter { + negated: false, + expr: FilterExpr::Compound(CompoundFilter { + left: Box::new(atom.clone()), + op: LogicalOp::And, + right: Box::new(atom.clone()), + }), + }; + assert_eq!(q.coalesced_predicates(None), Some(preds)); + + // These are also merged, even though they're on different sides of the + // group by. 
+ let q = Query::new( + "get a:b | filter a == 0 | group_by [a] | filter a == 0", + ) + .unwrap(); + let atom = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("a".to_string()), + cmp: Comparison::Eq, + value: Literal::Integer(0), + }), + }; + let preds = Filter { + negated: false, + expr: FilterExpr::Compound(CompoundFilter { + left: Box::new(atom.clone()), + op: LogicalOp::And, + right: Box::new(atom.clone()), + }), + }; + assert_eq!(q.coalesced_predicates(None), Some(preds)); + + // Second filter is _not_ passed through, because it refers to columns + // not in the group by. We have only the first filter. + let q = Query::new( + "get a:b | filter a == 0 | group_by [a] | filter b == 0", + ) + .unwrap(); + let preds = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("a".to_string()), + cmp: Comparison::Eq, + value: Literal::Integer(0), + }), + }; + assert_eq!(q.coalesced_predicates(None), Some(preds)); + } + + #[test] + fn test_coalesce_predicates_into_subqueries() { + let q = "{ get a:b; get a:b } | join | filter foo == 'bar'"; + let query = Query::new(q).unwrap(); + let preds = query.coalesced_predicates(None).unwrap(); + let expected_predicate = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("foo".to_string()), + cmp: Comparison::Eq, + value: Literal::String("bar".into()), + }), + }; + assert_eq!(preds, expected_predicate); + + // Split the query, which should give us a list of two subqueries, + // followed by the join and filter. + let SplitQuery::Nested { subqueries, .. } = query.split() else { + panic!(); + }; + for subq in subqueries.iter() { + let inner = subq + .coalesced_predicates(Some(expected_predicate.clone())) + .unwrap(); + assert_eq!( + inner, expected_predicate, + "Predicates passed into an inner subquery should be preserved" + ); + } + } + + #[test] + fn test_coalesce_predicates_into_subqueries_with_group_by() { + let q = "{ get a:b | group_by [baz]; get a:b | group_by [foo] } | \ + join | filter foo == 'bar'"; + let query = Query::new(q).unwrap(); + let preds = query.coalesced_predicates(None).unwrap(); + let expected_predicate = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("foo".to_string()), + cmp: Comparison::Eq, + value: Literal::String("bar".into()), + }), + }; + assert_eq!(preds, expected_predicate); + + // Split the query, which should give us a list of two subqueries, + // followed by the join and filter. + let SplitQuery::Nested { subqueries, .. } = query.split() else { + panic!(); + }; + + // The first subquery groups by a field "baz", which isn't in the outer + // filter. It should have that outer predicate removed, and have no + // predicates at all. + let subq = &subqueries[0]; + assert!( + subq.coalesced_predicates(Some(expected_predicate.clone())) + .is_none(), + "Should not push an outer predicate into a subquery, when that \ + subquery includes a group_by that does not name a field in the \ + outer predicate" + ); + + // The second subquery should include the expected predicate, since the + // group_by includes the field named in the filter itself. 
+ let subq = &subqueries[1]; + let inner = subq + .coalesced_predicates(Some(expected_predicate.clone())) + .unwrap(); + assert_eq!( + inner, expected_predicate, + "Predicates passed into an inner subquery should be preserved, \ + when that inner subquery includes a group_by that names the \ + ident in the outer filter" + ); + } + + #[test] + fn test_coalesce_predicates_merged_into_subqueries() { + let q = "{ get a:b | filter baz == 0; get a:b | filter baz == 0 } \ + | join | filter foo == 'bar'"; + let query = Query::new(q).unwrap(); + let preds = query.coalesced_predicates(None).unwrap(); + let expected_predicate = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("foo".to_string()), + cmp: Comparison::Eq, + value: Literal::String("bar".into()), + }), + }; + assert_eq!(preds, expected_predicate); + let expected_inner_predicate = Filter { + negated: false, + expr: FilterExpr::Simple(SimpleFilter { + ident: Ident("baz".to_string()), + cmp: Comparison::Eq, + value: Literal::Integer(0), + }), + }; + + // Split the query, which should give us a list of two subqueries, + // followed by the join and filter. + let SplitQuery::Nested { subqueries, .. } = query.split() else { + panic!(); + }; + for subq in subqueries.iter() { + let inner = subq + .coalesced_predicates(Some(expected_predicate.clone())) + .unwrap(); + assert_eq!( + inner, + expected_predicate.merge(&expected_inner_predicate, LogicalOp::And), + "Predicates passed into an inner subquery should be preserved, \ + and merged with any subquery predicates", + ); + } + } + + #[test] + fn test_query_end_time() { + const MAX_DIFF: i64 = 1_000; + let q = Query::new("get a:b").unwrap(); + assert!( + (q.end_time - Utc::now()).num_nanoseconds().unwrap() < MAX_DIFF, + "Query which does not explicitly name an end time should \ + use now as the end time", + ); + + let q = Query::new("get a:b | filter timestamp > @now() - 1s").unwrap(); + assert!( + (q.end_time - Utc::now()).num_nanoseconds().unwrap() < MAX_DIFF, + "Query which does not explicitly name an end time should \ + use now as the end time", + ); + + let then = Utc::now() - Duration::from_secs(60); + let as_str = then.format("%Y-%m-%dT%H:%M:%S.%f"); + let q = Query::new(&format!("get a:b | filter timestamp < @{as_str}")) + .unwrap(); + assert_eq!( + q.end_time, then, + "Query with a less-than filter and a timestamp should \ + set the query end time" + ); + + let q = Query::new(&format!("get a:b | filter timestamp <= @{as_str}")) + .unwrap(); + assert_eq!( + q.end_time, then, + "Query with a less-than-or-equal filter and a timestamp should \ + set the query end time" + ); + + let q = Query::new(&format!("get a:b | filter timestamp > @{as_str}")) + .unwrap(); + assert!( + (q.end_time - Utc::now()).num_nanoseconds().unwrap() < MAX_DIFF, + "Query with a greater-than timestamp filter should not set an \ + explicit query end time, and so use now" + ); + + let q = Query::new("get a:b | filter timestamp > @now() - 1d").unwrap(); + assert!( + (q.end_time - Utc::now()).num_nanoseconds().unwrap() < MAX_DIFF, + "Query which does not explicitly name an end time should \ + use now as the end time", + ); + + let q = Query::new(&format!( + "get a:b | filter timestamp > @now() - 1d && timestamp < @{as_str}" + )) + .unwrap(); + assert_eq!( + q.end_time, + then, + "Query with a compound less-than-or-equal filter and a timestamp should \ + set the query end time" + ); + + let then = Utc::now() - Duration::from_secs(60); + let then_as_str = then.format("%Y-%m-%dT%H:%M:%S.%f"); + 
let even_earlier = then - Duration::from_secs(10); + let even_earlier_as_str = even_earlier.format("%Y-%m-%dT%H:%M:%S.%f"); + let q = Query::new(&format!( + "get a:b | filter timestamp < @{then_as_str} || timestamp < @{even_earlier_as_str}" + )) + .unwrap(); + assert_eq!( + q.end_time, + then, + "Query with two less-than timestamp filters should use the later timestamp" + ); + + let expected = NaiveDateTime::parse_from_str( + "2024-03-13T06:24:00", + "%Y-%m-%dT%H:%M:%S%.f", + ) + .unwrap() + .and_utc(); + let q = "{ \ + get physical_data_link:bytes_sent ; \ + get physical_data_link:bytes_received \ + } | filter timestamp > @2024-03-13T06:20:00 && timestamp < @2024-03-13T06:24:00"; + let query = Query::new(q).unwrap(); + assert_eq!(query.end_time, expected); + } + + #[test] + fn test_query_end_time_across_subqueries() { + let now = Utc::now(); + const FMT: &str = "%Y-%m-%dT%H:%M:%S.%f"; + let first = now - Duration::from_secs(1); + let second = now - Duration::from_secs_f64(1e-3); + let q = format!( + "{{ \ + get a:b | filter timestamp > @{}; \ + get a:b | filter timestamp > @{} \ + }}", + first.format(FMT), + second.format(FMT), + ); + let query = Query::new(q).unwrap(); + assert!( + query.end_time > second, + "This nested query should have used Utc::now() as the end time" + ); + let end_time = query.end_time; + let SplitQuery::Nested { subqueries, .. } = query.split() else { + unreachable!(); + }; + for subq in subqueries.iter() { + assert_eq!( + subq.end_time, end_time, + "All subqueries should have the same end time." + ); + } + } +} diff --git a/oximeter/db/src/oxql/table.rs b/oximeter/db/src/oxql/table.rs new file mode 100644 index 0000000000..025935090b --- /dev/null +++ b/oximeter/db/src/oxql/table.rs @@ -0,0 +1,293 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Definitions of timeseries and groups of them, a [`Table`]. + +// Copyright 2024 Oxide Computer Company + +use super::point::DataType; +use super::point::MetricType; +use super::point::Points; +use super::query::Alignment; +use super::Error; +use crate::TimeseriesKey; +use highway::HighwayHasher; +use oximeter::FieldValue; +use schemars::JsonSchema; +use serde::Deserialize; +use serde::Serialize; +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; +use std::hash::Hash; +use std::hash::Hasher; + +/// A timeseries contains a timestamped set of values from one source. +/// +/// This includes the typed key-value pairs that uniquely identify it, and the +/// set of timestamps and data values from it. +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct Timeseries { + pub fields: BTreeMap<String, FieldValue>, + pub points: Points, + #[serde(skip)] + pub(crate) alignment: Option<Alignment>, +} + +impl Timeseries { + /// Construct a new timeseries, from its fields. + /// + /// It holds no points or type information. That will be enforced by the + /// points type as they are added. + pub fn new( + fields: impl Iterator<Item = (String, FieldValue)>, + data_type: DataType, + metric_type: MetricType, + ) -> Result<Self, Error> { + let fields: BTreeMap<_, _> = fields.collect(); + anyhow::ensure!(!fields.is_empty(), "Fields cannot be empty"); + Ok(Self { + fields, + points: Points::empty(data_type, metric_type), + alignment: None, + }) + } + + pub fn key(&self) -> TimeseriesKey { + // NOTE: The key here is _not_ stable, like the one used in the database + // itself to identify timeseries.
That's OK, however, because we do not + // serialize this value anywhere -- it's used entirely for the lifetime + // of one query, and then thrown away, and only needs to be consistent + // for that long. + let mut hasher = HighwayHasher::default(); + for (name, value) in self.fields.iter() { + name.hash(&mut hasher); + value.hash(&mut hasher); + } + hasher.finish() + } + + /// Return a copy of the timeseries, keeping only the provided fields. + /// + /// An error is returned if the timeseries does not contain those fields. + pub(crate) fn copy_with_fields( + &self, + kept_fields: &[&str], + ) -> Result<Self, Error> { + let mut fields = BTreeMap::new(); + for field in kept_fields { + let Some(f) = self.fields.get(*field) else { + anyhow::bail!("Timeseries does not contain field '{}'", field); + }; + fields.insert(field.to_string(), f.clone()); + } + Ok(Self { + fields, + points: self.points.clone(), + alignment: self.alignment, + }) + } + + // Return `true` if the schema in `other` matches that of `self`. + fn matches_schema(&self, other: &Timeseries) -> bool { + if self.fields.len() != other.fields.len() { + return false; + } + for (f0, f1) in self.fields.iter().zip(other.fields.iter()) { + // Check the field names. + if f0.0 != f1.0 { + return false; + } + // And types. + if f0.1.field_type() != f1.1.field_type() { + return false; + } + } + + // And the type info is the same as well. + if !self + .points + .data_types() + .zip(other.points.data_types()) + .all(|(x, y)| x == y) + { + return false; + } + self.points + .metric_types() + .zip(other.points.metric_types()) + .all(|(x, y)| x == y) + } + + /// Return a new timeseries, with the points cast to the provided list of + /// data types. + /// + /// This returns an error if the points cannot be so cast, or the + /// dimensionality of the types requested differs from the dimensionality of + /// the points themselves. + pub(crate) fn cast(&self, types: &[DataType]) -> Result<Self, Error> { + let fields = self.fields.clone(); + Ok(Self { + fields, + points: self.points.cast(types)?, + alignment: self.alignment, + }) + } +} + +/// A table represents one or more timeseries with the same schema. +/// +/// A table is the result of an OxQL query. It contains a name, usually the name +/// of the timeseries schema from which the data is derived, and any number of +/// timeseries, which contain the actual data. +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct Table { + // The name of the table. + // + // This starts as the name of the timeseries schema the data is derived + // from, but can be modified as operations are done. + pub(super) name: String, + // The set of timeseries in the table, ordered by key. + timeseries: BTreeMap<TimeseriesKey, Timeseries>, +} + +impl Table { + /// Create a new table, with no timeseries. + pub fn new(name: impl AsRef<str>) -> Self { + Self { name: name.as_ref().to_string(), timeseries: BTreeMap::new() } + } + + /// Create a table from a set of timeseries. + pub fn from_timeseries( + name: impl AsRef<str>, + t: impl Iterator<Item = Timeseries>, + ) -> Result<Self, Error> { + let mut out = Self::new(name); + for each in t { + out.insert(each)?; + } + Ok(out) + } + + /// Return the name of the table. + pub fn name(&self) -> &str { + self.name.as_str() + } + + /// Return the number of timeseries in this table. + pub fn n_timeseries(&self) -> usize { + self.timeseries.len() + } + + /// Return the list of timeseries in this table, ordered by key. + pub fn timeseries(&self) -> impl ExactSizeIterator<Item = &Timeseries> { + self.timeseries.values() + } + + // Check that the schema of `other` matches `self`.
+ // + // That means the fields have the same names and types, and the timeseries + // have the same type info. + fn matches_schema(&self, other: &Timeseries) -> bool { + if let Some((_, first)) = self.timeseries.first_key_value() { + first.matches_schema(other) + } else { + // Table is empty. + true + } + } + + /// Get a timeseries matching the provided key, if any. + pub fn get_mut(&mut self, key: TimeseriesKey) -> Option<&mut Timeseries> { + self.timeseries.get_mut(&key) + } + + /// Insert a new timeseries into the table. + /// + /// If the timeseries already exists, an error is returned. Use + /// [`Table::replace()`] to replace an existing timeseries. + /// + /// It is an error if the timeseries does not have the same schema as the + /// others in the table (if any). + pub fn insert(&mut self, timeseries: Timeseries) -> Result<(), Error> { + anyhow::ensure!( + self.matches_schema(&timeseries), + "Timeseries in a table must have the same schema", + ); + let key = timeseries.key(); + let Entry::Vacant(e) = self.timeseries.entry(key) else { + return Err(anyhow::anyhow!( + "Timeseries with key {} already exists", + key, + )); + }; + e.insert(timeseries); + Ok(()) + } + + /// Replace a timeseries in the table. + pub fn replace(&mut self, timeseries: Timeseries) { + let key = timeseries.key(); + let _ = self.timeseries.insert(key, timeseries); + } + + /// Add multiple timeseries to the table. + /// + /// An error is returned if any timeseries already exist. + pub fn extend( + &mut self, + timeseries: impl Iterator<Item = Timeseries>, + ) -> Result<(), Error> { + for t in timeseries { + self.insert(t)?; + } + Ok(()) + } + + /// Return the number of timeseries in the table. + pub fn len(&self) -> usize { + self.timeseries.len() + } + + /// Return a mutable iterator over timeseries in the table. + pub fn iter_mut(&mut self) -> impl Iterator<Item = &mut Timeseries> { + self.timeseries.values_mut() + } + + /// Return an iterator over timeseries in the table. + pub fn iter(&self) -> impl Iterator<Item = &Timeseries> { + self.timeseries.values() + } + + /// Consume the table and return an iterator over its timeseries. + pub fn into_iter(self) -> impl Iterator<Item = Timeseries> { + self.timeseries.into_values() + } + + /// Return `true` if all the timeseries in this table are aligned, with the + /// same alignment information. + /// + /// If there are no timeseries, `false` is returned. + pub fn is_aligned(&self) -> bool { + let mut timeseries = self.timeseries.values(); + let Some(t) = timeseries.next() else { + return false; + }; + let Some(alignment) = t.alignment else { + return false; + }; + timeseries.all(|t| t.alignment == Some(alignment)) + } + + /// Return the alignment of this table, if all timeseries are aligned with + /// the same alignment. + pub fn alignment(&self) -> Option<Alignment> { + if self.is_aligned() { + Some( + self.timeseries.first_key_value().unwrap().1.alignment.unwrap(), + ) + } else { + None + } + } +} diff --git a/oximeter/db/src/query.rs b/oximeter/db/src/query.rs index 9212769573..e14dfbbc55 100644 --- a/oximeter/db/src/query.rs +++ b/oximeter/db/src/query.rs @@ -576,33 +576,32 @@ impl SelectQuery { match self.field_selectors.len() { 0 => None, n => { - // Select timeseries key for first column, plus field name and field value for - // all columns. - const SELECTED_COLUMNS: &[&str] = - &["field_name", "field_value"]; + // Select timeseries key for first column, plus the field value + // for all columns, aliased to the field name.
const JOIN_COLUMNS: &[&str] = &["timeseries_name", "timeseries_key"]; - let mut top_level_columns = - Vec::with_capacity(1 + SELECTED_COLUMNS.len() * n); + let mut top_level_columns = Vec::with_capacity(2 + n); top_level_columns.push(String::from( "filter0.timeseries_key as timeseries_key", )); let mut from_statements = String::new(); - for (i, subquery) in self + for (i, (field_name, subquery)) in self .field_selectors - .values() - .map(|sel| { - sel.as_query(&self.timeseries_schema.timeseries_name) + .iter() + .map(|(field_schema, selector)| { + ( + &field_schema.name, + selector.as_query( + &self.timeseries_schema.timeseries_name, + ), + ) }) .enumerate() { - for column in SELECTED_COLUMNS { - top_level_columns.push(format!( - "filter{i}.{column}", - i = i, - column = column - )); - } + top_level_columns.push(format!( + "filter{}.field_value AS {}", + i, field_name, )); if i == 0 { from_statements.push_str(&format!( @@ -1028,8 +1027,8 @@ mod tests { concat!( "SELECT ", "filter0.timeseries_key as timeseries_key, ", - "filter0.field_name, filter0.field_value, ", - "filter1.field_name, filter1.field_value ", + "filter0.field_value AS f0, ", + "filter1.field_value AS f1 ", "FROM (", "SELECT * FROM oximeter.fields_i64 ", "WHERE timeseries_name = 'foo:bar' ", @@ -1095,8 +1094,8 @@ mod tests { concat!( "SELECT ", "filter0.timeseries_key as timeseries_key, ", - "filter0.field_name, filter0.field_value, ", - "filter1.field_name, filter1.field_value ", + "filter0.field_value AS f0, ", + "filter1.field_value AS f1 ", "FROM (", "SELECT * FROM oximeter.fields_i64 ", "WHERE timeseries_name = 'foo:bar' AND field_name = 'f0' AND field_value = 0", @@ -1152,8 +1151,8 @@ mod tests { query.field_query().unwrap(), concat!( "SELECT filter0.timeseries_key as timeseries_key, ", - "filter0.field_name, filter0.field_value, ", - "filter1.field_name, filter1.field_value ", + "filter0.field_value AS f0, ", + "filter1.field_value AS f1 ", "FROM (", "SELECT * FROM oximeter.fields_i64 ", "WHERE timeseries_name = 'foo:bar' AND field_name = 'f0' AND field_value = 0", diff --git a/oximeter/db/src/sql/mod.rs b/oximeter/db/src/sql/mod.rs index 5d9685d19f..8a5bd20bde 100644 --- a/oximeter/db/src/sql/mod.rs +++ b/oximeter/db/src/sql/mod.rs @@ -32,6 +32,7 @@ use crate::query::measurement_table_name; use crate::DatumType; use crate::Error as OxdbError; use crate::FieldType; +use crate::QuerySummary; use crate::TimeseriesName; use crate::TimeseriesSchema; use indexmap::IndexSet; @@ -131,6 +132,31 @@ macro_rules! unsupported { }; } +/// A tabular result from a SQL query against a timeseries. +#[derive(Clone, Debug, Default, serde::Serialize)] +pub struct Table { + /// The name of each column in the result set. + pub column_names: Vec<String>, + /// The rows of the result set, each with one entry per column. + pub rows: Vec<Vec<serde_json::Value>>, +} + +/// The full result of running a SQL query against a timeseries. +#[derive(Clone, Debug)] +pub struct QueryResult { + /// The query as written by the client. + pub original_query: String, + /// The rewritten query, run against the JOINed representation of the + /// timeseries. + /// + /// This is the query that is actually run in the database itself. + pub rewritten_query: String, + /// Summary of the resource usage of the query. + pub summary: QuerySummary, + /// The result of the query, with column names and rows. + pub table: Table, +} + +/// A helper type to preprocess any ClickHouse-specific SQL, and present a /// known-safe version of it to the main `sqlparser` code.
/// diff --git a/oximeter/oximeter/src/types.rs b/oximeter/oximeter/src/types.rs index eff5c399e3..04289a7297 100644 --- a/oximeter/oximeter/src/types.rs +++ b/oximeter/oximeter/src/types.rs @@ -311,7 +311,7 @@ pub enum DatumType { impl DatumType { /// Return `true` if this datum type is cumulative, and `false` otherwise. - pub fn is_cumulative(&self) -> bool { + pub const fn is_cumulative(&self) -> bool { matches!( self, DatumType::CumulativeI64 @@ -331,9 +331,26 @@ impl DatumType { ) } + /// Return `true` if this datum type is a scalar, and `false` otherwise. + pub const fn is_scalar(&self) -> bool { + !self.is_histogram() + } + /// Return `true` if this datum type is a histogram, and `false` otherwise. pub const fn is_histogram(&self) -> bool { - matches!(self, DatumType::HistogramF64 | DatumType::HistogramI64) + matches!( + self, + DatumType::HistogramI8 + | DatumType::HistogramU8 + | DatumType::HistogramI16 + | DatumType::HistogramU16 + | DatumType::HistogramI32 + | DatumType::HistogramU32 + | DatumType::HistogramI64 + | DatumType::HistogramU64 + | DatumType::HistogramF32 + | DatumType::HistogramF64 + ) } } @@ -450,6 +467,11 @@ impl Datum { Datum::Missing(ref inner) => inner.start_time(), } } + + /// Return true if this datum is missing. + pub fn is_missing(&self) -> bool { + matches!(self, Datum::Missing(_)) + } } // Helper macro to generate `From` and `From<&T>` for the datum types. @@ -580,7 +602,7 @@ impl Measurement { /// Return true if this measurement represents a missing datum. pub fn is_missing(&self) -> bool { - matches!(self.datum, Datum::Missing(_)) + self.datum.is_missing() } /// Return the datum for this measurement diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index a2d853ac23..659b10c721 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -74,6 +74,7 @@ num-integer = { version = "0.1.46", features = ["i128"] } num-iter = { version = "0.1.44", default-features = false, features = ["i128"] } num-traits = { version = "0.2.18", features = ["i128", "libm"] } openapiv3 = { version = "2.0.0", default-features = false, features = ["skip_serializing_defaults"] } +peg-runtime = { version = "0.8.2", default-features = false, features = ["std"] } pem-rfc7468 = { version = "0.7.0", default-features = false, features = ["std"] } petgraph = { version = "0.6.4", features = ["serde-1"] } postgres-types = { version = "0.2.6", default-features = false, features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } @@ -179,6 +180,7 @@ num-integer = { version = "0.1.46", features = ["i128"] } num-iter = { version = "0.1.44", default-features = false, features = ["i128"] } num-traits = { version = "0.2.18", features = ["i128", "libm"] } openapiv3 = { version = "2.0.0", default-features = false, features = ["skip_serializing_defaults"] } +peg-runtime = { version = "0.8.2", default-features = false, features = ["std"] } pem-rfc7468 = { version = "0.7.0", default-features = false, features = ["std"] } petgraph = { version = "0.6.4", features = ["serde-1"] } postgres-types = { version = "0.2.6", default-features = false, features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] }
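For reference, a minimal sketch of how the pieces added above fit together. It is not part of the diff: the function name and the assumption that it lives inside the oximeter-db crate (under the `crate::oxql::query` module path, so the crate-private `coalesced_predicates` is visible) are illustrative, and the timeseries name is taken from the tests above. Only APIs introduced in this change are used.

// A usage sketch, assuming it lives somewhere inside oximeter-db.
use crate::oxql::query::Query;

fn oxql_query_sketch() {
    // Parsing infers the query end time from any `timestamp` filters; with
    // no upper bound named, it falls back to `Utc::now()`.
    let q = Query::new(
        "get physical_data_link:bytes_sent | filter timestamp > @now() - 1d",
    )
    .expect("a valid OxQL query");
    println!("query end time: {}", q.end_time());

    // Coalesce all filters so they can be pushed down toward the initial
    // `get`; with no outer predicate this returns the query's own filter,
    // which here names only `timestamp` and so survives intact.
    let pushed_down = q.coalesced_predicates(None);
    assert!(pushed_down.is_some());
}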