From 289146b142b32ccc1872dc278089752e89022a70 Mon Sep 17 00:00:00 2001 From: David Crespo Date: Tue, 3 Dec 2024 17:54:42 -0600 Subject: [PATCH] [nexus] Project-scoped OxQL endpoint (#6873) #7047 moves the existing `/v1/timeseries/query` endpoint (which requires the fleet viewer role) to `/v1/system/timeseries/query`. This PR is on top of #7047 and adds `/v1/timeseries/query?project=my-project`, using a sneaky trick to let us ensure that the user has read permissions on a project before we let them see metrics for that project. See https://github.com/oxidecomputer/omicron/issues/5298#issuecomment-2284591633 for a discussion of the OxQL authz problem more broadly. - [x] Add `/v1/timeseries/query` with required project query param - [x] Integration tests showing the authz works in an expected scenarios - ~~Add a list schemas endpoint that only lists schemas with a `project_id` field~~ - ~~Move schema filtering logic inside oximeter client~~ - ~~Fully test schema list endpoint~~ ## The trick 1. Require the user to say (in a query param) what project they're interested in 2. Look up that project and make sure they can read it 3. Jam `| filter silo_id == "" && project_id == ""` on the end of whatever query they passed in It sounds silly, but I talked it over with @bnaecker and we couldn't find any holes. If the user tries to fetch metrics from another project inside their query, the query will end up with the filter `project_id == "def" && project_id == "xyz"` and the result set will always be empty. If they try to query a metric that doesn't have a `project_id` on it, it will error out. It works! ## API design Initially I had the endpoint as `/v1/timeseries/query/project/{project}`. This is really horrible. It should be `/projects/`, but that doesn't feel any better. I also considered `/v1/projects/{project}/timeseries/query`, which has some precedent: https://github.com/oxidecomputer/omicron/blob/45f5f1cc2c7eec2a1de0a5143e85e0794134f175/nexus/external-api/output/nexus_tags.txt#L59-L64 But it also feels awful. I like the query param approach. Right now, there are only fleet-scoped metrics and project-scoped metrics, so requiring the project ID makes sense, but the query params are nicely flexible in that we could make project optional or add other optional fields and just do different auth checks based on what's passed in. Neither path nor operation ID mention projects.
Examples of fleet- and project-scoped metrics ## No list endpoint yet I backed out the schema list endpoint. We can't list project-scoped schemas because `authz_scope` is not in the database! See https://github.com/oxidecomputer/omicron/issues/5942. Currently the schema list endpoint hard-codes `authz_scope: Fleet`. https://github.com/oxidecomputer/omicron/blob/69de8b6288fde36fbcd6cabb4d632d62851230ad/oximeter/db/src/model/from_block.rs#L142 I am using the tag `hidden` on the query endpoint so that it goes in the OpenAPI definition (so I can use it in the console) but it will not show up on the docs site. --- nexus/external-api/output/nexus_tags.txt | 1 + nexus/external-api/src/lib.rs | 20 +++ nexus/src/app/metrics.rs | 70 +++++--- nexus/src/external_api/http_entrypoints.rs | 27 +++ nexus/tests/integration_tests/endpoints.rs | 23 ++- nexus/tests/integration_tests/metrics.rs | 187 +++++++++++++++++++-- openapi/nexus.json | 49 ++++++ 7 files changed, 337 insertions(+), 40 deletions(-) diff --git a/nexus/external-api/output/nexus_tags.txt b/nexus/external-api/output/nexus_tags.txt index 8102ebce08..a979a9804b 100644 --- a/nexus/external-api/output/nexus_tags.txt +++ b/nexus/external-api/output/nexus_tags.txt @@ -30,6 +30,7 @@ probe_create POST /experimental/v1/probes probe_delete DELETE /experimental/v1/probes/{probe} probe_list GET /experimental/v1/probes probe_view GET /experimental/v1/probes/{probe} +timeseries_query POST /v1/timeseries/query API operations found with tag "images" OPERATION ID METHOD URL PATH diff --git a/nexus/external-api/src/lib.rs b/nexus/external-api/src/lib.rs index 1c5c7c1d2d..e2b53a7e6f 100644 --- a/nexus/external-api/src/lib.rs +++ b/nexus/external-api/src/lib.rs @@ -2567,6 +2567,26 @@ pub trait NexusExternalApi { body: TypedBody, ) -> Result, HttpError>; + // TODO: list endpoint for project-scoped schemas is blocked on + // https://github.com/oxidecomputer/omicron/issues/5942: the authz scope for + // each schema is not stored in Clickhouse yet. + + /// Run project-scoped timeseries query + /// + /// Queries are written in OxQL. Project must be specified by name or ID in + /// URL query parameter. The OxQL query will only return timeseries data + /// from the specified project. + #[endpoint { + method = POST, + path = "/v1/timeseries/query", + tags = ["hidden"], + }] + async fn timeseries_query( + rqctx: RequestContext, + query_params: Query, + body: TypedBody, + ) -> Result, HttpError>; + // Updates /// Upload TUF repository diff --git a/nexus/src/app/metrics.rs b/nexus/src/app/metrics.rs index 40f7882281..5b77e681b1 100644 --- a/nexus/src/app/metrics.rs +++ b/nexus/src/app/metrics.rs @@ -140,28 +140,52 @@ impl super::Nexus { self.timeseries_client .oxql_query(query) .await - .map(|result| { - // TODO-observability: The query method returns information - // about the duration of the OxQL query and the database - // resource usage for each contained SQL query. We should - // publish this as a timeseries itself, so that we can track - // improvements to query processing. - // - // For now, simply return the tables alone. - result.tables - }) - .map_err(|e| match e { - oximeter_db::Error::DatabaseUnavailable(_) - | oximeter_db::Error::Connection(_) => { - Error::ServiceUnavailable { - internal_message: e.to_string(), - } - } - oximeter_db::Error::Oxql(_) - | oximeter_db::Error::TimeseriesNotFound(_) => { - Error::invalid_request(e.to_string()) - } - _ => Error::InternalError { internal_message: e.to_string() }, - }) + // TODO-observability: The query method returns information + // about the duration of the OxQL query and the database + // resource usage for each contained SQL query. We should + // publish this as a timeseries itself, so that we can track + // improvements to query processing. + // + // For now, simply return the tables alone. + .map(|result| result.tables) + .map_err(map_timeseries_err) + } + + /// Run an OxQL query against the timeseries database, scoped to a specific project. + pub(crate) async fn timeseries_query_project( + &self, + _opctx: &OpContext, + project_lookup: &lookup::Project<'_>, + query: impl AsRef, + ) -> Result, Error> { + // Ensure the user has read access to the project + let (authz_silo, authz_project) = + project_lookup.lookup_for(authz::Action::Read).await?; + + // Ensure the query only refers to the project + let filtered_query = format!( + "{} | filter silo_id == \"{}\" && project_id == \"{}\"", + query.as_ref(), + authz_silo.id(), + authz_project.id() + ); + + self.timeseries_client + .oxql_query(filtered_query) + .await + .map(|result| result.tables) + .map_err(map_timeseries_err) + } +} + +fn map_timeseries_err(e: oximeter_db::Error) -> Error { + match e { + oximeter_db::Error::DatabaseUnavailable(_) + | oximeter_db::Error::Connection(_) => Error::unavail(&e.to_string()), + oximeter_db::Error::Oxql(_) + | oximeter_db::Error::TimeseriesNotFound(_) => { + Error::invalid_request(e.to_string()) + } + _ => Error::internal_error(&e.to_string()), } } diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index a285542442..740895b7e4 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -5544,6 +5544,33 @@ impl NexusExternalApi for NexusExternalApiImpl { .await } + async fn timeseries_query( + rqctx: RequestContext, + query_params: Query, + body: TypedBody, + ) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + let opctx = + crate::context::op_context_for_external_api(&rqctx).await?; + let project_selector = query_params.into_inner(); + let query = body.into_inner().query; + let project_lookup = + nexus.project_lookup(&opctx, project_selector)?; + nexus + .timeseries_query_project(&opctx, &project_lookup, &query) + .await + .map(|tables| HttpResponseOk(views::OxqlQueryResult { tables })) + .map_err(HttpError::from) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + // Updates async fn system_update_put_repository( diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index 2e7b68eaca..466cae17a8 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -948,10 +948,14 @@ pub static DEMO_SILO_METRICS_URL: Lazy = Lazy::new(|| { ) }); -pub static TIMESERIES_LIST_URL: Lazy = +pub static TIMESERIES_QUERY_URL: Lazy = Lazy::new(|| { + format!("/v1/timeseries/query?project={}", *DEMO_PROJECT_NAME) +}); + +pub static SYSTEM_TIMESERIES_LIST_URL: Lazy = Lazy::new(|| String::from("/v1/system/timeseries/schemas")); -pub static TIMESERIES_QUERY_URL: Lazy = +pub static SYSTEM_TIMESERIES_QUERY_URL: Lazy = Lazy::new(|| String::from("/v1/system/timeseries/query")); pub static DEMO_TIMESERIES_QUERY: Lazy = @@ -2208,7 +2212,18 @@ pub static VERIFY_ENDPOINTS: Lazy> = Lazy::new(|| { }, VerifyEndpoint { - url: &TIMESERIES_LIST_URL, + url: &TIMESERIES_QUERY_URL, + visibility: Visibility::Protected, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![ + AllowedMethod::Post( + serde_json::to_value(&*DEMO_TIMESERIES_QUERY).unwrap() + ), + ], + }, + + VerifyEndpoint { + url: &SYSTEM_TIMESERIES_LIST_URL, visibility: Visibility::Public, unprivileged_access: UnprivilegedAccess::None, allowed_methods: vec![ @@ -2217,7 +2232,7 @@ pub static VERIFY_ENDPOINTS: Lazy> = Lazy::new(|| { }, VerifyEndpoint { - url: &TIMESERIES_QUERY_URL, + url: &SYSTEM_TIMESERIES_QUERY_URL, visibility: Visibility::Public, unprivileged_access: UnprivilegedAccess::None, allowed_methods: vec![ diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 33cf7e2073..7e5441c16a 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -9,16 +9,20 @@ use crate::integration_tests::instances::{ }; use chrono::Utc; use dropshot::test_util::ClientTestContext; -use dropshot::ResultsPage; +use dropshot::{HttpErrorResponseBody, ResultsPage}; use http::{Method, StatusCode}; +use nexus_auth::authn::USER_TEST_UNPRIVILEGED; +use nexus_db_queries::db::identity::Asset; +use nexus_test_utils::background::activate_background_task; use nexus_test_utils::http_testing::{AuthnMode, NexusRequest, RequestBuilder}; use nexus_test_utils::resource_helpers::{ create_default_ip_pool, create_disk, create_instance, create_project, - objects_list_page_authz, DiskTest, + grant_iam, object_create_error, objects_list_page_authz, DiskTest, }; use nexus_test_utils::wait_for_producer; use nexus_test_utils::ControlPlaneTestContext; use nexus_test_utils_macros::nexus_test; +use nexus_types::external_api::shared::ProjectRole; use nexus_types::external_api::views::OxqlQueryResult; use nexus_types::silo::DEFAULT_SILO_ID; use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; @@ -266,7 +270,7 @@ async fn test_metrics( /// Test that we can correctly list some timeseries schema. #[nexus_test] -async fn test_timeseries_schema_list( +async fn test_system_timeseries_schema_list( cptestctx: &ControlPlaneTestContext, ) { // Nexus registers itself as a metric producer on startup, with its own UUID @@ -298,16 +302,44 @@ async fn test_timeseries_schema_list( } /// Run an OxQL query until it succeeds or panics. -pub async fn timeseries_query_until_success( +pub async fn system_timeseries_query( cptestctx: &ControlPlaneTestContext, query: impl ToString, +) -> Vec { + timeseries_query_until_success( + cptestctx, + "/v1/system/timeseries/query", + query, + ) + .await +} + +/// Run a project-scoped OxQL query until it succeeds or panics. +pub async fn project_timeseries_query( + cptestctx: &ControlPlaneTestContext, + project: &str, + query: impl ToString, +) -> Vec { + timeseries_query_until_success( + cptestctx, + &format!("/v1/timeseries/query?project={}", project), + query, + ) + .await +} + +/// Run an OxQL query until it succeeds or panics. +async fn timeseries_query_until_success( + cptestctx: &ControlPlaneTestContext, + endpoint: &str, + query: impl ToString, ) -> Vec { const POLL_INTERVAL: Duration = Duration::from_secs(1); const POLL_MAX: Duration = Duration::from_secs(30); let query_ = query.to_string(); wait_for_condition( || async { - match timeseries_query(cptestctx, &query_).await { + match execute_timeseries_query(cptestctx, endpoint, &query_).await { Some(r) => Ok(r), None => Err(CondCheckError::<()>::NotYet), } @@ -331,8 +363,9 @@ pub async fn timeseries_query_until_success( /// This returns `None` if the query resulted in client error and the body /// indicates that a timeseries named in the query could not be found. In all /// other cases, it either succeeds or panics. -pub async fn timeseries_query( +pub async fn execute_timeseries_query( cptestctx: &ControlPlaneTestContext, + endpoint: &str, query: impl ToString, ) -> Option> { // first, make sure the latest timeseries have been collected. @@ -351,7 +384,7 @@ pub async fn timeseries_query( nexus_test_utils::http_testing::RequestBuilder::new( &cptestctx.external_client, http::Method::POST, - "/v1/system/timeseries/query", + endpoint, ) .body(Some(&body)), ) @@ -490,7 +523,7 @@ async fn test_instance_watcher_metrics( // activate the instance watcher background task. activate_instance_watcher().await; - let metrics = timeseries_query_until_success(&cptestctx, OXQL_QUERY).await; + let metrics = system_timeseries_query(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") @@ -506,7 +539,7 @@ async fn test_instance_watcher_metrics( // activate the instance watcher background task. activate_instance_watcher().await; - let metrics = timeseries_query_until_success(&cptestctx, OXQL_QUERY).await; + let metrics = system_timeseries_query(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") @@ -523,7 +556,7 @@ async fn test_instance_watcher_metrics( // activate the instance watcher background task. activate_instance_watcher().await; - let metrics = timeseries_query_until_success(&cptestctx, OXQL_QUERY).await; + let metrics = system_timeseries_query(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") @@ -548,7 +581,7 @@ async fn test_instance_watcher_metrics( // activate the instance watcher background task. activate_instance_watcher().await; - let metrics = timeseries_query_until_success(&cptestctx, OXQL_QUERY).await; + let metrics = system_timeseries_query(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") @@ -577,7 +610,7 @@ async fn test_instance_watcher_metrics( // activate the instance watcher background task. activate_instance_watcher().await; - let metrics = timeseries_query_until_success(&cptestctx, OXQL_QUERY).await; + let metrics = system_timeseries_query(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") @@ -597,6 +630,134 @@ async fn test_instance_watcher_metrics( assert_gte!(ts2_running, 2); } +#[nexus_test] +async fn test_project_timeseries_query( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + + create_default_ip_pool(&client).await; // needed for instance create to work + + // Create two projects + let p1 = create_project(&client, "project1").await; + let _p2 = create_project(&client, "project2").await; + + // Create resources in each project + let i1 = create_instance(&client, "project1", "instance1").await; + let _i2 = create_instance(&client, "project2", "instance2").await; + + let internal_client = &cptestctx.internal_client; + + // get the instance metrics to show up + let _ = + activate_background_task(&internal_client, "instance_watcher").await; + + // Query with no project specified + let q1 = "get virtual_machine:check"; + + let result = project_timeseries_query(&cptestctx, "project1", q1).await; + assert_eq!(result.len(), 1); + assert!(result[0].timeseries().len() > 0); + + // also works with project ID + let result = + project_timeseries_query(&cptestctx, &p1.identity.id.to_string(), q1) + .await; + assert_eq!(result.len(), 1); + assert!(result[0].timeseries().len() > 0); + + let result = project_timeseries_query(&cptestctx, "project2", q1).await; + assert_eq!(result.len(), 1); + assert!(result[0].timeseries().len() > 0); + + // with project specified + let q2 = &format!("{} | filter project_id == \"{}\"", q1, p1.identity.id); + + let result = project_timeseries_query(&cptestctx, "project1", q2).await; + assert_eq!(result.len(), 1); + assert!(result[0].timeseries().len() > 0); + + let result = project_timeseries_query(&cptestctx, "project2", q2).await; + assert_eq!(result.len(), 1); + assert_eq!(result[0].timeseries().len(), 0); + + // with instance specified + let q3 = &format!("{} | filter instance_id == \"{}\"", q1, i1.identity.id); + + // project containing instance gives me something + let result = project_timeseries_query(&cptestctx, "project1", q3).await; + assert_eq!(result.len(), 1); + assert_eq!(result[0].timeseries().len(), 1); + + // should be empty or error + let result = project_timeseries_query(&cptestctx, "project2", q3).await; + assert_eq!(result.len(), 1); + assert_eq!(result[0].timeseries().len(), 0); + + // expect error when querying a metric that has no project_id on it + let q4 = "get integration_target:integration_metric"; + let url = "/v1/timeseries/query?project=project1"; + let body = nexus_types::external_api::params::TimeseriesQuery { + query: q4.to_string(), + }; + let result = + object_create_error(client, url, &body, StatusCode::BAD_REQUEST).await; + assert_eq!(result.error_code.unwrap(), "InvalidRequest"); + // Notable that the error confirms that the metric exists and says what the + // fields are. This is helpful generally, but here it would be better if + // we could say something more like "you can't query this timeseries from + // this endpoint" + assert_eq!(result.message, "The filter expression contains identifiers that are not valid for its input timeseries. Invalid identifiers: [\"project_id\", \"silo_id\"], timeseries fields: {\"datum\", \"metric_name\", \"target_name\", \"timestamp\"}"); + + // nonexistent project + let url = "/v1/timeseries/query?project=nonexistent"; + let body = nexus_types::external_api::params::TimeseriesQuery { + query: q4.to_string(), + }; + let result = + object_create_error(client, url, &body, StatusCode::NOT_FOUND).await; + assert_eq!(result.message, "not found: project with name \"nonexistent\""); + + // unprivileged user gets 404 on project that exists, but which they can't read + let url = "/v1/timeseries/query?project=project1"; + let body = nexus_types::external_api::params::TimeseriesQuery { + query: q1.to_string(), + }; + + let request = RequestBuilder::new(client, Method::POST, url) + .body(Some(&body)) + .expect_status(Some(StatusCode::NOT_FOUND)); + let result = NexusRequest::new(request) + .authn_as(AuthnMode::UnprivilegedUser) + .execute() + .await + .unwrap() + .parsed_body::() + .unwrap(); + assert_eq!(result.message, "not found: project with name \"project1\""); + + // now grant the user access to that project only + grant_iam( + client, + "/v1/projects/project1", + ProjectRole::Viewer, + USER_TEST_UNPRIVILEGED.id(), + AuthnMode::PrivilegedUser, + ) + .await; + + // now they can access the timeseries. how cool is that + let request = RequestBuilder::new(client, Method::POST, url) + .body(Some(&body)) + .expect_status(Some(StatusCode::OK)); + let result = NexusRequest::new(request) + .authn_as(AuthnMode::UnprivilegedUser) + .execute_and_parse_unwrap::() + .await; + assert_eq!(result.tables.len(), 1); + assert_eq!(result.tables[0].timeseries().len(), 1); +} + #[nexus_test] async fn test_mgs_metrics( cptestctx: &ControlPlaneTestContext, @@ -763,7 +924,7 @@ async fn test_mgs_metrics( .try_force_collect() .await .expect("Could not force oximeter collection"); - let table = timeseries_query_until_success(&cptestctx, &query) + let table = system_timeseries_query(&cptestctx, &query) .await .into_iter() .find(|t| t.name() == name) diff --git a/openapi/nexus.json b/openapi/nexus.json index 79186e379a..c0b6a96fcf 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -8890,6 +8890,55 @@ } } }, + "/v1/timeseries/query": { + "post": { + "tags": [ + "hidden" + ], + "summary": "Run project-scoped timeseries query", + "description": "Queries are written in OxQL. Project must be specified by name or ID in URL query parameter. The OxQL query will only return timeseries data from the specified project.", + "operationId": "timeseries_query", + "parameters": [ + { + "in": "query", + "name": "project", + "description": "Name or ID of the project", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/TimeseriesQuery" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OxqlQueryResult" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/v1/users": { "get": { "tags": [