diff --git a/DESCRIPTION b/DESCRIPTION index 50e0da0..26d9f85 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: cbioportalR Title: Browse and Query Clinical and Genomic Data from cBioPortal -Version: 1.0.1.9000 +Version: 1.0.1.9002 Authors@R: c(person(given = "Karissa", family = "Whiting", diff --git a/NAMESPACE b/NAMESPACE index 4342e5a..cd88902 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -42,6 +42,8 @@ export(get_mutations_by_sample) export(get_mutations_by_study) export(get_panel_by_sample) export(get_samples_by_patient) +export(get_segments_by_sample) +export(get_segments_by_study) export(get_structural_variants_by_sample) export(get_structural_variants_by_study) export(get_study_info) diff --git a/NEWS.md b/NEWS.md index 867ff6a..303bf15 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,7 @@ * Added `available_sample_lists()` function which returns all available sample list IDs for a given study ID * Added `sample_list_id` argument to `available_samples()` which returns all samples IDs in specific sample list within a study (#53). * Fixed {cli} errors to make package compatible with {cli} v3.4.1. +* Added CNA segmentation retrieval endpoint accessible via `get_segments_by_sample()` and `get_segments_by_study()`). You can use `get_genomics_by_*(return_segments = TRUE)` as well to access this data. # cbioportalR 1.0.1 diff --git a/R/by_database.R b/R/by_database.R index fbe5d3b..353fc23 100644 --- a/R/by_database.R +++ b/R/by_database.R @@ -18,7 +18,7 @@ available_studies <- function(base_url = NULL) { url_path <- paste0("studies/") res <- cbp_api(url_path, base_url = base_url)$content %>% dplyr::bind_rows(.) %>% - select(.data$studyId, everything()) + select("studyId", everything()) return(res) } diff --git a/R/by_genes_or_panel.R b/R/by_genes_or_panel.R index 14070e4..7b76c76 100644 --- a/R/by_genes_or_panel.R +++ b/R/by_genes_or_panel.R @@ -101,8 +101,8 @@ get_alias <- function(hugo_symbol = NULL, mutate(alias = purrr::map(.data$url_path, ~cbp_api(.x, base_url = base_url)$content)) %>% mutate(alias = purrr::simplify_all(.data$alias)) %>% - tidyr::unnest(.data$alias) %>% - select(-.data$url_path) + tidyr::unnest("alias") %>% + select(-"url_path") return(res) } @@ -133,8 +133,8 @@ get_gene_panel <- function(panel_id = NULL, base_url = NULL) { tib %>% mutate(data = purrr::map(.data$genes, ~as_tibble(.x))) %>% - select(.data$genePanelId, .data$data, .data$description) %>% - tidyr::unnest(cols = .data$data) + select("genePanelId", "data", "description") %>% + tidyr::unnest(cols = "data") }) diff --git a/R/by_patients.R b/R/by_patients.R index 2aad2f5..20dea2f 100644 --- a/R/by_patients.R +++ b/R/by_patients.R @@ -40,8 +40,8 @@ get_samples_by_patient <- function(patient_id = NULL, res <- cbp_api(url_path = x, base_url = base_url) res$content df <- as.data.frame(res$content) %>% - select(.data$patientId, .data$sampleId, - .data$sampleType, .data$studyId) + select("patientId", "sampleId", + "sampleType", "studyId") df }) @@ -109,10 +109,8 @@ get_clinical_by_patient <- function(study_id = NULL, # get study ID - resolved_study_id <- study_id %>% - purrr::when(!is.null(.) ~ ., - ~ suppressMessages(.guess_study_id(study_id, resolved_url))) - + resolved_study_id <- study_id %||% + suppressMessages(.guess_study_id(study_id, resolved_url)) # create lookup dataframe - @@ -127,7 +125,7 @@ get_clinical_by_patient <- function(study_id = NULL, # Prep data frame for Query ------------------------------------------------ patient_study_pairs_nest <- patient_study_pairs %>% group_by(.data$study_id) %>% - tidyr::nest(sample_id_nest = .data$patient_id) + tidyr::nest(sample_id_nest = "patient_id") # Query -------------------------------------------------------------------- df <- purrr::map2_dfr(patient_study_pairs_nest$study_id, patient_study_pairs_nest$sample_id_nest, @@ -164,7 +162,7 @@ get_clinical_by_patient <- function(study_id = NULL, resolved_clinical_attributes <- clinical_attribute %||% (available_clinical_attributes(study_id, base_url = base_url) %>% - pull(.data$clinicalAttributeId) %>% + dplyr::pull(.data$clinicalAttributeId) %>% unique()) if(is.null(clinical_attribute)) { diff --git a/R/by_samples.R b/R/by_samples.R index 8b26f45..5fe8f29 100644 --- a/R/by_samples.R +++ b/R/by_samples.R @@ -51,9 +51,8 @@ get_clinical_by_sample <- function(study_id = NULL, # get study ID - resolved_study_id <- study_id %>% - purrr::when(!is.null(.) ~ ., - ~ suppressMessages(.guess_study_id(study_id, resolved_url))) + resolved_study_id <- study_id %||% + suppressMessages(.guess_study_id(study_id, resolved_url)) @@ -69,7 +68,7 @@ get_clinical_by_sample <- function(study_id = NULL, # Prep data frame for Query ------------------------------------------------ sample_study_pairs_nest <- sample_study_pairs %>% group_by(.data$study_id) %>% - tidyr::nest(sample_id_nest = .data$sample_id) + tidyr::nest(sample_id_nest = "sample_id") # Query -------------------------------------------------------------------- df <- purrr::map2_dfr(sample_study_pairs_nest$study_id, sample_study_pairs_nest$sample_id_nest, @@ -102,7 +101,7 @@ get_clinical_by_sample <- function(study_id = NULL, resolved_clinical_attributes <- clinical_attribute %||% (available_clinical_attributes(study_id, base_url = base_url) %>% - pull(.data$clinicalAttributeId) %>% + dplyr::pull(.data$clinicalAttributeId) %>% unique()) if(is.null(clinical_attribute)) { @@ -159,13 +158,13 @@ get_panel_by_sample <- function(study_id = NULL, base_url = base_url) - res %>% - purrr::when(nrow(.) < 1 ~ cli::cli_abort("No gene panel data found. Did you specify the correct {.code study_id} for your {.code sample_id}? - Is {.val GENE_PANEL} an available clinical attribute in your queried studies?"), - - TRUE ~ transmute(., .data$sampleId, .data$studyId, genePanel = .data$value)) - + if(nrow(res) < 1) { + cli::cli_abort(c("No gene panel data found. Did you specify the correct {.code study_id} for your {.code sample_id}? ,", + "Is {.val GENE_PANEL} an available clinical attribute in your queried studies?")) + } + res <- transmute(res, .data$sampleId, .data$studyId, genePanel = .data$value) + return(res) } diff --git a/R/by_studies.R b/R/by_studies.R index 00de2b3..c9affbc 100644 --- a/R/by_studies.R +++ b/R/by_studies.R @@ -98,11 +98,17 @@ get_clinical_by_study <- function(study_id = NULL, df_samp <- purrr::map_df(res$content, ~ tibble::as_tibble(.x)) # filter selected clinical attributes if not NULL + df_samp <- + if(nrow(df_samp) > 0 & is_not_null(clinical_attribute)) { + filter(df_samp, .data$clinicalAttributeId %in% clinical_attribute) + + } else { + cli_alert_warning("Sample Level Clinical Data: No {.var clinical_attribute} passed. Defaulting to returning all clinical attributes in {.val {study_id}} study") + df_samp + + } + df_samp <- df_samp %>% - purrr::when( - nrow(df_samp) > 0 & !is.null(clinical_attribute) ~ filter(., clinicalAttributeId %in% clinical_attribute), - ~{cli_alert_warning("Sample Level Clinical Data: No {.var clinical_attribute} passed. Defaulting to returning all clinical attributes in {.val {study_id}} study") - .}) %>% mutate(dataLevel = "SAMPLE") @@ -117,11 +123,15 @@ get_clinical_by_study <- function(study_id = NULL, df_pat <- purrr::map_df(res$content, ~ tibble::as_tibble(.x)) # filter selected clinical attributes if not NULL + df_pat <- + if(nrow(df_pat) > 0 & !is.null(clinical_attribute)) { + filter(df_pat, .data$clinicalAttributeId %in% clinical_attribute) + } else { + cli_alert_warning("Patient Level Clinical Data: No {.var clinical_attribute} passed. Defaulting to returning all clinical attributes in {.val {study_id}} study") + df_pat + } + df_pat <- df_pat %>% - purrr::when( - nrow(df_pat) > 0 & !is.null(clinical_attribute) ~ filter(., clinicalAttributeId %in% clinical_attribute), - ~{cli_alert_warning("Patient Level Clinical Data: No {.var clinical_attribute} passed. Defaulting to returning all clinical attributes in {.val {study_id}} study") - .})%>% mutate(dataLevel = "PATIENT", sampleId = NA_character_) @@ -213,8 +223,8 @@ available_samples <- function(study_id = NULL, sample_list_id = NULL, res$content df <- bind_rows(res$content) %>% select( - .data$patientId, .data$sampleId, - .data$sampleType, .data$studyId + "patientId", "sampleId", + "sampleType", "studyId" ) df }) @@ -254,7 +264,7 @@ available_patients <- function(study_id = NULL, res <- cbp_api(url_path = x, base_url = base_url) res$content df <- bind_rows(res$content) %>% - select(.data$patientId, .data$studyId) + select("patientId", "studyId") df }) diff --git a/R/genomics_by_sample.R b/R/genomics_by_sample.R index 25765e1..5d58d7e 100644 --- a/R/genomics_by_sample.R +++ b/R/genomics_by_sample.R @@ -64,7 +64,7 @@ study_id = NULL, molecular_profile_id = NULL, sample_study_pairs = NULL, - data_type = c("mutation", "cna", "fusion", "structural_variant"), + data_type = c("mutation", "cna", "fusion", "structural_variant", "segment"), genes = NULL, panel = NULL, @@ -87,16 +87,20 @@ if(length(study_id) > 1 | length(molecular_profile_id) > 1) { cli::cli_abort("More than 1 {.code study_id} or {.code molecular_profile_id} was passed. Please use the {.code sample_study_pairs} argument instead") } - data_type <- match.arg(data_type) %>% - purrr::when(. == "structural_variant" ~ "fusion", - TRUE ~ .) + + data_type <- match.arg(data_type) + + data_type <- dplyr::case_when( + data_type == "structural_variant" ~ "fusion", + TRUE ~ data_type) # this has to go in query URL url_data_type <- switch( data_type, "mutation" = "mutations", "fusion" = "structural-variant", - "cna" = "discrete-copy-number") + "cna" = "discrete-copy-number", + "segment" = "copy-number-segments") # Make Informed guesses on parameters --------------------------------------- @@ -111,20 +115,19 @@ # Get study ID --------- - resolved_study_id <- study_id %>% - purrr::when(!is.null(.) ~ ., - !is.null(molecular_profile_id) ~ .lookup_study_name(molecular_profile_id = molecular_profile_id, - study_id = ., - base_url = base_url), - # if both NULL - ~ suppressMessages(.guess_study_id(study_id, resolved_url))) - + resolved_study_id <- study_id %||% { + if(is_not_null(molecular_profile_id)) { + .lookup_study_name(molecular_profile_id = molecular_profile_id, + study_id = study_id, + base_url = base_url) + } else { + suppressMessages(.guess_study_id(study_id, resolved_url)) + } + } # Get molecular profile ID --------- - resolved_molecular_profile_id <- molecular_profile_id %>% - purrr::when( - !is.null(.) ~ ., - ~ .lookup_profile_name(data_type, study_id = resolved_study_id, base_url)) + resolved_molecular_profile_id <- molecular_profile_id %||% + .lookup_profile_name(data_type, resolved_study_id, base_url = base_url) @@ -145,13 +148,12 @@ panel_genes <- .get_panel_entrez(panel_id = panel, base_url = base_url) # if genes arg passed, get gene IDs if panel, or entrez IDs if hugo - resolved_genes <- genes %>% - purrr::when( - is.character(.) ~ { -# cli::cli_inform("Hugo symbols were converted to entrez IDs in order to query the cBioPortal API (see {.code ?get_entrez_id} for more info)") - get_entrez_id(., base_url = base_url)$entrezGeneId - }, - TRUE ~ .) + resolved_genes <- + if(is.character(genes)) { + get_entrez_id(genes, base_url = base_url)$entrezGeneId + } else { + genes + } resolved_genes <- c(panel_genes, resolved_genes) %>% unique() @@ -162,7 +164,7 @@ # If user passes study_id and data_type we can pull the correct molecular ID if(!("molecular_profile_id" %in% colnames(sample_study_pairs))) { - unique_study_id <- distinct(select(sample_study_pairs, .data$study_id)) %>% + unique_study_id <- distinct(select(sample_study_pairs, "study_id")) %>% mutate(molecular_profile_id = purrr::map(.data$study_id, ~.lookup_profile_name(.x, @@ -182,12 +184,12 @@ sample_study_pairs_nest <- sample_study_pairs %>% group_by(.data$study_id, .data$molecular_profile_id) %>% - tidyr::nest(sample_id_nest = .data$sample_id) %>% + tidyr::nest(sample_id_nest = "sample_id") %>% mutate(url_path = paste0("molecular-profiles/", .data$molecular_profile_id, "/", url_data_type, "/fetch?")) %>% ungroup() %>% - select(.data$url_path, .data$sample_id_nest) + select("url_path", "sample_id_nest") quer_res <- purrr::map2_dfr( sample_study_pairs_nest$url_path, @@ -218,28 +220,36 @@ } - # * FUSIONS query ---------------------------------------------------------------------- + # * FUSIONS/SEGMENTS query ---------------------------------------------------------------------- + + # POST: /structural-variant/fetch --- + # BODY: sampleMolecularIdentifiers: molecularProfileId, sampleId # Fusions endpoint works a little differently than Mut and CNA # Instead of passing a sample list, you pass individual sample IDs (retrieved using list) # Main Goal in this function is to return all results without specifying specific genes to query (as is needed in other endpoints). - if(data_type == "fusion") { + if(data_type %in% c("fusion", "segment")) { - quer_res <- purrr::map2_dfr( - sample_study_pairs$sample_id, - sample_study_pairs$molecular_profile_id, - - function(x, y) { + quer_res <- purrr::pmap_dfr( + sample_study_pairs, + function(sample_id, molecular_profile_id, study_id) { - body_n <- list( - sampleMolecularIdentifiers = as.data.frame(list( - molecularProfileId = jsonlite::unbox(y), - sampleId = x - )) - ) - + body_n <- switch(data_type, + fusion = { + list( + sampleMolecularIdentifiers = as.data.frame(list( + molecularProfileId = jsonlite::unbox(molecular_profile_id), + sampleId = sample_id + )) + ) + }, + segment = { + data.frame( + "sampleId" = sample_id, + "studyId" = study_id) + }) res <- cbp_api(url_path = paste0(url_data_type, "/fetch?"), method = "post", @@ -259,11 +269,16 @@ # Since you don't query by genes, filter genes at end so behaviour is consistent # with mutation/cna endpoints where you have to specify genes to query - df <- df_fus %>% - purrr::when( - (nrow(.) > 0 & !is.null(resolved_genes)) ~ filter(., (.data$site1EntrezGeneId %in% resolved_genes) | (.data$site2EntrezGeneId %in% resolved_genes)), - TRUE ~ .) - + df <- switch(data_type, + fusion = { + if (nrow(df_fus) > 0 & !is.null(resolved_genes)) { + filter(df_fus, (.data$site1EntrezGeneId %in% resolved_genes) | (.data$site2EntrezGeneId %in% resolved_genes)) + } else { + df_fus + } + }, + segment = df_fus + ) } @@ -275,6 +290,7 @@ # Fusions already has hugo by default from API df <- switch(data_type, "fusion" = df, + "segment" = df, "mutation" = if(nrow(df) > 0) .lookup_hugo(df, base_url = base_url), "cna" = if(nrow(df) > 0) .lookup_hugo(df, base_url = base_url)) @@ -416,17 +432,55 @@ get_fusions_by_sample <- function(sample_id = NULL, get_structural_variants_by_sample <- get_fusions_by_sample + +#' Get Copy Number Segmentation Data By Sample ID +#' +#' @inheritParams .get_data_by_sample +#' @return A dataframe of CNA segments +#' @export +#' +#' @examples +#' \dontrun{ +#' set_cbioportal_db("public") +#' +#' get_segments_by_sample(sample_id = c("s_C_CAUWT7_P001_d"), +#' study_id = "prad_msk_2019") +#' } + + +get_segments_by_sample <- function(sample_id = NULL, + study_id = NULL, + sample_study_pairs = NULL, + base_url = NULL) { + + .get_data_by_sample(sample_id = sample_id, + study_id = study_id, + molecular_profile_id = NULL, + sample_study_pairs = sample_study_pairs, + data_type = c("segment"), + genes = NULL, + panel = NULL, + # this shouldn't matter for seg data + add_hugo = TRUE, + base_url = base_url) + + +} + #' Get All Genomic Information By Sample IDs #' #' @inheritParams .get_data_by_sample -#' @return A list of mutations, cna and structural variants (including fusions), if available. +#' @param return_segments Default is `FALSE` where copy number segmentation data won't be returned in addition to the mutation, cna and structural variant data. +#' `TRUE` will return any available segmentation data with results. +#' @return A list of mutations, cna and structural variants (including fusions), if available. Will also return copy number segmentation data if `return_segments = TRUE`. #' @export #' #' #' @examples #' \dontrun{ #' get_genetics_by_sample(sample_id = c("TCGA-OR-A5J2-01","TCGA-OR-A5J6-01"), -#' study_id = "acc_tcga") +#' study_id = "acc_tcga", +#' return_segments = TRUE) #' } # get_genetics_by_sample <- function(sample_id = NULL, @@ -435,11 +489,18 @@ get_genetics_by_sample <- function(sample_id = NULL, genes = NULL, panel = NULL, add_hugo = TRUE, - base_url = NULL) { + base_url = NULL, + return_segments = FALSE) { + + data_types <- c("mutation", "cna", "structural_variant") + + if(return_segments) { + data_types <- c(data_types, "segment") + } safe_get_data <- purrr::safely(.get_data_by_sample, quiet = TRUE) - res <- c("mutation", "cna", "structural_variant") %>% + res <- data_types %>% purrr::set_names() %>% purrr::map(., function(x) { safe_get_data(sample_id = sample_id, diff --git a/R/genomics_by_study.R b/R/genomics_by_study.R index e24bcdf..6e18135 100644 --- a/R/genomics_by_study.R +++ b/R/genomics_by_study.R @@ -11,7 +11,8 @@ #' on molecular_profile_id. #' @param molecular_profile_id a molecular profile to query mutations. #' If NULL, guesses molecular_profile_id based on study ID. -#' @param data_type specify what type of data to return. Options are`mutation`, `cna`, `fusion`, or `structural_variant` (same as `fusion`). +#' @param data_type specify what type of data to return. Options are`mutation`, `cna`, `fusion`, or`structural_variant` (same as `fusion`), +#' and `segment` (copy number segmentation data).. #' @param add_hugo Logical indicating whether `HugoGeneSymbol` should be added to your resulting data frame, if not already present in raw API results. #' Argument is `TRUE` by default. If `FALSE`, results will be returned as is (i.e. any existing Hugo Symbol columns in raw results will not be removed). #' @param base_url The database URL to query @@ -35,7 +36,7 @@ #' .get_data_by_study <- function(study_id = NULL, molecular_profile_id = NULL, - data_type = c("mutation", "cna", "fusion", "structural_variant"), + data_type = c("mutation", "cna", "fusion", "structural_variant", "segment"), base_url = NULL, add_hugo = TRUE) { @@ -51,20 +52,24 @@ } # fusions and structural_variants are the same. fusion is older nomenclature. - data_type <- match.arg(data_type) %>% - purrr::when(. == "structural_variant" ~ "fusion", - TRUE ~ .) + data_type <- match.arg(data_type) - # study ID provided and profile is NULL - # If study ID is not correct, informative error thrown - molecular_profile_id <- molecular_profile_id %||% - .lookup_profile_name(data_type, study_id, base_url = base_url) + data_type <- dplyr::case_when( + data_type == "structural_variant" ~ "fusion", + TRUE ~ data_type) + if(data_type != "segment") { + # study ID provided and profile is NULL + # If study ID is not correct, informative error thrown + molecular_profile_id <- molecular_profile_id %||% + .lookup_profile_name(data_type, study_id, base_url = base_url) - # if study_id is NULL or not NULL molecular profile ID can't be NULL - study_id <- .lookup_study_name(molecular_profile_id = molecular_profile_id, - study_id = study_id, - base_url = base_url) + + # if study_id is NULL or not NULL molecular profile ID can't be NULL + study_id <- .lookup_study_name(molecular_profile_id = molecular_profile_id, + study_id = study_id, + base_url = base_url) + } # this text goes in query URL path @@ -72,7 +77,8 @@ data_type, "mutation" = "mutations", "fusion" = "structural-variant", - "cna" = "discrete-copy-number") + "cna" = "discrete-copy-number", + "segment" = "copy-number-segments") # Some API endpoints require that you pass a sample list ID. All studies should have an "all" list which is the default for this function sample_list_id <- paste0(study_id, "_all") @@ -80,6 +86,8 @@ # MUTATION/CNA query ---------------------------------------------------------------------- + # GET: /molecular-profiles/{molecularProfileId}/molecular-data?{sample_list_id} + if(data_type %in% c("mutation", "cna")) { url_list <- paste0( @@ -102,13 +110,16 @@ }) } - # FUSIONS query ---------------------------------------------------------------------- + # FUSIONS/SEGMENTS query ---------------------------------------------------------------------- + + # POST: /structural-variant/fetch --- + # BODY: sampleMolecularIdentifiers: molecularProfileId, sampleId # Fusions endpoint works a little differently than Mutation and CNA # Instead of passing a sample list, you pass individual sample IDs (retrieved using list) # Main Goal in this function is to return all results without specifying specific genes to query (as is needed in other endpoints). - if(data_type == "fusion") { + if(data_type %in% c("fusion", "segment")) { # Need to get all sample IDs in study for fusion retrieval sample_list_url <- paste0("sample-lists/", sample_list_id) @@ -122,13 +133,18 @@ fus_imp <- purrr::map_dfr(all_samples_in_study, function(x) { - body <- list( - sampleMolecularIdentifiers = as.data.frame(list( - molecularProfileId = jsonlite::unbox(molecular_profile_id), - sampleId = x - )) - ) - + body <- switch(data_type, + fusion = { + list( + sampleMolecularIdentifiers = as.data.frame(list( + molecularProfileId = jsonlite::unbox(molecular_profile_id), + sampleId = x))) + }, + segment = { + data.frame( + "sampleId" = x, + "studyId" = study_id) + }) fus <- cbp_api( url_path = paste0(url_data_type, "/fetch?"), @@ -154,15 +170,24 @@ if(add_hugo) { - # Fusions already has hugo by default from API + # Fusions already has hugo by default from API, segment doesn't need df <- switch(data_type, "fusion" = df, + "segment" = df, "mutation" = if(nrow(df) > 0) .lookup_hugo(df, base_url = base_url), "cna" = if(nrow(df) > 0) .lookup_hugo(df, base_url = base_url)) } - cli::cli_alert_info("Returning all data for the {.val {molecular_profile_id}} molecular profile in the {.val {study_id}} study") + alert_string = ifelse(data_type != "segment", + "Returning all data for the {.val {molecular_profile_id}} molecular profile in the {.val {study_id}} study", + "Returning all {.val copy number segmentation} data for the {.val {study_id}} study") + + if(data_type == "segment" & nrow(df) == 0) { + cli::cli_abort("No {.val copy number segmentation} data found for {.val {study_id}} study") + } + + cli::cli_alert_info(alert_string) return(df) @@ -250,10 +275,37 @@ get_fusions_by_study <- function(study_id = NULL, #' @export get_structural_variants_by_study <- get_fusions_by_study + +#' Get Copy Number Segmentation Data By Study +#' +#' @inheritParams .get_data_by_study +#' @return A dataframe of CNA segments +#' @export +#' @examples +#' \dontrun{ +#' get_segments_by_study(study_id = "prad_msk_2019") +#' get_segments_by_study(molecular_profile_id = "prad_msk_2019_cna") +#' } + +get_segments_by_study <- function(study_id = NULL, + add_hugo = TRUE, + base_url = NULL) { + + .get_data_by_study(study_id = study_id, + molecular_profile_id = NULL, + data_type = c("segment"), + add_hugo = add_hugo, + base_url = base_url) +} + + + #' Get All Genomic Information By Study #' #' @inheritParams .get_data_by_study -#' @return A list of mutations, cna and structural variants (including fusions), if available. +#' @param return_segments Default is `FALSE` where copy number segmentation data won't be returned in addition to the mutation, cna and structural variant data. +#' `TRUE` will return any available segmentation data with results. +#' @return A list of mutations, cna and structural variants (including fusions), if available. Will also return copy number segmentation data if `return_segments = TRUE`. #' @export #' @examples #' \dontrun{ @@ -262,7 +314,8 @@ get_structural_variants_by_study <- get_fusions_by_study # get_genetics_by_study <- function(study_id = NULL, add_hugo = TRUE, - base_url = NULL) { + base_url = NULL, + return_segments = FALSE) { # ** Not using `.check_for_study_id()` here because we allow no study ID to be passed, # but still don't allow study_id > 1. Maybe generalized that check function to @@ -270,10 +323,15 @@ get_genetics_by_study <- function(study_id = NULL, if(length(study_id) > 1) { cli::cli_abort(c("{.code length(study_id)} must be 1. You can only pass one {.code study_id} at a time"))} + data_types <- c("mutation", "cna", "structural_variant") + + if(return_segments) { + data_types <- c(data_types, "segment") + } safe_get_data <- purrr::safely(.get_data_by_study, quiet = TRUE) - res <- c("mutation", "cna", "structural_variant") %>% + res <- data_types %>% purrr::set_names() %>% purrr::map(., function(x) { safe_get_data(study_id = study_id, diff --git a/R/utils.R b/R/utils.R index f0f2e66..2693b3c 100644 --- a/R/utils.R +++ b/R/utils.R @@ -9,11 +9,14 @@ #' .check_for_study_id <- function(study_id) { - study_id %>% purrr::when( - is.null(.) ~ cli::cli_abort(c("You must provide a {.code study_id}. See {.code get_studies()} to view available studies on your database")), - length(.) > 1 ~ cli::cli_abort(c("{.code length(study_id)} must be 1. You can only pass one {.code study_id} at a time")), - ~ NULL - ) + study_id %||% + cli::cli_abort(c("You must provide a {.code study_id}. See {.code get_studies()} to view available studies on your database")) + + if(length(study_id) > 1) { + cli::cli_abort(c("{.code length(study_id)} must be 1. You can only pass one {.code study_id} at a time")) + } else { + NULL + } } @@ -73,17 +76,18 @@ accepted_names <- c(final_names, stringr::str_remove_all(final_names, "_")) - output_df <- input_df %>% - purrr::when( - !(any(stringr::str_detect(names(.), paste0(accepted_names[c(1, 3)], collapse = "|"))) & - any(stringr::str_detect(names(.), paste0(accepted_names[c(2, 4)], collapse = "|")))) ~ - cli::cli_abort("{arg_name} must have the following columns: {final_names}"), - TRUE ~ select( - ., (contains("sample") | contains("patient")), - contains("study") - ) %>% + output_df <- + if(!(any(stringr::str_detect(names(input_df), paste0(accepted_names[c(1, 3)], collapse = "|"))) & + any(stringr::str_detect(names(input_df), paste0(accepted_names[c(2, 4)], collapse = "|"))))) { + + cli::cli_abort("{arg_name} must have the following columns: {final_names}") + } else { + select(input_df, (contains("sample") | + contains("patient")), + contains("study")) %>% purrr::set_names(final_names) - ) + } + # if molecular_profile_id passed, keep it optional_molec <- c("molecular_profile_id", @@ -148,12 +152,13 @@ resolved_profile <- switch(data_type, mutation = filter(profs, .data$molecularAlterationType == "MUTATION_EXTENDED") %>% - pull(.data$molecularProfileId), + dplyr::pull(.data$molecularProfileId), fusion = filter(profs, .data$molecularAlterationType == "STRUCTURAL_VARIANT") %>% - pull(.data$molecularProfileId), + dplyr::pull(.data$molecularProfileId), cna = filter(profs, .data$molecularAlterationType == "COPY_NUMBER_ALTERATION" & .data$datatype == "DISCRETE") %>% - pull(.data$molecularProfileId)) + dplyr::pull(.data$molecularProfileId), + segment = "Not Applicable") if(length(resolved_profile) == 0) { @@ -180,6 +185,7 @@ # study_id = NULL- will return all studies quietly # If study ID is supplied but wrong (doesn't exist in database) this will fail quiet_available_profiles <- purrr::quietly(available_profiles) + profs <- tryCatch( # ** Maybe there can be a better API fail message that propagates throughout because base_url should always be checked/throw error before @@ -190,7 +196,7 @@ resolved_study_id <- profs$result %>% filter(.data$molecularProfileId == molecular_profile_id) %>% - pull(.data$studyId) + dplyr::pull(.data$studyId) if(length(resolved_study_id) == 0) { cli::cli_abort("Molecular profile {.val {molecular_profile_id}} doesn't exist, or molecular profile doesn't match the {.val study_id} you passed. See {.code available_profiles()} or {.code available_studies()}") @@ -222,10 +228,10 @@ } hugo <- get_hugo_symbol(unique(df$entrezGeneId)) %>% - select(-.data$type) + select(-"type") df_with_hugo <- left_join(df, hugo, by = "entrezGeneId" ) %>% - select(.data$hugoGeneSymbol, .data$entrezGeneId, everything()) + select("hugoGeneSymbol", "entrezGeneId", everything()) # If there happens to be more than 1 hugo per entrez if(!(nrow(df_with_hugo) == nrow(df))) { @@ -276,3 +282,16 @@ } + +#' Check if NULL +#' +#' @param x any R object or expression +#' +#' @noRd +#' @keywords internal +#' +#' @examples +#' is_not_null(NULL) +is_not_null <- function(x) { + !is.null(x) +} diff --git a/codemeta.json b/codemeta.json index 1bbf48d..95cd4a4 100644 --- a/codemeta.json +++ b/codemeta.json @@ -8,13 +8,13 @@ "codeRepository": "https://github.com/karissawhiting/cbioportalR", "issueTracker": "https://github.com/karissawhiting/cbioportalR/issues", "license": "https://spdx.org/licenses/MIT", - "version": "1.0.1.9000", + "version": "1.0.1.9002", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", "url": "https://r-project.org" }, - "runtimePlatform": "R version 4.2.1 (2022-06-23)", + "runtimePlatform": "R version 4.2.2 (2022-10-31)", "provider": { "@id": "https://cran.r-project.org", "@type": "Organization", @@ -258,7 +258,7 @@ }, "SystemRequirements": null }, - "fileSize": "1961.656KB", + "fileSize": "2286.409KB", "releaseNotes": "https://github.com/karissawhiting/cbioportalR/blob/master/NEWS.md", "readme": "https://github.com/karissawhiting/cbioportalR/blob/main/README.md", "contIntegration": ["https://github.com/karissawhiting/cbioportalR/actions", "https://app.codecov.io/gh/karissawhiting/cbioportalR?branch=master"] diff --git a/man/dot-get_data_by_sample.Rd b/man/dot-get_data_by_sample.Rd index c145a6b..9a41aca 100644 --- a/man/dot-get_data_by_sample.Rd +++ b/man/dot-get_data_by_sample.Rd @@ -9,7 +9,7 @@ study_id = NULL, molecular_profile_id = NULL, sample_study_pairs = NULL, - data_type = c("mutation", "cna", "fusion", "structural_variant"), + data_type = c("mutation", "cna", "fusion", "structural_variant", "segment"), genes = NULL, panel = NULL, add_hugo = TRUE, diff --git a/man/dot-get_data_by_study.Rd b/man/dot-get_data_by_study.Rd index c69760f..396a64b 100644 --- a/man/dot-get_data_by_study.Rd +++ b/man/dot-get_data_by_study.Rd @@ -7,7 +7,7 @@ .get_data_by_study( study_id = NULL, molecular_profile_id = NULL, - data_type = c("mutation", "cna", "fusion", "structural_variant"), + data_type = c("mutation", "cna", "fusion", "structural_variant", "segment"), base_url = NULL, add_hugo = TRUE ) @@ -19,7 +19,8 @@ on molecular_profile_id.} \item{molecular_profile_id}{a molecular profile to query mutations. If NULL, guesses molecular_profile_id based on study ID.} -\item{data_type}{specify what type of data to return. Options are\code{mutation}, \code{cna}, \code{fusion}, or \code{structural_variant} (same as \code{fusion}).} +\item{data_type}{specify what type of data to return. Options are\code{mutation}, \code{cna}, \code{fusion}, or\code{structural_variant} (same as \code{fusion}), +and \code{segment} (copy number segmentation data)..} \item{base_url}{The database URL to query If \code{NULL} will default to URL set with \verb{set_cbioportal_db()}} diff --git a/man/get_genetics_by_sample.Rd b/man/get_genetics_by_sample.Rd index 11272ce..4804cfb 100644 --- a/man/get_genetics_by_sample.Rd +++ b/man/get_genetics_by_sample.Rd @@ -11,7 +11,8 @@ get_genetics_by_sample( genes = NULL, panel = NULL, add_hugo = TRUE, - base_url = NULL + base_url = NULL, + return_segments = FALSE ) } \arguments{ @@ -36,9 +37,12 @@ Argument is \code{TRUE} by default. If \code{FALSE}, results will be returned as \item{base_url}{The database URL to query If \code{NULL} will default to URL set with \verb{set_cbioportal_db()}} + +\item{return_segments}{Default is \code{FALSE} where copy number segmentation data won't be returned in addition to the mutation, cna and structural variant data. +\code{TRUE} will return any available segmentation data with results.} } \value{ -A list of mutations, cna and structural variants (including fusions), if available. +A list of mutations, cna and structural variants (including fusions), if available. Will also return copy number segmentation data if \code{return_segments = TRUE}. } \description{ Get All Genomic Information By Sample IDs @@ -46,6 +50,7 @@ Get All Genomic Information By Sample IDs \examples{ \dontrun{ get_genetics_by_sample(sample_id = c("TCGA-OR-A5J2-01","TCGA-OR-A5J6-01"), - study_id = "acc_tcga") + study_id = "acc_tcga", + return_segments = TRUE) } } diff --git a/man/get_genetics_by_study.Rd b/man/get_genetics_by_study.Rd index 132940d..9872b69 100644 --- a/man/get_genetics_by_study.Rd +++ b/man/get_genetics_by_study.Rd @@ -4,7 +4,12 @@ \alias{get_genetics_by_study} \title{Get All Genomic Information By Study} \usage{ -get_genetics_by_study(study_id = NULL, add_hugo = TRUE, base_url = NULL) +get_genetics_by_study( + study_id = NULL, + add_hugo = TRUE, + base_url = NULL, + return_segments = FALSE +) } \arguments{ \item{study_id}{A study ID to query mutations. If NULL, guesses study ID based @@ -15,9 +20,12 @@ Argument is \code{TRUE} by default. If \code{FALSE}, results will be returned as \item{base_url}{The database URL to query If \code{NULL} will default to URL set with \verb{set_cbioportal_db()}} + +\item{return_segments}{Default is \code{FALSE} where copy number segmentation data won't be returned in addition to the mutation, cna and structural variant data. +\code{TRUE} will return any available segmentation data with results.} } \value{ -A list of mutations, cna and structural variants (including fusions), if available. +A list of mutations, cna and structural variants (including fusions), if available. Will also return copy number segmentation data if \code{return_segments = TRUE}. } \description{ Get All Genomic Information By Study diff --git a/man/get_segments_by_sample.Rd b/man/get_segments_by_sample.Rd new file mode 100644 index 0000000..42de58f --- /dev/null +++ b/man/get_segments_by_sample.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/genomics_by_sample.R +\name{get_segments_by_sample} +\alias{get_segments_by_sample} +\title{Get Copy Number Segmentation Data By Sample ID} +\usage{ +get_segments_by_sample( + sample_id = NULL, + study_id = NULL, + sample_study_pairs = NULL, + base_url = NULL +) +} +\arguments{ +\item{sample_id}{a vector of sample IDs (character)} + +\item{study_id}{A string indicating the study ID from which to pull data. If no study ID, will +guess the study ID based on your URL and inform. Only 1 study ID can be passed. If mutations/cna from +more than 1 study needed, see \code{sample_study_pairs}} + +\item{sample_study_pairs}{A dataframe with columns: \code{sample_id}, \code{study_id} and \code{molecular_profile_id} (optional). Variations in capitalization of column names are accepted. +This can be used in place of \code{sample_id}, \code{study_id}, \code{molecular_profile_id} arguments above if you +need to pull samples from several different studies at once. If passed this will take overwrite \code{sample_id}, \code{study_id}, \code{molecular_profile_id} if also passed.} + +\item{base_url}{The database URL to query +If \code{NULL} will default to URL set with \verb{set_cbioportal_db()}} +} +\value{ +A dataframe of CNA segments +} +\description{ +Get Copy Number Segmentation Data By Sample ID +} +\examples{ +\dontrun{ +set_cbioportal_db("public") + +get_segments_by_sample(sample_id = c("s_C_CAUWT7_P001_d"), + study_id = "prad_msk_2019") + } +} diff --git a/man/get_segments_by_study.Rd b/man/get_segments_by_study.Rd new file mode 100644 index 0000000..b9ae3cf --- /dev/null +++ b/man/get_segments_by_study.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/genomics_by_study.R +\name{get_segments_by_study} +\alias{get_segments_by_study} +\title{Get Copy Number Segmentation Data By Study} +\usage{ +get_segments_by_study(study_id = NULL, add_hugo = TRUE, base_url = NULL) +} +\arguments{ +\item{study_id}{A study ID to query mutations. If NULL, guesses study ID based +on molecular_profile_id.} + +\item{add_hugo}{Logical indicating whether \code{HugoGeneSymbol} should be added to your resulting data frame, if not already present in raw API results. +Argument is \code{TRUE} by default. If \code{FALSE}, results will be returned as is (i.e. any existing Hugo Symbol columns in raw results will not be removed).} + +\item{base_url}{The database URL to query +If \code{NULL} will default to URL set with \verb{set_cbioportal_db()}} +} +\value{ +A dataframe of CNA segments +} +\description{ +Get Copy Number Segmentation Data By Study +} +\examples{ +\dontrun{ +get_segments_by_study(study_id = "prad_msk_2019") +get_segments_by_study(molecular_profile_id = "prad_msk_2019_cna") +} +} diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml index 738a0d3..3aafb62 100644 --- a/pkgdown/_pkgdown.yml +++ b/pkgdown/_pkgdown.yml @@ -46,6 +46,7 @@ reference: - get_cna_by_study - get_fusions_by_study - get_structural_variants_by_study + - get_segments_by_study - available_samples - available_patients - available_sample_lists @@ -57,6 +58,7 @@ reference: - get_cna_by_sample - get_fusions_by_sample - get_structural_variants_by_sample + - get_segments_by_sample - get_genetics_by_sample - get_panel_by_sample - subtitle: Gene & Gene Panel Level diff --git a/tests/testthat/test-genomics-by-sample.R b/tests/testthat/test-genomics-by-sample.R index 4f5d470..b1d1b0a 100644 --- a/tests/testthat/test-genomics-by-sample.R +++ b/tests/testthat/test-genomics-by-sample.R @@ -244,7 +244,7 @@ test_that("Unknown Hugo Symbol returns Unk ", { df[16, ] <- df[15,] df[16, 'entrezGeneId'] <- 1000000 - df <- df %>% select(-.data$hugoGeneSymbol) + df <- df %>% select(-"hugoGeneSymbol") df2 <- .lookup_hugo(df) expect_true(any(stringr::str_detect(df2$hugoGeneSymbol, "unk"))) diff --git a/tests/testthat/test-genomics-by-study.R b/tests/testthat/test-genomics-by-study.R index 42468a6..5ed018b 100644 --- a/tests/testthat/test-genomics-by-study.R +++ b/tests/testthat/test-genomics-by-study.R @@ -212,6 +212,15 @@ test_that("data is same regardless of function", { by_prof2<- get_structural_variants_by_study(molecular_profile_id = molecular_profile_id) expect_identical(by_study, by_study2, by_prof, by_prof2, get_gen$structural_variant) + # Segmentation ---- + study_id = "acc_tcga" + get_gen2 <- get_genetics_by_study(study_id, return_segments = TRUE) + by_study <- get_segments_by_study(study = study_id) + expect_identical(by_study, get_gen2$segment) + + # test return_segments + get_gen3 <- get_genetics_by_study(study_id, return_segments = FALSE) + expect_gt(length(get_gen2), length(get_gen3)) }) test_that("get_genetics- one data type non existant", { diff --git a/vignettes/overview-of-workflow.Rmd.orig b/vignettes/overview-of-workflow.Rmd.orig index c972807..f1e2417 100644 --- a/vignettes/overview-of-workflow.Rmd.orig +++ b/vignettes/overview-of-workflow.Rmd.orig @@ -218,6 +218,18 @@ all.equal(mut_study_subset, mut_sample) Both results are equal. + +Note: some studies also have copy number segments data available that can be pulled by study ID or sample ID: + + +```{r} +seg_blca <- get_segments_by_study("blca_nmibc_2017") + +# To pull alongside other genomic data types, use the `return_segments` argument +all_genomic_blca <- get_genetics_by_study("blca_nmibc_2017", return_segments = TRUE) + +``` + #### Limit Results to Specified Genes or Panels When pulling by sample IDs, we can also limit our results to a specific set of genes by passing a vector of Entrez Gene IDs or Hugo Symbols to the `gene` argument, or a specified panel by passing a panel ID to the `panel` argument (see `available_gene_panels()` for supported panels). This can be useful if, for example, we want to pull all IMPACT gene results for two studies but one of the two uses a much larger panel. In that case, we can limit our query to just the genes for which we want results: