Skip to content

Commit

Permalink
Merge pull request #324 from MSKCC-Epi-Bio/internal-colname-messages
Browse files Browse the repository at this point in the history
Internal colname messages
  • Loading branch information
hfuchs5 authored Oct 9, 2023
2 parents f4e75b6 + 9b5e06f commit 2408e08
Show file tree
Hide file tree
Showing 9 changed files with 101 additions and 64 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- Fixed bug in `add_pathways()` where `custom_pathways` wasn't catching all types of alterations when `GENE.all` was used due to `paste0()` vectorization.
- Changed some arguments to strict matching (`rlang::arg_match()`) instead of partial matching (`match.arg()`) (e.g. `mut_type = "s"` doesn't work anymore and must be fully specified `mut_type = "somatic_only"`).
- Added unit tests for gnomeR plots/visuals (#144).
- A dictionary of old to new names for `rename_columns()` output is now an attribute of the returned object. Now messages can reference the original names of data columns (ex: `TumorAllele2` not `tumor_allele_2`) to make it more intuitive to users (#302).


# gnomeR 1.2.0
Expand Down
9 changes: 6 additions & 3 deletions R/create-gene-binary.R
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ create_gene_binary <- function(samples = NULL,
include_silent = include_silent
)
)
names_mut_dict <- attr(mutation, "names_dict")

# * Fusion checks ----------
fusion <- switch(!is.null(fusion),
Expand Down Expand Up @@ -249,7 +250,8 @@ create_gene_binary <- function(samples = NULL,
mut_type = mut_type,
snp_only = snp_only,
include_silent = include_silent,
specify_panel = specify_panel
specify_panel = specify_panel,
names_mut_dict = names_mut_dict
)
)

Expand Down Expand Up @@ -377,7 +379,8 @@ create_gene_binary <- function(samples = NULL,
mut_type,
snp_only,
include_silent,
specify_panel) {
specify_panel,
names_mut_dict) {

# apply filters --------------

Expand Down Expand Up @@ -411,7 +414,7 @@ create_gene_binary <- function(samples = NULL,

if ((blank_muts > 0)) {
cli::cli_alert_warning(
"{(blank_muts)} mutations have {.code NA} or blank in mutation status column instead of 'SOMATIC' or 'GERMLINE'. These were assumed to be 'SOMATIC' and were retained in the resulting binary matrix.")
"{(blank_muts)} mutations have {.code NA} or blank in the {.field {dplyr::first(c(names_mut_dict['mutation_status'], 'mutation_status'), na_rm = TRUE)}} column instead of 'SOMATIC' or 'GERMLINE'. These were assumed to be 'SOMATIC' and were retained in the resulting binary matrix.")
}
},
"somatic_only" = {
Expand Down
24 changes: 13 additions & 11 deletions R/sanitize-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ sanitize_mutation_input <- function(mutation, include_silent, ...) {
arguments <- list(...)

mutation <- rename_columns(mutation)
names_dict <- attr(mutation, "names_dict")
column_names <- colnames(mutation)

# Check required columns & data types ------------------------------------------
Expand Down Expand Up @@ -57,28 +58,29 @@ sanitize_mutation_input <- function(mutation, include_silent, ...) {

# Variant_Type ---
if (!("variant_type" %in% column_names)) {
if (("reference_allele" %in% column_names) & ("tumor_seq_allele2" %in% column_names)) {
if (("reference_allele" %in% column_names) & ("tumor_seq_allele_2" %in% column_names)) {
mutation %>%
mutate(
reference_allele = as.character(.data$reference_allele),
tumor_seq_allele2 = as.character(.data$tumor_seq_allele2),
tumor_seq_allele_2 = as.character(.data$tumor_seq_allele_2),
variant_type = case_when(
.data$reference_allele %in% c("A", "T", "C", "G") &
.data$tumor_seq_allele2 %in% c("A", "T", "C", "G") ~ "SNP",
nchar(.data$tumor_seq_allele2) < nchar(.data$reference_allele) |
.data$tumor_seq_allele2 == "-" ~ "DEL",
.data$tumor_seq_allele_2 %in% c("A", "T", "C", "G") ~ "SNP",
nchar(.data$tumor_seq_allele_2) < nchar(.data$reference_allele) |
.data$tumor_seq_allele_2 == "-" ~ "DEL",
.data$reference_allele == "-" |
nchar(.data$tumor_seq_allele2) > nchar(.data$reference_allele) ~ "INS",
nchar(.data$reference_allele) == 2 & nchar(.data$tumor_seq_allele2) == 2 ~ "DNP",
nchar(.data$reference_allele) == 3 & nchar(.data$tumor_seq_allele2) == 3 ~ "TNP",
nchar(.data$reference_allele) > 3 & nchar(.data$tumor_seq_allele2) == nchar(.data$reference_allele) ~ "ONP",
nchar(.data$tumor_seq_allele_2) > nchar(.data$reference_allele) ~ "INS",
nchar(.data$reference_allele) == 2 & nchar(.data$tumor_seq_allele_2) == 2 ~ "DNP",
nchar(.data$reference_allele) == 3 & nchar(.data$tumor_seq_allele_2) == 3 ~ "TNP",
nchar(.data$reference_allele) > 3 & nchar(.data$tumor_seq_allele_2) == nchar(.data$reference_allele) ~ "ONP",
TRUE ~ "Undefined"
)
)

cli::cli_warn("Column {.field variant_type} is missing from your data. We inferred variant types using {.field reference_allele} and {.field tumor_seq_allele2} columns")
cli::cli_warn(c("Column {.field variant_type} is missing from your data. We inferred variant types using ",
"{.field {dplyr::first(c(names_dict['reference_allele'], 'reference_allele'), na_rm = TRUE)}} and {.field {dplyr::first(c(names_dict['tumor_seq_allele_2'], 'tumor_seq_allele_2'), na_rm = TRUE)}} columns"))
} else {
cli::cli_abort("Column {.field variant_type} is missing from your data and {.field reference_allele} and {.field tumor_seq_allele2}
cli::cli_abort("Column {.field variant_type} is missing from your data and {.field reference_allele} and {.field tumor_seq_allele_2}
columns were not available from which to infer variant type.
To proceed, add a column specifying {.field variant_type} (e.g. {.code mutate(<your-mutation-df>, variant_type = 'SNP')}")
}
Expand Down
28 changes: 17 additions & 11 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
#' Rename columns from API results to work with gnomeR functions
#'
#' @param df_to_check a data frame to check and recode names as needed
#'
#' Will return a named vector of internal column names as values and original data set names
#' as names as an attribute (`attr(x, "names_dict")`)
#' @param df_to_check A data frame to check and recode names as needed
#' @return a renamed data frame
#' @export
#' @examples
#'
#' rename_columns(df_to_check = gnomeR::mutations)
#' rename_columns(df_to_check = gnomeR::sv)
#'
#' x <- rename_columns(df_to_check = gnomeR::sv)
#' attr(x, "names_dict")
rename_columns <- function(df_to_check) {

names_df_long <- gnomeR::names_df %>%
Expand All @@ -18,32 +19,37 @@ rename_columns <- function(df_to_check) {

which_to_replace <- intersect(names(df_to_check), unique(names_df_long$value))

# create a temporary dictionary as a named vector
temp_dict <- names_df_long %>%
# create a temporary dictionary as a named vector- this should have all relevant values, including those unchanged
names_dict <- names_df_long %>%
dplyr::filter(.data$value %in% which_to_replace) %>%
select("internal_column_name", "value") %>%
dplyr::distinct() %>%
tibble::deframe()


if(length(temp_dict) > 0) {
if(length(names_dict) > 0) {

# store details on what has been changed.
message <- purrr::map2_chr(names(temp_dict),
temp_dict,
message <- purrr::map2_chr(names(names_dict),
names_dict,
~paste0(.y, " renamed ", .x))

names(message) <- rep("!", times = length(message))


# rename those variables only
df_to_check %>%
dplyr::rename(!!temp_dict)
df_to_check <- df_to_check %>%
dplyr::rename(!!names_dict)

attr(df_to_check, "names_dict") <- names_dict
}

return(df_to_check)
}




#' Utility Function to Extract SNV
#'
#' @param x string
Expand Down
2 changes: 1 addition & 1 deletion codemeta.json
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@
},
"SystemRequirements": null
},
"fileSize": "2591.928KB",
"fileSize": "2581.433KB",
"releaseNotes": "https://github.com/MSKCC-Epi-Bio/gnomeR/blob/master/NEWS.md",
"readme": "https://github.com/MSKCC-Epi-Bio/gnomeR/blob/main/README.md",
"contIntegration": ["https://github.com/MSKCC-Epi-Bio/gnomeR/actions", "https://app.codecov.io/gh/MSKCC-Epi-Bio/gnomeR?branch=main"],
Expand Down
3 changes: 2 additions & 1 deletion man/dot-mutations_gene_binary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 5 additions & 4 deletions man/rename_columns.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

57 changes: 26 additions & 31 deletions tests/testthat/test-rename_columns.R
Original file line number Diff line number Diff line change
@@ -1,34 +1,5 @@
#
#
# #thoughts so far:
# # still need title case for binary matrix function
#
# cbioportalR::set_cbioportal_db("public")
#
#
# test_that("test rename_columns runs with no errors", {
#
# expect_error(rename_columns(gnomeR::muations), NA)
#
# })
#
# # do we want it to display what it changed? currently does not.
# test_that("test rename_columns does not have messages", {
#
# expect_message(rename_columns(gnomeR::muations), NA)
#
# })
#
#
# test_that("test colnames are renamed properly", {
#
# expect_snapshot(
# waldo::compare(colnames(gnomeR::muations),
# colnames(rename_columns(gnomeR::muations)))
# )
#
# })
#


# #issue is that this is not title case
test_that("binary matrix runs with renamed columns without error", {

Expand All @@ -43,4 +14,28 @@ test_that("binary matrix runs with renamed columns without error", {

})

test_that("test that it returns warning message with input data column names", {

mut2 <- rename_columns(gnomeR::mutations )
expect_true(!is.null(attr(mut2, "names_dict")))

cna <- rename_columns(gnomeR::cna)
expect_true(!is.null(attr(cna, "names_dict")))

fus <- rename_columns(gnomeR::sv)
expect_true(!is.null(attr(fus, "names_dict")))


})




test_that("test that it returns warning message with input data column names", {

mut2 <- capture_messages(create_gene_binary(mutation = gnomeR::mutations))

expect_true(str_detect(mut2[2], "mutationStatus"))


})
32 changes: 30 additions & 2 deletions tests/testthat/test-sanitize.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ test_that("test variant type inference", {
mutation <- rename_columns(mutation)
column_names <- colnames(mutation)
mutation = mutation %>% select(-c(variant_type))
mutation$tumor_seq_allele2 = mutation$reference_allele
mutation$tumor_seq_allele_2 = mutation$reference_allele

expect_warning(sanitize_mutation_input(mutation, include_silent = F))
})
Expand All @@ -101,11 +101,39 @@ test_that("test variant type inference error", {
mutation = gnomeR::mutations
mutation <- rename_columns(mutation)
column_names <- colnames(mutation)
mutation$tumor_seq_allele2 = mutation$reference_allele
mutation$tumor_seq_allele_2 = mutation$reference_allele
mutation = mutation %>% select(-c(variant_type, reference_allele))

expect_error(sanitize_mutation_input(mutation, include_silent = F))
})


test_that("test that attributes are returned for input data col names", {

mutation <- sanitize_mutation_input(gnomeR::mutations, include_silent = F)
expect_true(!is.null(attr(mutation, "names_dict")))

cna <- sanitize_cna_input(gnomeR::cna)
expect_true(!is.null(attr(cna, "names_dict")))

fus <- sanitize_fusion_input(gnomeR::sv)
expect_true(!is.null(attr(fus, "names_dict")))


})



test_that("Check input colname is used in messaging", {

mut_maf <- gnomeR::mutations %>%
select(-variantType) %>%
mutate(Tumor_Seq_Allele2 = NA)

x <- capture_warning(sanitize_mutation_input(mut_maf, include_silent = FALSE))
expect_true(str_detect(x$message, "Tumor_Seq_Allele2"))


})


0 comments on commit 2408e08

Please sign in to comment.