From 25dd19be76358c644be5b61c58d007173487bdac Mon Sep 17 00:00:00 2001 From: Hannah Fuchs Date: Mon, 28 Aug 2023 14:50:35 -0400 Subject: [PATCH 1/5] update to allow other vars in summarize --- R/summarize-by-gene.R | 27 +++++++++++++++++++--- tests/testthat/test-summarize-by-gene.R | 30 +++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/R/summarize-by-gene.R b/R/summarize-by-gene.R index 0b875cfe..e069f8a3 100644 --- a/R/summarize-by-gene.R +++ b/R/summarize-by-gene.R @@ -5,6 +5,8 @@ #' columns for mutation/cna/fusion. #' #' @param gene_binary a 0/1 matrix of gene alterations +#' @param other_vars One or more column names (quoted or unquoted) in data to be retained +#' in resulting data frame. Default is NULL. #' #' @return a binary matrix with a row for each sample and one column per gene #' @export @@ -19,7 +21,7 @@ #' ) %>% #' summarize_by_gene() #' -summarize_by_gene <- function(gene_binary) { +summarize_by_gene <- function(gene_binary, other_vars = NULL) { # Checks ------------------------------------------------------------------ @@ -35,6 +37,14 @@ summarize_by_gene <- function(gene_binary) { cli::cli_abort("Your {.field gene_binary} must have unique samples in {.code sample_id} column") } + # Capture Other Columns to Retain ----------------------------------- + + other_vars <- + .select_to_varnames({{ other_vars }}, + data = gene_binary, + arg_name = "other_vars" + ) + # Create Sample Index ----------------------------------------------------- @@ -42,8 +52,16 @@ summarize_by_gene <- function(gene_binary) { select("sample_id") %>% mutate(sample_index = paste0("samp", 1:nrow(gene_binary))) +<<<<<<< Updated upstream alt_only <- as.matrix(select(gene_binary, -"sample_id")) rownames(alt_only) <- sample_index$sample_index +======= + # data frame of only alterations + alt_only <- select(gene_binary, -"sample_id", -any_of(other_vars)) %>% + as.matrix() + + row.names(alt_only) <- sample_index$sample_index +>>>>>>> Stashed changes # check numeric class --------- is_numeric <- apply(alt_only, 2, is.numeric) @@ -85,11 +103,14 @@ summarize_by_gene <- function(gene_binary) { all_bin <- as.data.frame(t(all_bin)) %>% tibble::rownames_to_column("sample_index") - # join back to sample ID + # join back to sample ID and other vars simp_gene_binary <- all_bin %>% left_join(sample_index, ., by = "sample_index") %>% select(-c("sample_index")) %>% - as.data.frame() + as.data.frame()%>% + left_join(gene_binary %>% + select(any_of(c("sample_id", other_vars))))%>% + suppressMessages() simp_gene_binary diff --git a/tests/testthat/test-summarize-by-gene.R b/tests/testthat/test-summarize-by-gene.R index 0d410056..b359b42e 100644 --- a/tests/testthat/test-summarize-by-gene.R +++ b/tests/testthat/test-summarize-by-gene.R @@ -106,5 +106,35 @@ test_that("test what happens to columns with all NA", { }) +<<<<<<< Updated upstream +======= +# test_that("all columns must be numeric to continue", { +# +# }) + +test_that("other vars are retained", { + samples <- Reduce(intersect, list(gnomeR::mutations$sampleId, gnomeR::cna$sampleId, + gnomeR::sv$sampleId)) + + + bin_impact <- create_gene_binary(samples=samples, + mutation = gnomeR::mutations, + cna = gnomeR::cna, + fusion = gnomeR::sv, + specify_panel = "impact") %>% + select(c(sample_id, starts_with("AR"), starts_with("PLCG2"), starts_with("PPM1D"))) + + set.seed(20230828) + + bin_impact$random_color = sample(c("blue", "red", + "yellow"), size = 50, replace = TRUE) + + expect_true("random_color" %in% names(bin_impact)) + sum_impact <- summarize_by_gene(bin_impact, + other_vars = "random_color") + + expect_true("random_color" %in% names(sum_impact)) + expect_true("blue" %in% sum_impact$random_color) +>>>>>>> Stashed changes From 4d75bf4fc6c0712a60bc1644a07b7760a326d6b1 Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Fri, 15 Sep 2023 14:49:49 -0400 Subject: [PATCH 2/5] Remove github notes and small updates --- R/subset-by-frequency.R | 2 +- R/summarize-by-gene.R | 21 +++++++-------------- man/summarize_by_gene.Rd | 5 ++++- tests/testthat/test-summarize-by-gene.R | 15 ++++++++------- 4 files changed, 20 insertions(+), 23 deletions(-) diff --git a/R/subset-by-frequency.R b/R/subset-by-frequency.R index b1eb84e9..cd6ec6ae 100644 --- a/R/subset-by-frequency.R +++ b/R/subset-by-frequency.R @@ -51,7 +51,7 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) { # Remove all NA columns ---------------------------------------------- all_na_alt <- apply(alt_only, 2, function(x) { - sum(is.na(x)) == nrow(alt_only) + # sum(is.na(x)) == nrow(alt_only) }) all_non_na_alt <- names(all_na_alt[!all_na_alt]) diff --git a/R/summarize-by-gene.R b/R/summarize-by-gene.R index e069f8a3..37d4c31b 100644 --- a/R/summarize-by-gene.R +++ b/R/summarize-by-gene.R @@ -52,16 +52,10 @@ summarize_by_gene <- function(gene_binary, other_vars = NULL) { select("sample_id") %>% mutate(sample_index = paste0("samp", 1:nrow(gene_binary))) -<<<<<<< Updated upstream - alt_only <- as.matrix(select(gene_binary, -"sample_id")) - rownames(alt_only) <- sample_index$sample_index -======= - # data frame of only alterations - alt_only <- select(gene_binary, -"sample_id", -any_of(other_vars)) %>% - as.matrix() + # data frame of only alterations + alt_only <- as.data.frame(select(gene_binary, -"sample_id", -any_of(other_vars))) row.names(alt_only) <- sample_index$sample_index ->>>>>>> Stashed changes # check numeric class --------- is_numeric <- apply(alt_only, 2, is.numeric) @@ -106,13 +100,12 @@ summarize_by_gene <- function(gene_binary, other_vars = NULL) { # join back to sample ID and other vars simp_gene_binary <- all_bin %>% left_join(sample_index, ., by = "sample_index") %>% - select(-c("sample_index")) %>% - as.data.frame()%>% - left_join(gene_binary %>% - select(any_of(c("sample_id", other_vars))))%>% - suppressMessages() + select(-c("sample_index")) + + simp_gene_binary <- simp_gene_binary %>% + left_join(select(gene_binary, any_of(c("sample_id", other_vars))), by = "sample_id") - simp_gene_binary + return(simp_gene_binary) } diff --git a/man/summarize_by_gene.Rd b/man/summarize_by_gene.Rd index d2bbb5cc..ff9537b0 100644 --- a/man/summarize_by_gene.Rd +++ b/man/summarize_by_gene.Rd @@ -4,10 +4,13 @@ \alias{summarize_by_gene} \title{Simplify binary matrix to one column per gene that counts any alteration type as 1} \usage{ -summarize_by_gene(gene_binary) +summarize_by_gene(gene_binary, other_vars = NULL) } \arguments{ \item{gene_binary}{a 0/1 matrix of gene alterations} + +\item{other_vars}{One or more column names (quoted or unquoted) in data to be retained +in resulting data frame. Default is NULL.} } \value{ a binary matrix with a row for each sample and one column per gene diff --git a/tests/testthat/test-summarize-by-gene.R b/tests/testthat/test-summarize-by-gene.R index b359b42e..70d69055 100644 --- a/tests/testthat/test-summarize-by-gene.R +++ b/tests/testthat/test-summarize-by-gene.R @@ -106,18 +106,18 @@ test_that("test what happens to columns with all NA", { }) -<<<<<<< Updated upstream -======= + # test_that("all columns must be numeric to continue", { # # }) test_that("other vars are retained", { - samples <- Reduce(intersect, list(gnomeR::mutations$sampleId, gnomeR::cna$sampleId, + samples <- Reduce(intersect, list(gnomeR::mutations$sampleId, + gnomeR::cna$sampleId, gnomeR::sv$sampleId)) - bin_impact <- create_gene_binary(samples=samples, + bin_impact <- create_gene_binary(samples = samples, mutation = gnomeR::mutations, cna = gnomeR::cna, fusion = gnomeR::sv, @@ -126,8 +126,8 @@ test_that("other vars are retained", { set.seed(20230828) - bin_impact$random_color = sample(c("blue", "red", - "yellow"), size = 50, replace = TRUE) + bin_impact$random_color = sample(c("blue", "red", "yellow"), + size = 50, replace = TRUE) expect_true("random_color" %in% names(bin_impact)) sum_impact <- summarize_by_gene(bin_impact, @@ -135,6 +135,7 @@ test_that("other vars are retained", { expect_true("random_color" %in% names(sum_impact)) expect_true("blue" %in% sum_impact$random_color) ->>>>>>> Stashed changes +}) + From 8aef07ceae3af85451714fdd87c9efcba1682dda Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Fri, 15 Sep 2023 15:08:01 -0400 Subject: [PATCH 3/5] not sure why I commented this out --- R/subset-by-frequency.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/subset-by-frequency.R b/R/subset-by-frequency.R index cd6ec6ae..3d77955f 100644 --- a/R/subset-by-frequency.R +++ b/R/subset-by-frequency.R @@ -51,7 +51,7 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) { # Remove all NA columns ---------------------------------------------- all_na_alt <- apply(alt_only, 2, function(x) { - # sum(is.na(x)) == nrow(alt_only) + sum(is.na(x)) == nrow(alt_only) }) all_non_na_alt <- names(all_na_alt[!all_na_alt]) From e2ec06b05cb5ce586741f0493094f7b3c19659c0 Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Fri, 15 Sep 2023 15:12:12 -0400 Subject: [PATCH 4/5] update tests --- tests/testthat/test-summarize-by-gene.R | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/testthat/test-summarize-by-gene.R b/tests/testthat/test-summarize-by-gene.R index 96c5bcec..11cae682 100644 --- a/tests/testthat/test-summarize-by-gene.R +++ b/tests/testthat/test-summarize-by-gene.R @@ -63,7 +63,6 @@ test_that("test that genes are properly summarized", { mutate(name = str_remove(name, ".Amp|.Del|.fus"))%>% tidyr::pivot_wider(names_from = name, values_from = value, values_fn = function (x) sum(x))%>% mutate(across(!sample_id, ~ifelse(. > 0, 1, 0)))%>% - as.data.frame()%>% relocate(colnames(sum_impact)) expect_equal(sum_impact, bin_impact_test) From 69936c85950c953cbef2ad7f4eecff655cfd0399 Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Fri, 15 Sep 2023 15:16:43 -0400 Subject: [PATCH 5/5] update test --- tests/testthat/test-summarize-by-gene.R | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/testthat/test-summarize-by-gene.R b/tests/testthat/test-summarize-by-gene.R index 11cae682..4e3cb0b8 100644 --- a/tests/testthat/test-summarize-by-gene.R +++ b/tests/testthat/test-summarize-by-gene.R @@ -94,7 +94,6 @@ test_that("test what happens to columns with all NA", { mutate(name = str_remove(name, ".Amp|.Del|.fus"))%>% tidyr::pivot_wider(names_from = name, values_from = value, values_fn = function (x) sum(x))%>% mutate(across(!sample_id, ~ifelse(. > 0, 1, 0)))%>% - as.data.frame()%>% relocate(colnames(sum_impact))%>% mutate_if(~ all(is.na(.)), ~as.numeric(NA_integer_))