code speedup

caravagnalab · Apr 8, 2024 · dd6810a · dd6810a
1 parent d70bc55
commit dd6810a
Show file tree

Hide file tree

Showing 3 changed files with 139 additions and 139 deletions.
diff --git a/.Rhistory b/.Rhistory
@@ -1,121 +1,3 @@
-any(!is.integer(genomic_data$NV))
-any(!is.numeric(genomic_data$VAF))
-any(is.na(gene))
-any(is.na(genomic_data$\gene))
-any(is.na(genomic_data$gene))
-any(is.na(genomic_data$NV))
-any(is.na(genomic_data$DP))
-genomic_data %>% filter(DP<=0)
-genomic_data %>% filter(NV<=0)
-genomic_data %>% filter(VAF<=0)
-genomic_data %>% filter(is.na(gene))
-genomic_data %>% filter(!(gene %in% cancer_gene_census$gene))
-input = init(
-genomic_data = genomic_data %>% filter(!(gene %in% cancer_gene_census$gene)),
-clinical_data = x$clinical_data %>% filter(tumor_type == 'CSCC'),
-gene_roles = cancer_gene_census
-)
-source('./scripts/library.R')
-x = prepare_msk_data()
-library(INCOMMON)
-input = init(
-genomic_data = genomic_data %>% filter(!(gene %in% cancer_gene_census$gene)),
-clinical_data = x$clinical_data %>% filter(tumor_type == 'CSCC'),
-gene_roles = cancer_gene_census
-)
-output = classify(
-x = input,
-priors = pcawg_priors,
-entropy_cutoff = 0.2,
-rho = 0.01
-)
-priors = pcawg_priors
-entropy_cutoff = 0.2
-rho = 0.01
-entropy_cutoff = 1
-output = x
-output$classification = list()
-cli::cli_h1("INCOMMON inference of copy number and mutation multiplicity for sample {.field {x$sample}}")
-cat("\n")
-check_input(x)
-cli::cli_alert_info("Performing classification")
-x = idify(x)
-x$input = x$input %>% dplyr::mutate(id = paste(sample, chr,
-from, to, ref, alt, NV, DP, sep = ":"))
-x$input
-input = init(
-genomic_data = genomic_data %>% filter(!(gene %in% cancer_gene_census$gene)),
-clinical_data = x$clinical_data %>% filter(tumor_type == 'CSCC'),
-gene_roles = cancer_gene_census
-)
-input
-input = init(
-genomic_data = genomic_data,
-clinical_data = x$clinical_data %>% filter(tumor_type == 'CSCC'),
-gene_roles = cancer_gene_census
-)
-output = classify(
-x = input,
-priors = pcawg_priors,
-entropy_cutoff = 0.2,
-rho = 0.01
-)
-devtools::install_github('caravagnalab/INCOMMON')
-source('./scripts/library.R')
-x = prepare_msk_data()
-library(INCOMMON)
-input = init(
-genomic_data = genomic_data,
-clinical_data = x$clinical_data %>% filter(tumor_type == 'CSCC'),
-gene_roles = cancer_gene_census
-)
-input = init(
-genomic_data = x$genomic_data,
-clinical_data = x$clinical_data %>% filter(tumor_type == 'CSCC'),
-gene_roles = cancer_gene_census
-)
-output = classify(
-x = input,
-priors = pcawg_priors,
-entropy_cutoff = 0.2,
-rho = 0.01,
-parallel = TRUE,
-num_cores = 8
-)
-devtools::install_github("ebesedina/mutmatch")
-setwd('~/Documents/GitHub/INCOMMON/')
-devtools::load_all()
-genomic_data = MSK_genomic_data
-clinical_data = MSK_clinical_data
-gene_roles = INCOMMON::cancer_gene_census
-x = init(genomic_data,
-clinical_data,
-gene_roles = INCOMMON::cancer_gene_census)
-x
-x = init(genomic_data %>% dplyr::slice_head(n=10),
-clinical_data,
-gene_roles = INCOMMON::cancer_gene_census)
-x
-priors = INCOMMON::pcawg_priors
-entropy_cutoff = 0.2
-rho = 0.01
-parallel = TRUE
-num_cores=2
-karyotypes = c("1:0", "1:1", "2:0", "2:1", "2:2")
-output = x
-output$classification = list()
-cli::cli_h1(
-"INCOMMON inference of copy number and mutation multiplicity for sample {.field {x$sample}}"
-)
-cat("\n")
-check_input(x)
-cli::cli_alert_info("Performing classification")
-x = idify(x)
-output = x
-output$classification = list()
-cli::cli_h1(
-"INCOMMON inference of copy number and mutation multiplicity for sample {.field {x$sample}}"
-)
 cat("\n")
 check_input(x)
 cli::cli_alert_info("Performing classification")
@@ -510,3 +392,121 @@ names(x$classification)
 setwd('~/Documents/GitHub/CNAqc/')
 CNAqc:::advanced_phasing
 INCOMMON::classify
+setwd('~/Documents/GitHub/INCOMMON/')
+devtools::load_all()
+library(INCOMMON)
+library(dplyr)
+library(DT)
+sample = 'P-0002081'
+genomic_data = MSK_genomic_data %>% filter(sample == !!sample)
+clinical_data = MSK_clinical_data %>% filter(sample == !!sample)
+print(genomic_data)
+print(clinical_data)
+x = init(genomic_data = genomic_data,
+clinical_data = clinical_data,
+gene_roles = cancer_gene_census)
+print(x)
+priors = pcawg_priors
+entropy_cutoff = 0.2
+rho = 0.01
+karyotypes = c("1:0", "1:1", "2:0", "2:1", "2:2")
+stopifnot(inherits(x, "INCOMMON"))
+if(is.null(entropy_cutoff)) entropy_cutoff = 1
+# Output
+output = x
+output$classification = list()
+cli::cli_h1(
+"INCOMMON inference of copy number and mutation multiplicity for sample {.field {x$sample}}"
+)
+cat("\n")
+check_input(x)
+cli::cli_alert_info("Performing classification")
+x = idify(x)
+classify_single_mutation = function(x, id){
+# Control for duplicates
+if(info(x, id) %>% nrow() > 1){
+cli_alert_warning(text = "More than one mutation mapped at: {.field {id}}")
+info(x, id)
+cli_alert_warning(text = "Keeping first row by default (check your input data)")
+w = which(ids(x)==id)
+x$data = x$data[-w[2:length(w)],]
+}
+# Compute model likelihood, posterior and entropy
+posterior = compute_posterior(
+NV = NV(x, id),
+DP = DP(x, id),
+gene = gene(x, id),
+priors = priors,
+tumor_type = tumor_type(x, id),
+purity = purity(x, id),
+entropy_cutoff = entropy_cutoff,
+rho = rho,
+karyotypes = karyotypes
+)
+# Maximum a posteriori classification
+map = posterior %>%
+dplyr::filter(NV == NV(x, id)) %>%
+dplyr::filter(value == max(value)) %>%
+dplyr::ungroup()
+if (nrow(map) > 1) {
+cli_alert_warning(text =
+"With purity {.field {purity(x, id)}} karyotype {.field {map$karyotype}} with multiplicities {.field {map$multiplicity}} have the same likelihood")
+cli_alert_warning(text =
+"Simplest case will be selected: {.field {map$karyotype[1]}}")
+map = map[1., ]
+}
+map = map %>% dplyr::select(label, state, value, entropy) %>% dplyr::rename(posterior = value) %>% dplyr::mutate(id = id)
+fit = dplyr::right_join(input(x) %>% dplyr::select(colnames(genomic_data(x, PASS = TRUE)), id), map, by = 'id')
+return(fit)
+}
+if(parallel){
+if(is.null(num_cores)) num_cores = as.integer(0.8*parallel::detectCores())
+tests = parallel::mclapply(X = ids(x),
+FUN = classify_single_mutation,
+x = x,
+mc.cores = num_cores)
+tests = do.call(rbind, tests)
+} else {
+tests = lapply(ids(x), function(id) {
+classify_single_mutation(x = x, id = id)
+})
+}
+tests = parallel::mclapply(X = ids(x),
+FUN = classify_single_mutation,
+x = x,
+mc.cores = num_cores)
+num_cores=2
+tests = parallel::mclapply(X = ids(x),
+FUN = classify_single_mutation,
+x = x,
+mc.cores = num_cores)
+tests
+tests = do.call(rbind, tests)
+tests
+lapply(ids(x), function(id) {
+classify_single_mutation(x = x, id = id)
+})
+parallel::mclapply(X = ids(x),
+FUN = classify_single_mutation,
+x = x,
+mc.cores = num_cores)
+tests
+tests = parallel::mclapply(X = ids(x),
+FUN = classify_single_mutation,
+x = x,
+mc.cores = num_cores)
+tests
+tests %>% do.call(rbind, .)
+output$classification$fit = tests %>% do.call(rbind, .)
+output$classification$parameters = dplyr::tibble(entropy_cutoff = entropy_cutoff,
+rho = rho,
+karyotypes = list(karyotypes))
+output$classification$priors = priors
+classification(output)
+classification(output) %>% dplyr::filter(state == !!state)
+classification(output) %>% dplyr::filter(state == !!state) %>% nrow()
+cli::cli_alert_info('There are: ')
+for (state in c('HMD', 'LOH', 'CNLOH', 'AM', 'Tier-2')) {
+N  = classification(output) %>% dplyr::filter(state == !!state) %>% nrow()
+cli::cli_bullets(c("*" = paste0("N = ", N, ' mutations (', state, ')')))
+}
diff --git a/R/classify.R b/R/classify.R
@@ -28,12 +28,6 @@ classify = function(x,
   stopifnot(inherits(x, "INCOMMON"))
   if(is.null(entropy_cutoff)) entropy_cutoff = 1
 
-  # Output
-
-  output = x
-
-  output$classification = list()
-
   cli::cli_h1(
       "INCOMMON inference of copy number and mutation multiplicity for sample {.field {x$sample}}"
     )
@@ -110,21 +104,27 @@ classify = function(x,
 
   }
 
-    output$classification$fit = tests %>% do.call(rbind, .)
-    output$classification$parameters = dplyr::tibble(entropy_cutoff = entropy_cutoff,
-                                                     rho = rho,
-                                                     karyotypes = list(karyotypes))
-    output$classification$priors = priors
+  # Output
+
+  x$classification = list()
+
+  x$classification$fit = tests %>% do.call(rbind, .)
+  x$classification$parameters = dplyr::tibble(
+    entropy_cutoff = entropy_cutoff,
+    rho = rho,
+    karyotypes = list(karyotypes)
+  )
+  x$classification$priors = priors
 
     cli::cli_alert_info('There are: ')
     for (state in c('HMD', 'LOH', 'CNLOH', 'AM', 'Tier-2')) {
-      N  = classification(output) %>% dplyr::filter(state == !!state) %>% nrow()
+      N  = classification(x) %>% dplyr::filter(state == !!state) %>% nrow()
       cli::cli_bullets(c("*" = paste0("N = ", N, ' mutations (', state, ')')))
     }
 
-    mean_ent = classification(output) %>% dplyr::pull(entropy) %>% mean()
-    min_ent = classification(output) %>% dplyr::pull(entropy) %>% min()
-    max_ent = classification(output) %>% dplyr::pull(entropy) %>% max()
+    mean_ent = classification(x) %>% dplyr::pull(entropy) %>% mean()
+    min_ent = classification(x) %>% dplyr::pull(entropy) %>% min()
+    max_ent = classification(x) %>% dplyr::pull(entropy) %>% max()
 
     cli::cli_alert_info(
       'The mean classification entropy is {.field {round(mean_ent, 2)}} (min: {.field {round(min_ent, 2)}}, max: {.field {round(max_ent, 2)}})'

diff --git a/R/utils.R b/R/utils.R
@@ -132,21 +132,21 @@ info = function(x, id){
 
 
 DP = function(x, id){
-  x = idify(x)
+  if(!("id" %in% colnames(x$input))) x = idify(x)
   x$input %>%
     dplyr::filter(id == !!id) %>%
     dplyr::pull(DP)
 }
 
 NV = function(x, id){
-  x = idify(x)
+  if(!("id" %in% colnames(x$input))) x = idify(x)
   x$input %>%
     dplyr::filter(id == !!id) %>%
     dplyr::pull(NV)
 }
 
 entropy = function(x, id){
-  x = idify(x)
+  if(!("id" %in% colnames(x$input))) x = idify(x)
   posterior(x, id) %>%
     dplyr::filter(NV == NV(x, id)) %>%
     dplyr::arrange(dplyr::desc(value)) %>%
@@ -156,21 +156,21 @@ entropy = function(x, id){
 
 
 VAF = function(x, id){
-  x = idify(x)
+  if(!("id" %in% colnames(x$input))) x = idify(x)
   x$input %>%
     dplyr::filter(id == !!id) %>%
     dplyr::pull(VAF)
 }
 
 gene = function(x, id){
-  x = idify(x)
+  if(!("id" %in% colnames(x$input))) x = idify(x)
   x$input %>%
     dplyr::filter(id == !!id) %>%
     dplyr::pull(gene)
 }
 
 get_gene_role = function(x, id){
-  x = idify(x)
+  if(!("id" %in% colnames(x$input))) x = idify(x)
   x$data %>%
     dplyr::filter(id == !!id) %>%
     dplyr::pull(gene_role)