Skip to content

Commit

Permalink
facto_miner tables are now written
Browse files Browse the repository at this point in the history
  • Loading branch information
JoseCorCab committed Jan 30, 2024
1 parent 4882005 commit e937d93
Show file tree
Hide file tree
Showing 12 changed files with 230 additions and 72 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -111,5 +111,6 @@ Imports:
GenomicRanges,
annotatr,
ggridges,
FactoInvestigate
FactoInvestigate,
FactoMineR
Config/testthat/edition: 3
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ importFrom(DOSE,setReadable)
importFrom(DT,datatable)
importFrom(FactoInvestigate,dimRestrict)
importFrom(FactoInvestigate,eigenRef)
importFrom(FactoMineR,PCA)
importFrom(FactoMineR,dimdesc)
importFrom(GO.db,GOBPANCESTOR)
importFrom(GO.db,GOBPPARENTS)
importFrom(GO.db,GOCCANCESTOR)
Expand Down
76 changes: 76 additions & 0 deletions R/factor_mining.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
merge_dim_tables <- function(dim_data_simp){
quali_data <- data.frame()
quanti_data <- data.frame()
cat_data <- data.frame()
dim_data_simp$call <- NULL
for(dimension in names(dim_data_simp)){

dim_data <- dim_data_simp[[dimension]]
if (!is.null(dim_data$quali)) {

quali_data_dim <- as.data.frame(dim_data$quali)
quali_data_dim$dimension <- dimension
quali_data_dim$factor <- rownames(quali_data_dim)
quali_data <- rbind(quali_data, quali_data_dim)
}
if (!is.null(dim_data$quanti)){
quanti_data_dim <- as.data.frame(dim_data$quanti)
quanti_data_dim$dimension <- dimension
quanti_data_dim$factor <- rownames(quanti_data_dim)
quanti_data <- rbind(quanti_data, quanti_data_dim)
}
if (!is.null(dim_data$category)){
cat_data_dim <- as.data.frame(dim_data$category)
cat_data_dim$dimension <- dimension
cat_data_dim$factor <- rownames(cat_data_dim)
cat_data <- rbind(cat_data, cat_data_dim)
}

}
return(list(quantitative = quanti_data,
qualitative = quali_data,
qual_category = cat_data))
}

#' @importFrom FactoInvestigate dimRestrict eigenRef
get_PCA_dimensions <- function(pca_obj, min_dimensions = 2) {
ref = FactoInvestigate::eigenRef(pca_obj, time = "10s", parallel=FALSE) # to avoid use parallel computation that greedy takes all cpu cores
rand = c(ref$inertia[1], diff(ref$inertia)) * 100
keep_dimensions <- FactoInvestigate::dimRestrict(pca_obj, rand = rand)
if(keep_dimensions < min_dimensions){
keep_dimensions <- min_dimensions
message('Significant axis are less than 2. The first two axis will be selected to continue the analysis')
}
return(keep_dimensions)
}

#' @importFrom FactoMineR PCA dimdesc
compute_pca <- function(pca_data,
target,
string_factors = NULL,
numeric_factors = NULL) {
pca_data <- as.data.frame(t(pca_data))

std_pca <- FactoMineR::PCA(pca_data, scale.unit=TRUE,
graph = FALSE)
dim_to_keep <- get_PCA_dimensions(std_pca)

rownames(target) <- as.character(target$sample)
if (!is.null(numeric_factors))
pca_data <- merge_factors(pca_data, target, numeric_factors)
if (!is.null(string_factors))
pca_data <- merge_factors(pca_data, target, string_factors)

pca_res <- FactoMineR::PCA(pca_data,ncp = dim_to_keep,
scale.unit=TRUE,
graph = FALSE,
quanti.sup = numeric_factors,
quali.sup=string_factors)
dim_data <- FactoMineR::dimdesc(pca_res, axes=seq(1, dim_to_keep))
dim_data_merged <- merge_dim_tables(dim_data)

return(list(pca_data = pca_res,
dim_to_keep = dim_to_keep,
dim_data = dim_data,
dim_data_merged = dim_data_merged))
}
3 changes: 2 additions & 1 deletion R/general_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ parallel_list <- function(X, FUNC, workers=2, task_size=1, ...){
log <- TRUE
dir.create(log_path, recursive = TRUE)
}
utils::str(log_path)
param <- BiocParallel::MulticoreParam(
workers, tasks = ceiling(length(X)/task_size), stop.on.error = TRUE,
log = log, threshold = "INFO", logdir = log_path
Expand Down Expand Up @@ -242,7 +241,9 @@ annotate_genomic_ranges <-function(intervals_df, genome){
split_str <- function(string, split = NULL) {
if (is.null(split))
stop("split character must be specifyed")
if (is.null(string)) return(NULL)
if (nchar(string) <= 1) return(string)

splitted_str <- unlist(strsplit(string, split = split))
return(splitted_str)
}
Expand Down
46 changes: 33 additions & 13 deletions R/main_degenes_Hunter.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ main_degenes_Hunter <- function(
minpack_common = 4,
model_variables = "",
numerics_as_factors = TRUE,
string_factors = "",
numeric_factors = "",
string_factors = NULL,
numeric_factors = NULL,
WGCNA_memory = 5000,
WGCNA_norm_method = "DESeq2",
WGCNA_deepsplit = 2,
Expand Down Expand Up @@ -116,11 +116,12 @@ main_degenes_Hunter <- function(


numeric_factors <- split_str(numeric_factors, ",")
string_factors <- split_str(string_factors, ",")
final_main_params[["numeric_factors"]] <- numeric_factors
final_main_params[["string_factors"]] <- c("treat", string_factors)
if (sum(numeric_factors == "") >= 1) numeric_factors <- NULL #esto esta para controlar que no haya elementos vacios

string_factors <- split_str(string_factors, ",")
string_factors <- c("treat", string_factors)


if(!is.null(target) & grepl("W", modules)) {
target_numeric_factors <- build_design_for_WGCNA(target,
numeric_factors=numeric_factors)
Expand Down Expand Up @@ -158,6 +159,13 @@ main_degenes_Hunter <- function(
raw_filter <- var_filter[["fil_count_mtrx"]]



#computing PCA for all_genes
full_pca <- compute_pca(pca_data = default_norm$default,
target = target,
string_factors = string_factors,
numeric_factors = numeric_factors)
PCA_res <- list("all_genes" = full_pca)
############################################################
## PERFORM EXPRESION ANALYSIS ##
############################################################
Expand Down Expand Up @@ -219,7 +227,7 @@ main_degenes_Hunter <- function(
#################################################################
mean_cpm <- rowMeans(raw_filter)
DE_all_genes <- NULL

DEG_pca <- NULL
if (any(grepl("[DENL]",modules))){
DE_all_genes <- unite_DEG_pack_results(exp_results, p_val_cutoff,
lfc, minpack_common)
Expand All @@ -228,6 +236,18 @@ main_degenes_Hunter <- function(
DE_all_genes <- transform(DE_all_genes, row.names=Row.names, Row.names=NULL)
# Add the filtered genes back
DE_all_genes <- add_filtered_genes(DE_all_genes, raw)


#computing PCA for PREVALENT DEG

prevalent_degs <- rownames(DE_all_genes[DE_all_genes$genes_tag == "PREVALENT_DEG",])
pca_deg_data <- default_norm$default
pca_deg_data <- pca_deg_data[rownames(pca_deg_data) %in% prevalent_degs,]

PCA_res[["DEGs"]] <- compute_pca(pca_data = pca_deg_data,
target = target,
string_factors = string_factors,
numeric_factors = numeric_factors)
}

if(grepl("W", modules)) { # Check WGCNA was run and returned proper results
Expand Down Expand Up @@ -260,6 +280,9 @@ main_degenes_Hunter <- function(
aux %in% results_diffcoexp$DCLs$Gene.2
}




coverage_df <- get_counts(cnts_mtx=raw, library_sizes=library_sizes)
mean_counts_df <- get_mean_counts(cnts_mtx=raw, cpm_table=cpm_table, reads=reads, minlibraries=minlibraries)
exp_genes_df <- get_gene_stats(cpm_table=cpm_table, reads=reads)
Expand All @@ -282,7 +305,9 @@ main_degenes_Hunter <- function(
final_results[["mean_counts_df"]] <- mean_counts_df
final_results[["exp_genes_df"]] <- exp_genes_df
final_results[["target"]] <- target

final_results[["numeric_factors"]] <- numeric_factors
final_results[["string_factors"]] <- string_factors
final_results[["PCA_res"]] <- PCA_res
if(!is.null(combinations_WGCNA)){
final_results <- c(final_results, combinations_WGCNA)
}
Expand Down Expand Up @@ -376,12 +401,7 @@ check_input_main_degenes_Hunter <- function(raw,
}

# If factors are specified but WGCNA not selected, throw a warning.
if((string_factors != "" | numeric_factors != "") &
(!grepl("W", modules) | is.null(target))) {
warning(paste0("If you wish to use factors for the correlation analysis",
" you must also run WGCNA and include a target file. The -S",
" and -N options will be ignored"))
}

return(list(modules=modules,
active_modules=active_modules,
minpack_common=minpack_common,
Expand Down
11 changes: 0 additions & 11 deletions R/statistics_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -324,14 +324,3 @@ filter_and_integrate_OR <- function(cont_table, p.adjust.method = "BH" ,p_thr =
return(list(integrated_stats = integrated_stats, miRNA_ct = miRNA_ct))
}

#' @importFrom FactoInvestigate dimRestrict eigenRef
get_PCA_dimensions <- function(pca_obj, min_dimensions = 2) {
ref = FactoInvestigate::eigenRef(pca_obj, time = "10s", parallel=FALSE) # to avoid use parallel computation that greedy takes all cpu cores
rand = c(ref$inertia[1], diff(ref$inertia)) * 100
keep_dimensions <- FactoInvestigate::dimRestrict(pca_obj, rand = rand)
if(keep_dimensions < min_dimensions){
keep_dimensions <- min_dimensions
message('Significant axis are less than 2. The first two axis will be selected to continue the analysis')
}
return(keep_dimensions)
}
58 changes: 57 additions & 1 deletion R/write_report.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,68 @@ write_expression_report <- function(exp_results,
coverage_df <- exp_results[['coverage_df']]
mean_counts_df <- exp_results[['mean_counts_df']]
exp_genes_df <- exp_results[['exp_genes_df']]

numeric_factors <- exp_results[["numeric_factors"]]
string_factors <- exp_results[["string_factors"]]
PCA_res <- exp_results[["PCA_res"]]
outf <- file.path(normalizePath(output_files),"DEG_report.html")
rmarkdown::render(file.path(template_folder, 'main_report.Rmd'),
output_file = outf, intermediates_dir = output_files)
}

write_expression_data <- function(final_results, output_files, opt = NULL, template_folder){

write.table(final_results[['raw_filter']],
file=file.path(output_files, "filtered_count_data.txt"), quote=FALSE,
col.names=NA, sep="\t")
write.table(final_results[['sample_groups']], file=file.path(output_files,
"control_treatment.txt"), row.names=FALSE, quote=FALSE, sep="\t")
write_df_list_as_tables(final_results[['all_data_normalized']],
prefix = 'Normalized_counts_', root = output_files)
write_df_list_as_tables(final_results[['all_counts_for_plotting']],
prefix = 'allgenes_', root = output_files)
dir.create(file.path(output_files, "Common_results"))
write.table(final_results[['DE_all_genes']], file=file.path(output_files,
"Common_results", "hunter_results_table.txt"), quote=FALSE,
row.names=TRUE, sep="\t")

write_pca_data(final_results[['PCA_res']], output_files)
}

write_pca_data <- function(PCA_res, output_files){
pca_output <- file.path(output_files, "PCA_results")
dir.create(pca_output)
all_genes_pca <- PCA_res$all_genes$dim_data_merged

all_genes_merged_metrics <- merge_dim_table_metrics(all_genes_pca)

write.table(all_genes_merged_metrics, file = file.path(pca_output, "all_genes_dim_metrics.txt"),sep = "\t", quote = FALSE, row.names=FALSE)

prevalent_pca <- PCA_res$DEGs$dim_data_merged
if (!is.null(prevalent_pca)){
prevalent_merged_metrics <- merge_dim_table_metrics(prevalent_pca)
write.table(prevalent_merged_metrics, file = file.path(pca_output, "prevalent_dim_metrics.txt"),sep = "\t", quote = FALSE, row.names=FALSE)

}

}

merge_dim_table_metrics <- function(merged_dim_table){

names(merged_dim_table$qualitative)[names(merged_dim_table$qualitative) == "R2"] <- "metric"
merged_dim_table$qualitative$metric_type <- "R2"

names(merged_dim_table$quantitative)[names(merged_dim_table$quantitative) == "correlation"] <- "metric"
merged_dim_table$quantitative$metric_type <- "correlation"

names(merged_dim_table$qual_category)[names(merged_dim_table$qual_category) == "Estimate"] <- "metric"
merged_dim_table$qual_category$metric_type <- "coord_var_barycentre"
dim_data_merged <-data.table::rbindlist(merged_dim_table, use.names = TRUE,idcol = "var_type")
dim_data_merged <- as.data.frame(dim_data_merged)
dim_data_merged <- dim_data_merged[,c("factor","var_type" ,"metric_type", "dimension","metric","p.value")]
return(dim_data_merged)
}


write_global_cormit <- function(
strategies,
cont_tables,
Expand Down
23 changes: 8 additions & 15 deletions inst/scripts/degenes_Hunter.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ if( Sys.getenv('DEGHUNTER_MODE') == 'DEVELOPMENT' ){
custom_libraries <- c('main_degenes_Hunter.R', 'io_handling.R',
'general_functions.R', 'dif_expression_packages.R',
'qc_and_benchmarking_functions.R', 'correlation_packages.R',
'plotting_functions.R', 'write_report.R', "statistics_functions.R")
'plotting_functions.R', 'write_report.R', "statistics_functions.R", "factor_mining.R")
for (lib in custom_libraries){
source(file.path(root_path, 'R', lib))
}
Expand Down Expand Up @@ -274,17 +274,10 @@ final_results <- main_degenes_Hunter(
#############################################################################
#WRITE OUTPUT
############################################################################
write.table(final_results[['raw_filter']],
file=file.path(opt$output_files, "filtered_count_data.txt"), quote=FALSE,
col.names=NA, sep="\t")
write.table(final_results[['sample_groups']], file=file.path(opt$output_files,
"control_treatment.txt"), row.names=FALSE, quote=FALSE, sep="\t")
write_df_list_as_tables(final_results[['all_data_normalized']],
prefix = 'Normalized_counts_', root = opt$output_files)
write_df_list_as_tables(final_results[['all_counts_for_plotting']],
prefix = 'allgenes_', root = opt$output_files)
dir.create(file.path(opt$output_files, "Common_results"))
write.table(final_results[['DE_all_genes']], file=file.path(opt$output_files,
"Common_results", "hunter_results_table.txt"), quote=FALSE,
row.names=TRUE, sep="\t")
write_expression_report(final_results, opt$output_files, template_folder, opt)


write_expression_data(final_results, opt$output_files, template_folder, opt)

write_expression_report(final_results, opt$output_files, template_folder, opt)


7 changes: 4 additions & 3 deletions inst/templates/dea_results.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -177,9 +177,10 @@ plot(evp)

```{r config_PCA_DEA, echo = FALSE, results = "asis", warning = TRUE, message = TRUE}
pca_data <- as.data.frame(t(all_data_normalized[["default"]]))
prevalen_degs <- rownames(DE_all_genes[DE_all_genes$genes_tag == "PREVALENT_DEG",])
pca_data <- pca_data[,prevalen_degs]
pca_data <- PCA_res$DEGs$pca_data
dim_to_keep <- PCA_res$DEGs$dim_to_keep
dim_data <- PCA_res$DEGs$dim_data
dim_data_merged <- PCA_res$DEGs$dim_data_merged
```

Expand Down
5 changes: 4 additions & 1 deletion inst/templates/expression_qc.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,10 @@ gplots::heatmap.2(x = res, col = col, symm = TRUE, margins = rep(max(nchar(colna
```

```{r config_PCA, echo = FALSE, results = "asis", warning = TRUE, message = TRUE}
pca_data <- as.data.frame(t(all_data_normalized[["default"]]))
pca_data <- PCA_res$all_genes$pca_data
dim_to_keep <- PCA_res$all_genes$dim_to_keep
dim_data <- PCA_res$all_genes$dim_data
dim_data_merged <- PCA_res$all_genes$dim_data_merged
```

Expand Down
Loading

0 comments on commit e937d93

Please sign in to comment.