Merge remote-tracking branch 'origin/master' into aberrant_expression

seoanezonjic · Oct 9, 2024 · f1409ce · f1409ce
2 parents 4f13c82 + 708d7e3
commit f1409ce
Show file tree

Hide file tree

Showing 38 changed files with 1,849 additions and 402 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -60,6 +60,7 @@ importFrom(DOSE,setReadable)
 importFrom(DT,datatable)
 importFrom(FactoInvestigate,dimRestrict)
 importFrom(FactoInvestigate,eigenRef)
+importFrom(FactoMineR,HCPC)
 importFrom(FactoMineR,PCA)
 importFrom(FactoMineR,dimdesc)
 importFrom(GO.db,GOBPANCESTOR)

diff --git a/R/factor_mining.R b/R/factor_mining.R
@@ -33,8 +33,8 @@ merge_dim_tables <- function(dim_data_simp){
 }
 
 #' @importFrom FactoInvestigate dimRestrict eigenRef
-get_PCA_dimensions <- function(pca_obj, min_dimensions = 2) {
-    ref = FactoInvestigate::eigenRef(pca_obj, time = "10s", parallel=FALSE) # to avoid use parallel computation that greedy takes all cpu cores
+get_PCA_dimensions <- function(pca_obj, min_dimensions = 2, time = "10s") {
+    ref = FactoInvestigate::eigenRef(pca_obj, time = time, parallel=FALSE) # to avoid use parallel computation that greedy takes all cpu cores
     rand = c(ref$inertia[1], diff(ref$inertia)) * 100
     keep_dimensions <- FactoInvestigate::dimRestrict(pca_obj, rand = rand)
     if(keep_dimensions < min_dimensions){
@@ -44,38 +44,136 @@ get_PCA_dimensions <- function(pca_obj, min_dimensions = 2) {
     return(keep_dimensions)
 }
 
-#' @importFrom FactoMineR PCA dimdesc
+#' @importFrom FactoMineR PCA dimdesc HCPC
 compute_pca <- function(pca_data, 
-												target, 
-												string_factors = NULL, 
-												numeric_factors = NULL) {
-	pca_data <- as.data.frame(t(pca_data))
+						target = NULL, 
+						string_factors = NULL, 
+						numeric_factors = NULL, 
+						transpose = TRUE,
+						add_samples = NULL,
+						min_dimensions = 2) {
+
+	if (transpose) 
+		pca_data <- as.data.frame(t(pca_data))
 
-	std_pca <- FactoMineR::PCA(pca_data, scale.unit=TRUE, 
-																				graph = FALSE)                                                     
-	dim_to_keep <- get_PCA_dimensions(std_pca)
+	if (!is.null(target)) {
+		rownames(target) <- as.character(target$sample)
 
-	rownames(target) <- as.character(target$sample)
+		if (!is.null(numeric_factors)){
+			pca_data <- merge_factors(pca_data, target, numeric_factors)
 
-	if (!is.null(numeric_factors)){
-		pca_data <- merge_factors(pca_data, target, numeric_factors)
+		}
 
-	}
+		if (!is.null(string_factors)) {
+		  pca_data <- merge_factors(pca_data, target, string_factors)
+		}
+	} 
 
-	if (!is.null(string_factors)) {
-	  pca_data <- merge_factors(pca_data, target, string_factors)
+	raw_pca_data <- pca_data[!rownames(pca_data) %in% add_samples,
+							 ! colnames(pca_data) %in% c(numeric_factors, string_factors)]
+
+	std_pca <- FactoMineR::PCA(raw_pca_data, scale.unit=TRUE, 
+	  							graph = FALSE)                                                     
+	dim_to_keep <- get_PCA_dimensions(std_pca, min_dimensions = min_dimensions)
+
+
+	if (!is.null(add_samples)) {
+		add_samples_idx <- match(add_samples, rownames(pca_data))
+	} else {
+		add_samples_idx <- NULL
 	}
 
-	pca_res <- FactoMineR::PCA(pca_data,ncp = dim_to_keep, 
+	pca_res <- FactoMineR::PCA(pca_data, ncp = dim_to_keep, 
 										scale.unit=TRUE, 
 										graph = FALSE,
 										quanti.sup = numeric_factors, 
-										quali.sup=string_factors)	
-	dim_data <- FactoMineR::dimdesc(pca_res, axes=seq(1, dim_to_keep))
-  dim_data_merged <- merge_dim_tables(dim_data)
+										quali.sup=string_factors,
+										ind.sup = add_samples_idx)	
+ 	dim_data <- FactoMineR::dimdesc(pca_res, axes=seq(1, dim_to_keep))
+    dim_data_merged <- merge_dim_tables(dim_data)
+
+  res.hcpc <- FactoMineR::HCPC(pca_res, graph = FALSE, nb.clust = -1)
 
 	return(list(pca_data = pca_res,
 							dim_to_keep = dim_to_keep,
 							dim_data = dim_data,
-							dim_data_merged = dim_data_merged))
+							dim_data_merged = dim_data_merged,
+							res.hcpc = res.hcpc))
+}
+
+
+
+get_cluster_string_assoc <- function(res.hcpc, string_factors){
+
+	all_factor_clusters <- data.frame()
+	for (add_factor in string_factors) {
+		clusters_ct <- ct_from_qual(res.hcpc$data.clust[,c("clust", add_factor)])
+		clusters_ct$sub <- paste(add_factor,clusters_ct$sub, sep = ":")
+		colnames(clusters_ct)[match(c("ref","sub"),colnames(clusters_ct))] <- c("Cluster","Category")
+		clusters_ct$fisher.test.pval <- v.fisher.test(clusters_ct)
+		clusters_ct$FDR <- p.adjust(clusters_ct$fisher.test.pval, method = "BH")
+		all_factor_clusters <- rbind(all_factor_clusters, clusters_ct)
+	}
+	return(all_factor_clusters)
+}
+
+parse_eigenvectors <- function(eigenvectors, eig_abs_cutoff = NULL) {
+
+	parsed_eigvec <- list()
+	for (Dim in names(eigenvectors)) {
+		pDim <- gsub("Dim","PC",Dim)
+		eigen_vec <- eigenvectors[[Dim]]
+
+		if (!is.null(eig_abs_cutoff))
+			eigen_vec <- eigen_vec[abs(eigen_vec) > eig_abs_cutoff]
+			# eigen_vec[abs(eigen_vec) < eig_abs_cutoff] <- 0
+
+		parsed_eigvec[[paste("positive", pDim, sep = ".")]] <- eigen_vec
+		parsed_eigvec[[paste("negative", pDim, sep = ".")]] <- eigen_vec * -1
+	}
+	return(parsed_eigvec)
+}
+
+get_and_parse_pca_eigenvectors <- 	function(pca_res, cor_pval_cutoff = 0.05, eig_abs_cutoff = NULL){
+	dimnames(pca_res$pca_data$svd$V) <- dimnames(pca_res$pca_data$var$cor)
+	eigenvectors <- name_all_columns(as.data.frame(pca_res$pca_data$svd$V))
+	correlations <- pca_res$pca_data$var$cor
+	n <- nrow(pca_res$pca_data$ind$cos2)
+	cor_sig <- cor_pval(correlations, n)
+	eigenvectors <- filter_eigenvectors(eigenvectors, cor_sig, pval_cutoff = cor_pval_cutoff)
+	eigenvectors <- parse_eigenvectors(eigenvectors, eig_abs_cutoff = eig_abs_cutoff)
+	return(eigenvectors)
+
+}
+
+get_and_parse_clusters <- function(pca_res){
+	clusters <- pca_res$res.hcpc$call$X
+	dims <- colnames(clusters)[grepl("Dim.", colnames(clusters))]
+	weights <- clusters |> dplyr::group_by(clust) |> dplyr::summarise_at(dims, mean)
+	dimnames(pca_res$pca_data$svd$V) <- dimnames(pca_res$pca_data$var$cor)
+	clust_names <- paste("Cluster",weights$clust )
+	weights <- as.data.frame(dplyr::select(weights, -clust))
+	rownames(weights) <- clust_names
+	weighted_eigen <- apply(weights,1, get_clust_contributions, 
+	                                   eigenvectors = pca_res$pca_data$svd$V)
+	colnames(weighted_eigen) <- rownames(weights)
+	weighted_eigen <- as.data.frame(weighted_eigen)
+	weighted_eigen <- lapply(weighted_eigen, function(col) setNames(col, rownames(weighted_eigen)))
+	return(weighted_eigen)
 }
+
+get_clust_contributions <- function(weigths, eigenvectors)  {
+	weighted_eig <- weigths * eigenvectors
+	rowSums(weighted_eig)
+}
+
+
+filter_eigenvectors <- function(eigenvectors, cor_sig, pval_cutoff) {
+ 	eigenvec_fil <- lapply(names(eigenvectors), function(dimension) {
+		eigenvec <- eigenvectors[[dimension]]
+		significance <- cor_sig[,dimension]
+	  eigenvec[significance <= pval_cutoff] 
+	})
+	names(eigenvec_fil) <- names(eigenvectors)
+	return(eigenvec_fil)
+}
diff --git a/R/functional_analysis_library.R b/R/functional_analysis_library.R
@@ -280,54 +280,111 @@ translate_gmt <- function(gmt, gene_keytype, org_db){
 #' @importClassesFrom topGO topGOdata
 #' @importFrom topGO annFUN
 multienricher_topGO <- function(all_funsys, genes_list, universe=NULL, 
-  organism_info, gene_id="entrez", algorithm = "classic", statistic = "fisher", 
-  nodeSize = 5, task_size=1, workers=1) {
+  organism_info, gene_id="entrez", p_value_cutoff = 0.05, algorithm = "classic", statistic = "fisher", 
+  nodeSize = 5, task_size=1, workers=1, scoreOrder = "increasing", clean_parentals = FALSE) {
 
-  unlisted_input_flag <- FALSE
-  if(! is.list(genes_list)) {
-    unlisted_input_flag <- TRUE
+
+  #checking input format
+  if (!is.list(genes_list)) 
     genes_list <- list(genes_list)
+
+
+  if (!any(all_funsys %in% c("CC","BP","MF"))) {
+    message("None GO subontology has been selected")
+    return(NULL)
   }
 
- org_db <- organism_info$Bioconductor_DB[1]
- enrichments_topGO <- vector("list", length(all_funsys))
- names(enrichments_topGO) <- all_funsys
- for(funsys in all_funsys) {
-  if (funsys %in% c("CC","BP","MF")){
-    if(is.null(universe)) {
-      go_to_genes <- topGO::annFUN.org(funsys, mapping = org_db, ID = gene_id)
-      universe <- unique(unlist(go_to_genes))
-    }
+  org_db <- organism_info$Bioconductor_DB[1]
 
-    geneList <- factor(as.integer(universe %in% genes_list[[1]]))
-    names(geneList) <- universe
-
+  all_funsys <- all_funsys[all_funsys %in% c("CC","BP","MF")]
+  enrichments_topGO <- as.list(all_funsys)
+  names(enrichments_topGO) <- all_funsys
+  enrichments_topGO <- lapply(enrichments_topGO, multi_topGOTest, genes_list = genes_list,
+                            nodeSize = nodeSize, org_db = org_db, p_value_cutoff = p_value_cutoff, universe = universe, 
+                            gene_id = gene_id, algorithm = algorithm, statistic = statistic,
+                            scoreOrder = scoreOrder, clean_parentals = clean_parentals,workers = workers, task_size = task_size)
+  return(enrichments_topGO)
+}
 
-    # Create environment variables used for initialization of the topGOdata obj
-    topGO::groupGOTerms()
-    GOdata <- new("topGOdata",
-                 ontology = funsys,
-                 allGenes = geneList,
-                 nodeSize = nodeSize,
-                 mapping = org_db,
-                 annotationFun = topGO::annFUN.org,
-                 ID = gene_id)
-    }
+multi_topGOTest <- function(funsys, genes_list, nodeSize = 5, org_db, 
+                            universe = NULL, gene_id = "entrez",p_value_cutoff = 0.05, algorithm = "classic", statistic = "fisher",
+                            scoreOrder = "increasing", workers = 1, task_size = 1, geneSel = NULL, clean_parentals = FALSE) {
+  if(is.null(universe) && statistic == "fisher") {
+    go_to_genes <- topGO::annFUN.org(funsys, mapping = org_db, ID = gene_id)
+    universe <- unique(unlist(go_to_genes))
+  }
 
-    enriched_cats <- parallel_list(genes_list, function(l_genes) {
-      geneList <- factor(as.integer(universe %in% l_genes))
-      names(geneList) <- universe
-      l_GOdata <- topGO::updateGenes(object = GOdata, geneList = geneList)
-      resultFis <- topGO::runTest(l_GOdata, algorithm = algorithm, 
-      statistic = statistic)
-    }, workers = workers, task_size = task_size )
-
-    if(unlisted_input_flag) enriched_cats <- enriched_cats[[1]]
-    enrichments_topGO[[funsys]] <- enriched_cats
+  if (is.null(geneSel)) 
+    geneSel <- get_default_geneSel(statistic)
+
+  genes_list <- lapply(genes_list, function(l_genes){
+    if (is.numeric(l_genes)) { 
+      if (!statistic %in%  c("ks", "t"))
+        stop("ERROR: numeric scores can be only computed using 'ks' and 't' statistic")
+
+      return(l_genes)      
+    } else {
+      if (statistic %in% c("ks", "t"))
+        stop("ERROR: 'ks' and 't' can be only computed using numeric scores") 
+      l_genes <- factor(as.integer(universe %in% l_genes))
+      names(l_genes) <- universe
+      return(l_genes)
+    }   
+  })
+  all_names <- unique(unlist(lapply(genes_list, names)))
+  if (statistic == "fisher") {
+    rand <- factor(c(1,2))
+  } else {
+    rand <- seq(0,1,0.1)
   }
-  return(enrichments_topGO)
+
+  all_genes <- sample(rand, length(all_names), replace = TRUE)
+  names(all_genes) <-  all_names 
+  # Create environment variables used for initialization of the topGOdata obj
+  topGO::groupGOTerms()
+  enriched_cats <- parallel_list(genes_list, function(l_genes) {
+  if(length(l_genes) == 0)
+    return(data.frame())
+  GOdata <- new("topGOdata",
+               ontology = funsys,
+               allGenes = l_genes,
+               nodeSize = nodeSize,
+               mapping = org_db,
+               geneSel = geneSel,
+               annotationFun = topGO::annFUN.org,
+               ID = gene_id)
+
+    result <- topGO::runTest(GOdata, algorithm = algorithm, 
+    statistic = statistic, scoreOrder = scoreOrder)
+    res_table <- topGO::GenTable(GOdata, 
+                                 p.value = result,
+                                 topNodes = length(result@score), 
+                                 format.FUN = function(x, dig, eps){x})
+    if(clean_parentals)
+      res_table <- clean_topGO_parentals(res_table, funsys, p_value_cutoff)
+    res_table$fdr <- p.adjust(res_table$p.value, method = "BH", n = length(res_table$p.value))
+    res_table
+
+  }, workers = workers, task_size = task_size )
+
+  if (length(enriched_cats) == 1)
+    enriched_cats <- enriched_cats[[1]]
+  return(enriched_cats)
+
+}
+
+get_default_geneSel <-function(statistic){
+  if (statistic == "fisher"){
+    return(NULL) 
+  } else {
+    return(function(x){x})
+  } 
 }
 
+
+
+
+
 #' Perform gsea enrichment analysis of a list of genes
 #' @export
 #' @param all_funsys vector of funsys to use (e.g. MF, Reatcome)
@@ -1054,6 +1111,36 @@ clean_all_parentals <- function(enr_obj, subont){
   return(enr_obj)
 }
 
+
+#' @importFrom GO.db GOBPANCESTOR GOMFANCESTOR GOCCANCESTOR
+clean_topGO_parentals <- function(top_GO_enr, subont, p_value_cutoff = 0.05){
+  if (nrow(top_GO_enr) == 0) return(top_GO_enr)
+
+  if (subont=="BP"){
+    GO_ancestors <- GO.db::GOBPANCESTOR
+  } else if (subont=="MF"){
+    GO_ancestors <-  GO.db::GOMFANCESTOR
+  } else if (subont=="CC"){
+    GO_ancestors <- GO.db::GOCCANCESTOR
+  }
+  GO_ancestors <- as.list(GO_ancestors)
+  GO_ancestors <- GO_ancestors[!is.na(GO_ancestors)]
+  obsolete_terms <- names(as.list(GO.db::GOOBSOLETE))
+  top_GO_enr <- top_GO_enr[!top_GO_enr$GO.ID %in% obsolete_terms,]
+  go_to_keep <- rep(TRUE, nrow(top_GO_enr))
+  names(go_to_keep) <- top_GO_enr$GO.ID
+
+  for (go_term in top_GO_enr[top_GO_enr$p.value < p_value_cutoff,"GO.ID"]){ 
+    term_stats <- top_GO_enr[top_GO_enr$GO.ID == go_term,]
+    go_to_keep[names(go_to_keep) %in% GO_ancestors[[go_term]]] <- FALSE  
+  }
+  top_GO_enr <- top_GO_enr[go_to_keep,]
+  return(top_GO_enr)
+}
+
+
+
+
 #' @importFrom GO.db GOTERM
 get_GOid_term <- function(GOid, output = "term"){
  term <- AnnotationDbi::Term(GO.db::GOTERM[GOid])
@@ -1200,11 +1287,7 @@ enrich_density <- function(enrich_result,
                            showCategory = 30) {
 
   terms_attr <- get_attr_by_terms(enrich_result, attributes)
-  enrich_result <- enrichplot:::fortify.internal(
-                                  enrich_result,
-                                  showCategory = showCategory,
-                                  by = "p.adjust",
-                                  )
+  enrich_result <- enrichplot:::fortify.enrichResult(enrich_result,showCategory = showCategory,by = "p.adjust")
 
   enriched_dist <- merge(terms_attr, enrich_result, by ="ID", all.y = TRUE)
   plot <- ggplot2::ggplot(enriched_dist, ggplot2::aes(x = attribute, 

diff --git a/R/general_functions.R b/R/general_functions.R
@@ -269,4 +269,13 @@ remove_pattern_from_df <- function(dataframe, rgx){
 remove_text_from_column <- function(column, rgx) {
   parsed_col <- gsub(rgx, "", column)
   return(parsed_col)
+}
+
+name_column <- function(column, row_names){
+  names(column) <- row_names
+  column
+}
+
+name_all_columns <- function(data_frame) {
+   lapply(data_frame, name_column, row_names = rownames(data_frame))
 }