Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into aberrant_expression
Browse files Browse the repository at this point in the history
  • Loading branch information
AEstebanMar committed Oct 9, 2024
2 parents 4f13c82 + 708d7e3 commit f1409ce
Show file tree
Hide file tree
Showing 38 changed files with 1,849 additions and 402 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ importFrom(DOSE,setReadable)
importFrom(DT,datatable)
importFrom(FactoInvestigate,dimRestrict)
importFrom(FactoInvestigate,eigenRef)
importFrom(FactoMineR,HCPC)
importFrom(FactoMineR,PCA)
importFrom(FactoMineR,dimdesc)
importFrom(GO.db,GOBPANCESTOR)
Expand Down
140 changes: 119 additions & 21 deletions R/factor_mining.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ merge_dim_tables <- function(dim_data_simp){
}

#' @importFrom FactoInvestigate dimRestrict eigenRef
get_PCA_dimensions <- function(pca_obj, min_dimensions = 2) {
ref = FactoInvestigate::eigenRef(pca_obj, time = "10s", parallel=FALSE) # to avoid use parallel computation that greedy takes all cpu cores
get_PCA_dimensions <- function(pca_obj, min_dimensions = 2, time = "10s") {
ref = FactoInvestigate::eigenRef(pca_obj, time = time, parallel=FALSE) # to avoid use parallel computation that greedy takes all cpu cores
rand = c(ref$inertia[1], diff(ref$inertia)) * 100
keep_dimensions <- FactoInvestigate::dimRestrict(pca_obj, rand = rand)
if(keep_dimensions < min_dimensions){
Expand All @@ -44,38 +44,136 @@ get_PCA_dimensions <- function(pca_obj, min_dimensions = 2) {
return(keep_dimensions)
}

#' @importFrom FactoMineR PCA dimdesc
#' @importFrom FactoMineR PCA dimdesc HCPC
compute_pca <- function(pca_data,
target,
string_factors = NULL,
numeric_factors = NULL) {
pca_data <- as.data.frame(t(pca_data))
target = NULL,
string_factors = NULL,
numeric_factors = NULL,
transpose = TRUE,
add_samples = NULL,
min_dimensions = 2) {

if (transpose)
pca_data <- as.data.frame(t(pca_data))

std_pca <- FactoMineR::PCA(pca_data, scale.unit=TRUE,
graph = FALSE)
dim_to_keep <- get_PCA_dimensions(std_pca)
if (!is.null(target)) {
rownames(target) <- as.character(target$sample)

rownames(target) <- as.character(target$sample)
if (!is.null(numeric_factors)){
pca_data <- merge_factors(pca_data, target, numeric_factors)

if (!is.null(numeric_factors)){
pca_data <- merge_factors(pca_data, target, numeric_factors)
}

}
if (!is.null(string_factors)) {
pca_data <- merge_factors(pca_data, target, string_factors)
}
}

if (!is.null(string_factors)) {
pca_data <- merge_factors(pca_data, target, string_factors)
raw_pca_data <- pca_data[!rownames(pca_data) %in% add_samples,
! colnames(pca_data) %in% c(numeric_factors, string_factors)]

std_pca <- FactoMineR::PCA(raw_pca_data, scale.unit=TRUE,
graph = FALSE)
dim_to_keep <- get_PCA_dimensions(std_pca, min_dimensions = min_dimensions)


if (!is.null(add_samples)) {
add_samples_idx <- match(add_samples, rownames(pca_data))
} else {
add_samples_idx <- NULL
}

pca_res <- FactoMineR::PCA(pca_data,ncp = dim_to_keep,
pca_res <- FactoMineR::PCA(pca_data, ncp = dim_to_keep,
scale.unit=TRUE,
graph = FALSE,
quanti.sup = numeric_factors,
quali.sup=string_factors)
dim_data <- FactoMineR::dimdesc(pca_res, axes=seq(1, dim_to_keep))
dim_data_merged <- merge_dim_tables(dim_data)
quali.sup=string_factors,
ind.sup = add_samples_idx)
dim_data <- FactoMineR::dimdesc(pca_res, axes=seq(1, dim_to_keep))
dim_data_merged <- merge_dim_tables(dim_data)

res.hcpc <- FactoMineR::HCPC(pca_res, graph = FALSE, nb.clust = -1)

return(list(pca_data = pca_res,
dim_to_keep = dim_to_keep,
dim_data = dim_data,
dim_data_merged = dim_data_merged))
dim_data_merged = dim_data_merged,
res.hcpc = res.hcpc))
}



get_cluster_string_assoc <- function(res.hcpc, string_factors){

all_factor_clusters <- data.frame()
for (add_factor in string_factors) {
clusters_ct <- ct_from_qual(res.hcpc$data.clust[,c("clust", add_factor)])
clusters_ct$sub <- paste(add_factor,clusters_ct$sub, sep = ":")
colnames(clusters_ct)[match(c("ref","sub"),colnames(clusters_ct))] <- c("Cluster","Category")
clusters_ct$fisher.test.pval <- v.fisher.test(clusters_ct)
clusters_ct$FDR <- p.adjust(clusters_ct$fisher.test.pval, method = "BH")
all_factor_clusters <- rbind(all_factor_clusters, clusters_ct)
}
return(all_factor_clusters)
}

parse_eigenvectors <- function(eigenvectors, eig_abs_cutoff = NULL) {

parsed_eigvec <- list()
for (Dim in names(eigenvectors)) {
pDim <- gsub("Dim","PC",Dim)
eigen_vec <- eigenvectors[[Dim]]

if (!is.null(eig_abs_cutoff))
eigen_vec <- eigen_vec[abs(eigen_vec) > eig_abs_cutoff]
# eigen_vec[abs(eigen_vec) < eig_abs_cutoff] <- 0

parsed_eigvec[[paste("positive", pDim, sep = ".")]] <- eigen_vec
parsed_eigvec[[paste("negative", pDim, sep = ".")]] <- eigen_vec * -1
}
return(parsed_eigvec)
}

get_and_parse_pca_eigenvectors <- function(pca_res, cor_pval_cutoff = 0.05, eig_abs_cutoff = NULL){
dimnames(pca_res$pca_data$svd$V) <- dimnames(pca_res$pca_data$var$cor)
eigenvectors <- name_all_columns(as.data.frame(pca_res$pca_data$svd$V))
correlations <- pca_res$pca_data$var$cor
n <- nrow(pca_res$pca_data$ind$cos2)
cor_sig <- cor_pval(correlations, n)
eigenvectors <- filter_eigenvectors(eigenvectors, cor_sig, pval_cutoff = cor_pval_cutoff)
eigenvectors <- parse_eigenvectors(eigenvectors, eig_abs_cutoff = eig_abs_cutoff)
return(eigenvectors)

}

get_and_parse_clusters <- function(pca_res){
clusters <- pca_res$res.hcpc$call$X
dims <- colnames(clusters)[grepl("Dim.", colnames(clusters))]
weights <- clusters |> dplyr::group_by(clust) |> dplyr::summarise_at(dims, mean)
dimnames(pca_res$pca_data$svd$V) <- dimnames(pca_res$pca_data$var$cor)
clust_names <- paste("Cluster",weights$clust )
weights <- as.data.frame(dplyr::select(weights, -clust))
rownames(weights) <- clust_names
weighted_eigen <- apply(weights,1, get_clust_contributions,
eigenvectors = pca_res$pca_data$svd$V)
colnames(weighted_eigen) <- rownames(weights)
weighted_eigen <- as.data.frame(weighted_eigen)
weighted_eigen <- lapply(weighted_eigen, function(col) setNames(col, rownames(weighted_eigen)))
return(weighted_eigen)
}

get_clust_contributions <- function(weigths, eigenvectors) {
weighted_eig <- weigths * eigenvectors
rowSums(weighted_eig)
}


filter_eigenvectors <- function(eigenvectors, cor_sig, pval_cutoff) {
eigenvec_fil <- lapply(names(eigenvectors), function(dimension) {
eigenvec <- eigenvectors[[dimension]]
significance <- cor_sig[,dimension]
eigenvec[significance <= pval_cutoff]
})
names(eigenvec_fil) <- names(eigenvectors)
return(eigenvec_fil)
}
169 changes: 126 additions & 43 deletions R/functional_analysis_library.R
Original file line number Diff line number Diff line change
Expand Up @@ -280,54 +280,111 @@ translate_gmt <- function(gmt, gene_keytype, org_db){
#' @importClassesFrom topGO topGOdata
#' @importFrom topGO annFUN
multienricher_topGO <- function(all_funsys, genes_list, universe=NULL,
organism_info, gene_id="entrez", algorithm = "classic", statistic = "fisher",
nodeSize = 5, task_size=1, workers=1) {
organism_info, gene_id="entrez", p_value_cutoff = 0.05, algorithm = "classic", statistic = "fisher",
nodeSize = 5, task_size=1, workers=1, scoreOrder = "increasing", clean_parentals = FALSE) {

unlisted_input_flag <- FALSE
if(! is.list(genes_list)) {
unlisted_input_flag <- TRUE

#checking input format
if (!is.list(genes_list))
genes_list <- list(genes_list)


if (!any(all_funsys %in% c("CC","BP","MF"))) {
message("None GO subontology has been selected")
return(NULL)
}

org_db <- organism_info$Bioconductor_DB[1]
enrichments_topGO <- vector("list", length(all_funsys))
names(enrichments_topGO) <- all_funsys
for(funsys in all_funsys) {
if (funsys %in% c("CC","BP","MF")){
if(is.null(universe)) {
go_to_genes <- topGO::annFUN.org(funsys, mapping = org_db, ID = gene_id)
universe <- unique(unlist(go_to_genes))
}
org_db <- organism_info$Bioconductor_DB[1]

geneList <- factor(as.integer(universe %in% genes_list[[1]]))
names(geneList) <- universe

all_funsys <- all_funsys[all_funsys %in% c("CC","BP","MF")]
enrichments_topGO <- as.list(all_funsys)
names(enrichments_topGO) <- all_funsys
enrichments_topGO <- lapply(enrichments_topGO, multi_topGOTest, genes_list = genes_list,
nodeSize = nodeSize, org_db = org_db, p_value_cutoff = p_value_cutoff, universe = universe,
gene_id = gene_id, algorithm = algorithm, statistic = statistic,
scoreOrder = scoreOrder, clean_parentals = clean_parentals,workers = workers, task_size = task_size)
return(enrichments_topGO)
}

# Create environment variables used for initialization of the topGOdata obj
topGO::groupGOTerms()
GOdata <- new("topGOdata",
ontology = funsys,
allGenes = geneList,
nodeSize = nodeSize,
mapping = org_db,
annotationFun = topGO::annFUN.org,
ID = gene_id)
}
multi_topGOTest <- function(funsys, genes_list, nodeSize = 5, org_db,
universe = NULL, gene_id = "entrez",p_value_cutoff = 0.05, algorithm = "classic", statistic = "fisher",
scoreOrder = "increasing", workers = 1, task_size = 1, geneSel = NULL, clean_parentals = FALSE) {
if(is.null(universe) && statistic == "fisher") {
go_to_genes <- topGO::annFUN.org(funsys, mapping = org_db, ID = gene_id)
universe <- unique(unlist(go_to_genes))
}

enriched_cats <- parallel_list(genes_list, function(l_genes) {
geneList <- factor(as.integer(universe %in% l_genes))
names(geneList) <- universe
l_GOdata <- topGO::updateGenes(object = GOdata, geneList = geneList)
resultFis <- topGO::runTest(l_GOdata, algorithm = algorithm,
statistic = statistic)
}, workers = workers, task_size = task_size )

if(unlisted_input_flag) enriched_cats <- enriched_cats[[1]]
enrichments_topGO[[funsys]] <- enriched_cats
if (is.null(geneSel))
geneSel <- get_default_geneSel(statistic)

genes_list <- lapply(genes_list, function(l_genes){
if (is.numeric(l_genes)) {
if (!statistic %in% c("ks", "t"))
stop("ERROR: numeric scores can be only computed using 'ks' and 't' statistic")

return(l_genes)
} else {
if (statistic %in% c("ks", "t"))
stop("ERROR: 'ks' and 't' can be only computed using numeric scores")
l_genes <- factor(as.integer(universe %in% l_genes))
names(l_genes) <- universe
return(l_genes)
}
})
all_names <- unique(unlist(lapply(genes_list, names)))
if (statistic == "fisher") {
rand <- factor(c(1,2))
} else {
rand <- seq(0,1,0.1)
}
return(enrichments_topGO)

all_genes <- sample(rand, length(all_names), replace = TRUE)
names(all_genes) <- all_names
# Create environment variables used for initialization of the topGOdata obj
topGO::groupGOTerms()
enriched_cats <- parallel_list(genes_list, function(l_genes) {
if(length(l_genes) == 0)
return(data.frame())
GOdata <- new("topGOdata",
ontology = funsys,
allGenes = l_genes,
nodeSize = nodeSize,
mapping = org_db,
geneSel = geneSel,
annotationFun = topGO::annFUN.org,
ID = gene_id)

result <- topGO::runTest(GOdata, algorithm = algorithm,
statistic = statistic, scoreOrder = scoreOrder)
res_table <- topGO::GenTable(GOdata,
p.value = result,
topNodes = length(result@score),
format.FUN = function(x, dig, eps){x})
if(clean_parentals)
res_table <- clean_topGO_parentals(res_table, funsys, p_value_cutoff)
res_table$fdr <- p.adjust(res_table$p.value, method = "BH", n = length(res_table$p.value))
res_table

}, workers = workers, task_size = task_size )

if (length(enriched_cats) == 1)
enriched_cats <- enriched_cats[[1]]
return(enriched_cats)

}

get_default_geneSel <-function(statistic){
if (statistic == "fisher"){
return(NULL)
} else {
return(function(x){x})
}
}





#' Perform gsea enrichment analysis of a list of genes
#' @export
#' @param all_funsys vector of funsys to use (e.g. MF, Reatcome)
Expand Down Expand Up @@ -1054,6 +1111,36 @@ clean_all_parentals <- function(enr_obj, subont){
return(enr_obj)
}


#' @importFrom GO.db GOBPANCESTOR GOMFANCESTOR GOCCANCESTOR
clean_topGO_parentals <- function(top_GO_enr, subont, p_value_cutoff = 0.05){
if (nrow(top_GO_enr) == 0) return(top_GO_enr)

if (subont=="BP"){
GO_ancestors <- GO.db::GOBPANCESTOR
} else if (subont=="MF"){
GO_ancestors <- GO.db::GOMFANCESTOR
} else if (subont=="CC"){
GO_ancestors <- GO.db::GOCCANCESTOR
}
GO_ancestors <- as.list(GO_ancestors)
GO_ancestors <- GO_ancestors[!is.na(GO_ancestors)]
obsolete_terms <- names(as.list(GO.db::GOOBSOLETE))
top_GO_enr <- top_GO_enr[!top_GO_enr$GO.ID %in% obsolete_terms,]
go_to_keep <- rep(TRUE, nrow(top_GO_enr))
names(go_to_keep) <- top_GO_enr$GO.ID

for (go_term in top_GO_enr[top_GO_enr$p.value < p_value_cutoff,"GO.ID"]){
term_stats <- top_GO_enr[top_GO_enr$GO.ID == go_term,]
go_to_keep[names(go_to_keep) %in% GO_ancestors[[go_term]]] <- FALSE
}
top_GO_enr <- top_GO_enr[go_to_keep,]
return(top_GO_enr)
}




#' @importFrom GO.db GOTERM
get_GOid_term <- function(GOid, output = "term"){
term <- AnnotationDbi::Term(GO.db::GOTERM[GOid])
Expand Down Expand Up @@ -1200,11 +1287,7 @@ enrich_density <- function(enrich_result,
showCategory = 30) {

terms_attr <- get_attr_by_terms(enrich_result, attributes)
enrich_result <- enrichplot:::fortify.internal(
enrich_result,
showCategory = showCategory,
by = "p.adjust",
)
enrich_result <- enrichplot:::fortify.enrichResult(enrich_result,showCategory = showCategory,by = "p.adjust")

enriched_dist <- merge(terms_attr, enrich_result, by ="ID", all.y = TRUE)
plot <- ggplot2::ggplot(enriched_dist, ggplot2::aes(x = attribute,
Expand Down
9 changes: 9 additions & 0 deletions R/general_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -269,4 +269,13 @@ remove_pattern_from_df <- function(dataframe, rgx){
remove_text_from_column <- function(column, rgx) {
parsed_col <- gsub(rgx, "", column)
return(parsed_col)
}

name_column <- function(column, row_names){
names(column) <- row_names
column
}

name_all_columns <- function(data_frame) {
lapply(data_frame, name_column, row_names = rownames(data_frame))
}
Loading

0 comments on commit f1409ce

Please sign in to comment.