diff --git a/DESCRIPTION b/DESCRIPTION index 7528e55..5e57e89 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -65,62 +65,54 @@ Suggests: testthat (>= 3.0.0) VignetteBuilder: knitr Imports: - ReactomePA (>= 1.46.0), - limma (>= 3.58.0), - edgeR (>= 4.0.15), - NOISeq (>= 2.46.0), - biomaRt (>= 2.58.0), - topGO (>= 2.54.0), - diffcoexp (>= 1.22.0), - DT (>= 0.30), - ggplot2 (>= 3.4.4), - stringr (>= 1.5.0), - WGCNA (>= 1.72.1), - dplyr (>= 1.1.3), - BiocGenerics (>= 0.48.0), - enrichplot (>= 1.22.0), - rmarkdown (>= 2.25), - stats (>= 4.3.1), - Biobase (>= 2.62.0), - DESeq2 (>= 1.42.0), - ROCR (>= 1.0.11), - data.table (>= 1.14.8), - knitr (>= 1.44), - magrittr (>= 2.0.3), - SummarizedExperiment (>= 1.32.0), - miRBaseVersions.db (>= 1.1.0), - grDevices (>= 4.3.1), - graphics (>= 4.3.1), - utils (>= 4.3.1), - BiocParallel (>= 1.36.0), - MKinfer (>= 1.1), - matrixStats (>= 1.0.0), - ggupset (>= 0.3.0), - rlang (>= 1.1.1), - plyr (>= 1.8.9), - tidyr (>= 1.3.0), - GO.db (>= 3.18.0), - Matrix (>= 1.6.1.1), - fastcluster (>= 1.2.3), - DOSE (>= 3.28.0), - heatmaply (>= 1.5.0), - EnhancedVolcano (>= 1.20.0), - ggrepel (>= 0.9.4), - clusterProfiler (>= 4.11.0), - GenomicRanges (>= 1.54.0), - GenomicFeatures (>= 1.54.0), - tximport (>= 1.30.0), - annotatr (>= 1.28.0), - ggridges (>= 0.5.4), - FactoInvestigate (>= 1.8), - FactoMineR (>= 2.9), - cowplot (>= 1.1.1), - S4Vectors (>= 0.40.0), - OUTRIDER (>= 1.20.0), - MatrixGenerics (>= 1.14.0), - GenomeInfoDb (>= 1.38.0), - yaml (>= 2.3.7), - rtracklayer (>= 1.62.0), - AnnotationDbi (>= 1.64.0), - htmlreportR (>= 1.0.0) + ReactomePA, + limma, + edgeR, + NOISeq, + biomaRt, + topGO, + diffcoexp, + DT, + ggplot2, + stringr, + WGCNA, + dplyr, + AnnotationDbi, + BiocGenerics, + enrichplot, + rmarkdown, + stats, + Biobase, + DESeq2, + ROCR, + data.table, + knitr, + magrittr, + SummarizedExperiment, + miRBaseVersions.db, + grDevices, + graphics, + utils, + BiocParallel, + MKinfer, + matrixStats, + ggupset, + rlang, + plyr, + tidyr, + GO.db, + Matrix, + fastcluster, + DOSE, + heatmaply, + EnhancedVolcano, + ggrepel, + clusterProfiler, + GenomicRanges, + GenomicFeatures, + tximport, + annotatr, + ggridges, + FactoInvestigate, + FactoMineR Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index 0591f06..955e58d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,15 +2,7 @@ export(acc) export(degsynth) -export(filter_counts) export(fmeasure) -export(format_aberrants) -export(format_for_report) -export(get_aberrants) -export(get_bcv) -export(get_counts_correlation) -export(get_gene_sample_correlations) -export(get_ods_results) export(get_org_db) export(get_organism_table) export(get_stats) @@ -18,21 +10,16 @@ export(get_stats_from_cm) export(install_DEgenes_hunter) export(load_and_parse_gmt) export(load_synth_dataset) -export(main_abgenes_Hunter) export(main_clusters_to_enrichment) export(main_degenes_Hunter) export(main_functional_hunter) -export(merge_bam_stats) -export(merge_counts) export(multienricher_gsea) export(multienricher_ora) export(multienricher_topGO) export(parse_pseudocounts) export(ppv) -export(preprocess_gtf) export(recall) export(rtable2measures) -export(run_outrider) export(spc) export(target_generation) export(write_clusters_to_enrichment) @@ -40,11 +27,8 @@ export(write_df_list_as_tables) export(write_enrich_files) export(write_expression_report) export(write_functional_report) -import(data.table) importClassesFrom(topGO,topGOdata) importFrom(AnnotationDbi,keys) -importFrom(AnnotationDbi,loadDb) -importFrom(AnnotationDbi,saveDb) importFrom(AnnotationDbi,select) importFrom(Biobase,assayData) importFrom(BiocParallel,MulticoreParam) @@ -71,41 +55,18 @@ importFrom(GO.db,GOMFANCESTOR) importFrom(GO.db,GOMFPARENTS) importFrom(GO.db,GOOBSOLETE) importFrom(GO.db,GOTERM) -importFrom(GenomeInfoDb,keepStandardChromosomes) -importFrom(GenomicFeatures,exonsBy) importFrom(GenomicFeatures,makeTxDbFromGFF) importFrom(GenomicRanges,makeGRangesFromDataFrame) importFrom(MKinfer,boot.t.test) importFrom(Matrix,forceSymmetric) -importFrom(MatrixGenerics,rowQuantiles) importFrom(NOISeq,noiseqbio) importFrom(NOISeq,readData) importFrom(NOISeq,tmm) -importFrom(OUTRIDER,OUTRIDER) -importFrom(OUTRIDER,OutriderDataSet) -importFrom(OUTRIDER,counts) -importFrom(OUTRIDER,estimateSizeFactors) -importFrom(OUTRIDER,filterExpression) -importFrom(OUTRIDER,findEncodingDim) -importFrom(OUTRIDER,fit) -importFrom(OUTRIDER,normalizationFactors) -importFrom(OUTRIDER,plotAberrantPerSample) -importFrom(OUTRIDER,plotCountCorHeatmap) -importFrom(OUTRIDER,plotCountGeneSampleHeatmap) -importFrom(OUTRIDER,plotEncDimSearch) -importFrom(OUTRIDER,results) -importFrom(OUTRIDER,theta) importFrom(ROCR,performance) importFrom(ROCR,prediction) importFrom(ReactomePA,enrichPathway) importFrom(ReactomePA,gsePathway) -importFrom(S4Vectors,endoapply) -importFrom(SummarizedExperiment,SummarizedExperiment) importFrom(SummarizedExperiment,assay) -importFrom(SummarizedExperiment,colData) -importFrom(SummarizedExperiment,mcols) -importFrom(SummarizedExperiment,rowData) -importFrom(SummarizedExperiment,rowRanges) importFrom(WGCNA,allowWGCNAThreads) importFrom(WGCNA,binarizeCategoricalVariable) importFrom(WGCNA,blockwiseModules) @@ -123,11 +84,7 @@ importFrom(clusterProfiler,gseGO) importFrom(clusterProfiler,gseKEGG) importFrom(clusterProfiler,merge_result) importFrom(clusterProfiler,simplify) -importFrom(cowplot,background_grid) -importFrom(cowplot,plot_grid) -importFrom(cowplot,theme_cowplot) importFrom(data.table,as.data.table) -importFrom(data.table,fread) importFrom(data.table,merge.data.table) importFrom(data.table,rbindlist) importFrom(data.table,setnames) @@ -137,7 +94,6 @@ importFrom(dplyr,between) importFrom(dplyr,desc) importFrom(dplyr,filter) importFrom(dplyr,group_by) -importFrom(dplyr,left_join) importFrom(dplyr,row_number) importFrom(edgeR,DGEList) importFrom(edgeR,calcNormFactors) @@ -155,14 +111,10 @@ importFrom(fastcluster,hclust) importFrom(ggplot2,aes) importFrom(ggplot2,aes_string) importFrom(ggplot2,element_blank) -importFrom(ggplot2,element_rect) importFrom(ggplot2,element_text) -importFrom(ggplot2,facet_wrap) importFrom(ggplot2,fortify) importFrom(ggplot2,geom_bar) -importFrom(ggplot2,geom_boxplot) importFrom(ggplot2,geom_count) -importFrom(ggplot2,geom_histogram) importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_text) importFrom(ggplot2,geom_tile) @@ -172,18 +124,13 @@ importFrom(ggplot2,ggplot_gtable) importFrom(ggplot2,ggsave) importFrom(ggplot2,ggtitle) importFrom(ggplot2,guide_colorbar) -importFrom(ggplot2,guide_legend) importFrom(ggplot2,guides) -importFrom(ggplot2,labs) -importFrom(ggplot2,scale_color_brewer) importFrom(ggplot2,scale_colour_gradientn) importFrom(ggplot2,scale_fill_gradient2) importFrom(ggplot2,scale_shape_manual) importFrom(ggplot2,scale_x_continuous) -importFrom(ggplot2,scale_x_log10) importFrom(ggplot2,scale_y_continuous) importFrom(ggplot2,theme) -importFrom(ggplot2,theme_bw) importFrom(ggplot2,theme_classic) importFrom(ggplot2,theme_light) importFrom(ggplot2,theme_minimal) @@ -213,7 +160,6 @@ importFrom(miRBaseVersions.db,miRBaseVersions.db) importFrom(plyr,rbind.fill) importFrom(rlang,.data) importFrom(rmarkdown,render) -importFrom(rtracklayer,import) importFrom(stats,as.dist) importFrom(stats,complete.cases) importFrom(stats,cor) @@ -234,7 +180,6 @@ importFrom(stats,setNames) importFrom(stats,var) importFrom(stringr,str_remove) importFrom(tidyr,unite) -importFrom(tools,file_path_sans_ext) importFrom(topGO,annFUN) importFrom(tximport,tximport) importFrom(utils,assignInNamespace) diff --git a/R/DROP_functions.R b/R/DROP_functions.R deleted file mode 100644 index c8e2bcb..0000000 --- a/R/DROP_functions.R +++ /dev/null @@ -1,719 +0,0 @@ -#' Preprocess GTF for DROP analysis -#' -#' `preprocess_gtf` takes a GTF file and a directory path. It outputs a txdb, -#'count ranges, and gene_mapping files, which it writes to provided directory. -#' @importFrom GenomicFeatures makeTxDbFromGFF exonsBy -#' @importFrom GenomeInfoDb keepStandardChromosomes -#' @importFrom AnnotationDbi saveDb -#' @param gtf GTF file to process. -#' @param outdir Processed files directory. -#' @returns A list containing all three generated objects. -#' @examples -#' gtf <- system.file("extData/testdata", "gencode.v45.toy.annotation.gtf", -#' package = "ExpHunterSuite") -#' @export - -preprocess_gtf <- function(gtf) { - message("Generating TxDb from provided GTF file") - txdb <- GenomicFeatures::makeTxDbFromGFF(gtf) # Adjusted (came from snakemake) - txdb <- GenomeInfoDb::keepStandardChromosomes(txdb) - message("Calculating count ranges") - count_ranges <- GenomicFeatures::exonsBy(txdb, by = "gene") - message("Creating gene-name mapping file") - gene_name_mapping <- .map_genes(gtf) - preprocessing_results <- list(txdb = txdb, - count_ranges = count_ranges, - gene_name_mapping = gene_name_mapping) - return(preprocessing_results) -} - -#' Create gene name mapping file -#' -#' `map_genes` takes a GTF file and builds a gene-name mapping data frame out -#' of it. -#' @inheritParams preprocess_gtf -#' @returns A data frame containing GTF file gene annotation. -#' @importFrom AnnotationDbi saveDb -#' @importFrom tools file_path_sans_ext -#' @importFrom rtracklayer import - -.map_genes <- function(gtf) { - gtf_name <- basename(tools::file_path_sans_ext(gtf)) - gtf_df <- as.data.frame(rtracklayer::import(gtf)) - if (!"gene_name" %in% colnames(gtf_df)) { - gtf_df$gene_name <- gtf_df$gene_id - } - gtf_df <- gtf_df[gtf_df$type == 'gene',] - if('gene_biotype' %in% colnames(gtf_df)) { - colnames(gtf_df)[colnames(gtf_df) == "gene_biotype"] <- "gene_type" - } - # Subset to the following columns only - columns <- c('seqnames', 'start', 'end', 'strand', 'gene_id', - 'gene_name', 'gene_type', 'gene_status') - columns <- intersect(columns, colnames(gtf_df)) - gtf_df <- gtf_df[, columns] - return(gtf_df) -} - -#' Add HPO terms to table -#' -#' `.add_HPO_cols` takes a data table and adds a hgncSymbol column. -#' @param RES Data table to process. -#' @param sample_id_col Name of sample id column. -#' @param gene_name_col Name of column where hgnc symbols will be included. -#' @param hpo_file File containing HPO terms associated to genes. -#' @importFrom data.table fread setnames -#' @returns Data frame with new hgnc symbol column and (optionally) HPO terms -#' associated to symbols. - -.add_HPO_cols <- function(RES, sample_id_col = 'sampleID', - gene_name_col = 'hgncSymbol', hpo_file = NULL){ - - filename <- file.path(hpo_file) # Adjusted. Instead of trying to download it - # it finds it locally. Same change as in original - # DROP (thanks Jim!) - - hpo_dt <- data.table::fread(filename) - - # change column names - data.table::setnames(RES, old = c(sample_id_col, gene_name_col), new = c('sampleID', 'hgncSymbol')) - - # Get HPO terms of all genes - f2 <- merge(unique(RES[, .(sampleID, hgncSymbol)]), - hpo_dt[,.(hgncSymbol, HPO_id, HPO_label, Ngenes_per_pheno, Nphenos_gene)], - by = 'hgncSymbol', allow.cartesian = TRUE) - sa[, HPO_TERMS := gsub(', ', ',', HPO_TERMS)] - - if(nrow(f2) > 0){ - f3 <- merge(f2, sa[,.(RNA_ID, HPO_TERMS)], by.x = 'sampleID', by.y = 'RNA_ID') - f3 <- f3[HPO_TERMS != ''] # remove cases with no HPO terms to speed up - if(nrow(f3) > 0){ - f3[, HPO_match := any(grepl(HPO_id, HPO_TERMS)), by = 1:nrow(f3)] - f3 <- f3[HPO_match == TRUE] #only take those that match HPOs - if(nrow(f3) > 0){ - f4 <- f3[, .(HPO_label_overlap = paste(HPO_label, collapse = ', '), - HPO_id_overlap = paste(HPO_id, collapse = ', '), - Ngenes_per_pheno = paste(Ngenes_per_pheno, collapse = ', '), - Nphenos_gene = unique(Nphenos_gene)), - by = .(sampleID, hgncSymbol)] - RES <- merge(RES, f4, by = c('sampleID', 'hgncSymbol'), all.x = TRUE) - } - } - } - - data.table::setnames(RES, old = c('sampleID', 'hgncSymbol'), new = c(sample_id_col, gene_name_col)) - return(RES) -} - -#' Get counts table from rds file or table -#' -#' `.get_file_counts` takes a file from which to extract a counts table. File may -#' be in RDS or table format. -#' @param file Counts table file. -#' @returns Loaded counts table. -#' @importFrom data.table fread -#' @importFrom SummarizedExperiment assay - -.get_file_counts <- function(file) { - print(file) - if(grepl('rds$', file)) - counts <- SummarizedExperiment::assay(readRDS(file)) - else { - counts <- as.matrix(data.table::fread(file), rownames = "geneID") - } - return(counts) -} - -#' Get unique rownames of a data frame -#' -#' `.get_unique_rownames` takes a data frame and extracts its unique row names. -#' @param df A data frame. -#' @returns Unique rownames of data frame. - -.get_unique_rownames <- function(df) { - res <- sort(unique(rownames(df))) - return(res) -} - -#' Get base sample ID from RNA_ID string generated for sample annotation file -#' -#' `.get_base_ID` takes an RNA_ID string from the DROP sample annotation file. -#' String generated in this way contain not only sample ID, but the name of -#' the program that generated them. This function extracts the sample ID from -#' that string. -#' @param RNA_ID A string containing the sample ID and the program used for -#' read counting, separated by underscores. -#' @returns Sample ID. - -.get_base_ID <- function(RNA_ID) { - base_ID <- .split_string_by_char(RNA_ID, "_", 1) - return(base_ID) -} - -#' Add base ID column to sample annotation data frame -#' -#' `.add_base_IDs` takes a sample annotation data frame and calls .get_base_ID -#' on it, then assigns the results to new BASE_ID column. -#' @param col_data Data frame containing RNA_ID column. -#' @returns The same dataframe, with a new BASE_ID column containing sample ID. -#' If RNA_ID column only contains sample ID, new column will be an exact copy. - -.add_base_IDs <- function(col_data) { - base_IDs <- sapply(col_data$RNA_ID, .get_base_ID) - col_data$BASE_ID <- base_IDs - return(col_data) -} - -#' Estimate BCV (Biological Coefficient of Variation) from Outrider Dataset. -#' -#' `.estimateThetaWithoutAutoCorrect` takes an ODS (Outrider DataSet) object -#' and calculates its biological coefficient of variation, adding it to the ODS. -#' @param ods Outrider DataSet object. -#' @returns An updated ODS containing estimated BCV. -#' @importFrom OUTRIDER OutriderDataSet normalizationFactors counts fit theta -#' @importFrom SummarizedExperiment colData - -.estimateThetaWithoutAutoCorrect <- function(ods){ - - ods1 <- OUTRIDER::OutriderDataSet( - countData = OUTRIDER::counts(ods), - colData = SummarizedExperiment::colData(ods) - ) - # use rowMeans as expected means - OUTRIDER::normalizationFactors(ods1) <- matrix( - rowMeans(OUTRIDER::counts(ods1)), - ncol=ncol(ods1), nrow=nrow(ods1) - ) - ods1 <- OUTRIDER::fit(ods1) - OUTRIDER::theta(ods1) - - return(OUTRIDER::theta(ods1)) -} - -#' Function to separate locally processed and GTEx samples from OUTRIDER -#' results. -#' `.processed_vs_imported` takes a data frame and splits it into two data frames -#' contained in a list. The two elements are "processed" and "imported". -#' All samples whose name contain "GTEX" are considered imported, the rest -#' are considered exported. -#' @param df A data frame containing read counts by sample. -#' @returns A list with two elements, one for processed counts and another -#' for imported counts. - -.processed_vs_imported <- function(df) { - extIndex <- grep("GTEX", df$sampleID) - if(any(extIndex)) { - processed <- df[-grep("GTEX", df$sampleID), ] - imported <- df[grep("GTEX", df$sampleID), ] - return(list(processed = processed, imported = imported)) - } else { - return(list(processed = df)) - } -} - -#' Function to identify aberrantly expressed genes in an outrider results -#' data frame. -#' `get_aberrants` takes an outrider results data frame and creates a column -#' named "aberrant", set to FALSE by default. It then compares each gene's -#' zScore and adjusted p-value to the cutoffs provided and sets that gene's -#' aberrant value to TRUE if it passes the check (zScore greater than cutoff and -#' adjusted p-value lower than cutoff)'. It then subsets the input data frame -#' to samples with at least one aberrantly expressed gene and genes aberrantly -#' expressed in at least one sample. -#' @inheritParams main_abgenes_Hunter -#' @param df An OUTRIDER results data frame. -#' @returns A data frame containing only samples with at least one aberrantly -#' expressed genes and genes aberrantly expressed in at least one sample. -#' @export - -get_aberrants <- function(df, z_score_cutoff, p_adj_cutoff) { - if(is.null(df)) { - return(NULL) - } - df$aberrant <- FALSE - df$aberrant[abs(df$zScore) > z_score_cutoff & df$padjust <= p_adj_cutoff] <- TRUE - aberrants <- df$aberrant==TRUE - genes <- df[aberrants, ]$geneID - samples <- df[aberrants, ]$sampleID - res <- df[df$geneID %in% genes & df$sampleID %in% samples, ] - if((any(is.na(res)))) { - warning("NAs in aberrant genes, removing") - res <- res[!is.na(res$geneID), ] - } - if(nrow(res)==0) { - warning("No aberrants found in input") - return(NULL) - } - return(res) -} - -#' Function to format DROP results -#' `format_aberrants` takes a data frame and subsets it to columns named -#' "sampleID", "geneID", "padjust", "type", "zScore" and "altRatio". It then -#' updates padjust column and takes its negative decimal logarithm, and renames -#' it to p_padjust (as in -log(padjust)). -#' @param input_table A DROP results data frame or data table. -#' @returns A subset of the data frame containing only "sampleID", "geneID", -#' "p_padjust" (calculated from "padjust" column), "type", "zScore" and -#' "altRatio" columns, if they existed in the input data frame. -#' @import data.table -#' @export - -format_aberrants <- function(input_table) { - if(is.null(input_table)) { - warning("NULL input. This can be due to missing module results or no - aberrants found.") - return(NULL) - } - names <- c("sampleID", "geneID", "padjust", "type", "zScore", "altRatio") - matches <- colnames(input_table) %in% names - if("data.table" %in% class(input_table)){ - res <- input_table[, matches, with = FALSE] - } else { - res <- input_table[, matches] - } - res$padjust <- -log(res$padjust) - colnames(res)[colnames(res)=="padjust"] <- "p_padjust" - return(res) -} - -#' Function to parse imported data information in merge_counts function. -#' `.parse_externals` subsets the sample annotation table to entries with -#' "EXTERNAL" column set to "external", and sets up a custom object to allow -#' the incorporation of imported data into OUTRIDER analysis. -#' @inheritParams .get_metadata -#' @param local_counts Counts table to merge with external table -#' @returns A list containing three objects. "run" is a logical that is TRUE -#' if the is an imported counts table. Counts is the counts table extracted -#' from the file. IDs is a vectors containing imported sample IDs. - -.parse_externals <- function(sample_anno, local_counts) { - external_anno <- sample_anno[sample_anno$EXTERNAL=="external", ] - external_file <- unique(external_anno$GENE_COUNTS_FILE) - ex_counts <- data.frame(data.table::fread(external_file), - check.names = FALSE, row.names = "geneID") - ex_counts <- ex_counts[, colnames(ex_counts)!="Description"] - exCountIDs <- external_anno$RNA_ID - if(exCountIDs=="all") { - # Replace string "all" with sample names from external counts - exCountIDs <- colnames(ex_counts) - } else { - ex_counts <- subset(ex_counts, select = exCountIDs) - } - if(!identical(.get_unique_rownames(local_counts), - .get_unique_rownames(ex_counts))){ - stop('The rows (genes) of the count matrices to be - merged are not the same.') - } - return(list(run = nrow(external_anno) > 0, counts = ex_counts, - IDs = exCountIDs)) -} - -#' Function to add rowRanges to merged counts table, and imported counts -#' sample information if imported counts table exists in dataset. -#' `.get_metadata` takes a merged counts table, a sample annotation table, -#' a path to a count_ranges file and a logical, and adds some metadata to -#' merged counts table. -#' @inheritParams main_abgenes_Hunter -#' @param total_counts A merged counts table. -#' @param sample_anno A sample annotation table. -#' @param external A boolean. -#' * `TRUE` : function will check for imported table sample information, and -#' update the merged_table. -#' * `FALSE`: function will only set new rowRanges. -#' @returns A merged counts table with updated rowRanges and imported sample -#' metadata. - -.get_metadata <- function(total_counts, sample_anno, count_ranges, external) { - SummarizedExperiment::rowRanges(total_counts) <- readRDS(count_ranges) - col_data <- data.table::data.table(RNA_ID = as.character( - colnames(total_counts))) - col_data <- dplyr::left_join(col_data, sample_anno, by = "RNA_ID") - col_data <- methods::as(col_data, "DataFrame") - rownames(col_data) <- col_data$RNA_ID - local_IDs <- sample_anno$BASE_ID[sample_anno$EXTERNAL == "no"] - if(external) { - col_data$EXTERNAL[!col_data$BASE_ID %in% local_IDs] <- "external" - } - SummarizedExperiment::colData(total_counts) <- col_data - return(total_counts) -} - -#' Function to extract count tables and merge them into one. -#' `merge_counts` takes multiple paths to count tables and reads them, merging -#' them into one. -#' @inheritParams .get_metadata -#' @inheritParams main_abgenes_Hunter -#' @returns A merged counts table. -#' @export - -merge_counts <- function(cpu, sample_anno, count_files, count_ranges) { - BiocParallel::register(BiocParallel::MulticoreParam(cpu)) - count_files <- strsplit(count_files, " ")[[1]] - local_counts_list <- BiocParallel::bplapply(count_files, .get_file_counts) - merged_assays <- do.call(cbind, local_counts_list) - message(paste("read", length(count_files), 'files')) - external_anno <- sample_anno[sample_anno$EXTERNAL=="external", ] - if(nrow(external_anno) > 0) { - external <- .parse_externals(sample_anno, merged_assays) - merged_assays <- cbind(merged_assays, external$counts) - } else { - external <- list(run = FALSE) - } - merged_counts_table <- cbind(rownames(merged_assays), merged_assays) - rownames(merged_counts_table) <- NULL - colnames(merged_counts_table)[1] <- "gene_ID" - total_counts <- SummarizedExperiment::SummarizedExperiment(assays = list( - counts = as.matrix(merged_assays))) - total_counts <- .get_metadata(total_counts = total_counts, - sample_anno = sample_anno, - count_ranges = count_ranges, - external = external$run) - return(total_counts) -} - -#' Wrapper for DROP OUTRIDER filtering script. -#' `filter_counts` takes a counts table, a txdb object and a fpkm_cutoff to -#' build an OutriderDataSet object and filter it by fpkm. -#' @inheritParams main_abgenes_Hunter -#' @param counts A counts table. -#' @returns A filtered OutriderDataSet built from the merged table. -#' @importFrom OUTRIDER OutriderDataSet filterExpression -#' @importFrom SummarizedExperiment colData rowData assay -#' @export - -filter_counts <- function(counts, txdb, fpkm_cutoff) { - ods <- OUTRIDER::OutriderDataSet(counts) - col_data <- SummarizedExperiment::colData(ods) - row_data <- SummarizedExperiment::rowData(ods) - col_data$EXTERNAL[is.na(col_data$EXTERNAL)] <- "no" - ods <- OUTRIDER::filterExpression(ods, gtfFile=txdb, filter=FALSE, - fpkm_cutoff=fpkm_cutoff, addExpressedGenes=TRUE) - - # add column for genes with at least 1 gene - row_data$counted1sample = rowSums(SummarizedExperiment::assay(ods)) > 0 - - col_data$isExternal <- col_data$EXTERNAL=="external" - return(ods) -} - -#' Wrapper for DROP main OUTRIDER script. -#' `run_outrider` takes a filtered Outrider dataset and runs the OUTRIDER -#' algorithm on it. -#' @inheritParams main_abgenes_Hunter -#' @param ods_unfitted The filtered Outrider dataset on which to run OUTRIDER. -#' @returns A fitted outrider dataset. -#' @export - -run_outrider <- function(ods_unfitted, implementation, max_dim_proportion) { - ods_unfitted <- ods_unfitted[ - SummarizedExperiment::mcols(ods_unfitted)$passedFilter, ] - - gr <- unlist(S4Vectors::endoapply(SummarizedExperiment::rowRanges( - ods_unfitted), range)) - if(length(gr) > 0){ - rd <- SummarizedExperiment::rowData(ods_unfitted) - SummarizedExperiment::rowRanges(ods_unfitted) <- gr - SummarizedExperiment::rowData(ods_unfitted) <- rd - } - - ods_unfitted <- OUTRIDER::estimateSizeFactors(ods_unfitted) - - b <- min(ncol(ods_unfitted), nrow(ods_unfitted)) / max_dim_proportion - - maxSteps <- 15 - if(max_dim_proportion < 4) { - maxSteps <- 20 - } - Nsteps <- min(maxSteps, b) - pars_q <- unique(round(exp(seq(log(5), log(b), length.out = Nsteps)))) - ods_unfitted <- OUTRIDER::findEncodingDim(ods_unfitted, params = pars_q, - implementation = implementation) - - ods <- OUTRIDER::OUTRIDER(ods_unfitted, implementation = implementation) - message("outrider fitting finished") - return(ods) -} - -#' Wrapper for DROP OUTRIDER results script. -#' `get_ods_results` extracts the results from an ods object. -#' @inheritParams main_abgenes_Hunter -#' @param ods A fitted outrider dataset. -#' @returns A list with two items. The first one, named "all", contains all -#' results. The second one, "table", only contains significant results. -#' @export - -get_ods_results <- function(ods, p_adj_cutoff, z_score_cutoff, - gene_mapping_file, sample_anno) { - res <- OUTRIDER::results(ods, padjCutoff = p_adj_cutoff, - zScoreCutoff = z_score_cutoff, all = TRUE) - res[, foldChange := round(2^l2fc, 2)] - res <- res[order(res$padj_rank),] - gene_annot_dt <- data.table::fread(gene_mapping_file) - if(!is.null(gene_annot_dt$gene_name)){ - if(grepl('ENSG00', res[1,geneID]) & grepl('ENSG00', gene_annot_dt[1,gene_id])){ - res <- merge(res, gene_annot_dt[, .(gene_id, gene_name)], - by.x = 'geneID', by.y = 'gene_id', sort = FALSE, all.x = TRUE) - data.table::setnames(res, 'gene_name', 'hgncSymbol') - res <- cbind(res[, .(hgncSymbol)], res[, - 'hgncSymbol']) - } - } - if(!is.null(sample_anno$HPO_TERMS) & nrow(res) > 0){ - if(!all(is.na(sample_anno$HPO_TERMS)) & ! all(sa$HPO_TERMS == '')){ - res <- .add_HPO_cols(res, hpo_file = hpo_file) - } - } - return(list(all = res, table = res[padjust <= p_adj_cutoff & - abs(zScore) > z_score_cutoff])) - } - -#' Function to get the Biological Coefficient Variation of an OutriderDataSet -#' object. -#' `get_bcv` extracts the biological coefficient variation of an outrider -#' dataset. -#' @inheritParams get_ods_results -#' @returns A data table containing the biological coefficient variation -#' of the outrider dataset before and after normalisation. -#' @export - -get_bcv <- function(ods) { - before <- data.table::data.table(when = "Before", - BCV = 1/sqrt(.estimateThetaWithoutAutoCorrect(ods))) - after <- data.table::data.table(when = "After", - BCV = 1/sqrt(OUTRIDER::theta(ods))) - bcv_dt <- rbind(before, after) - return(bcv_dt) -} - -#' Function to merge bam stats. -#' `merge_bam_stats` Merges the bam stat files contained in the specified path. -#' @inheritParams main_abgenes_Hunter -#' @returns A list containing two merged counts matrices, one for local and -#' one for xternal counts, and a merged bam stats table. -#' @export - -merge_bam_stats <- function(ods, stats_path) { - stats <- list.files(stats_path, pattern=".txt", full.names = TRUE) - bam_coverage <- lapply(stats, read.table) - bam_coverage <- do.call(rbind, bam_coverage) - colnames(bam_coverage) <- c("sampleID", "record_count") - merged_bam_stats <- .extract_coverage_info(ods = ods, - bam_coverage = bam_coverage) - return(merged_bam_stats) -} - -#' Function to convert aberrant expression results into a format that allows -#' a heatmap report of aberrants found. -#' `format_for_report` Takes an OUTRIDER results table and reformats it, -#' returning only information pertaining genes aberrantly expressed in at least -#' one sample, and only samples containing at least one aberrantly expressed -#' gene. It keeps p-value and z-score information. -#' @inheritParams main_abgenes_Hunter -#' @returns A table containing the merged bam stats. -#' @export - -format_for_report <- function(results, z_score_cutoff, p_adj_cutoff) { - data <- .processed_vs_imported(results) - aberrants <- lapply(data, get_aberrants, z_score_cutoff, p_adj_cutoff) - formatted <- lapply(aberrants, format_aberrants) - return(formatted) -} - -#' Function to split a string by a certain character and return only the desired -#' element. -#' `.split_string_by_char` Splits a string by the provided character, and -#' keeps the result specified by the provided index. -#' @param string String to split. -#' @param char Char by which the string will be split. -#' @param index Index of the element after splitting that will be kept -#' @returns The element of the split string specified by provided index. - -.split_string_by_char <- function(string, char, index) { - if (!char %in% strsplit(string, "")[[1]]) { - warning("WARNING: Attempted to split string by character not present") - } - split <- strsplit(x = string, split = char, fixed = TRUE)[[1]] - if(index > length(split) || index < 1) { - warning("WARNING: Attempted to split string by out-of-bounds index.") - } - return(split[index]) -} - -#' Extract correlation between samples according to gene counts. -#' `get_counts_correlation` Calculates the correlation between all samples of an -#' ods object according to gene count levels. -#' @inheritParams .estimateThetaWithoutAutoCorrect -#' @param normalized A boolean. -#' * `TRUE` : raw counts. -#' * `FALSE`: normalized counts. -#' @returns The correlation data frame, calculated with the spearman method. -#' It constains a column of clusters (calculated with euclidean distance) and a -#' row of EXTERNAL metadata ('yes' for imported counts and 'no' for locally -#' processed counts). -#' @export -get_counts_correlation <- function(ods, normalized) { - counts <- OUTRIDER::counts(ods, normalized = normalized) - fc_matrix <- as.matrix(log2(counts + 1)) - counts_corr <- cor(fc_matrix, method = "spearman") - clust_annotation <- .get_clusters_df(ods = ods, matrix = counts_corr, - groups = "EXTERNAL", nClust = 4) - external_row <- clust_annotation["EXTERNAL"] - external_row <- t(rbind(NA, external_row)) - colnames(external_row)[1] <- "nClust" - res <- cbind(clust_annotation["nClust"], counts_corr) - res <- rbind(external_row, res) - rownames(res)[1] <- "EXTERNAL" - return(res) -} - -#' Clusterize correlation data. -#' `.get_clusters_df` Clusterizes a count correlation matrix, adding metadata -#' specified in the ods object from which it was calculated. -#' @inheritParams .estimateThetaWithoutAutoCorrect -#' @param matrix log2 counts correlation matrix. -#' @param groups A string. Column of ods colData to use as secondary grouping -#' atribute. Default: "EXTERNAL". -#' @param nClust Number of clusters in which data should be divided. -#' @returns A data frame. Rows are named after samples. Columns are nClust -#' (cluster where sample belongs) and specified groups column. - -.get_clusters_df <- function(ods, matrix, groups = character(), - nClust = 4, type = "sample") { - if(type == "gene") { - data <- SummarizedExperiment::rowData(ods) - } else { - data <- SummarizedExperiment::colData(ods) - } - ans <- as.data.frame(data[, groups]) - colnames(ans) <- groups - clusters <- cutree(hclust(dist(matrix)), nClust) - ans[["nClust"]] <- as.character(clusters) - rownames(ans) <- rownames(data) - return(ans) -} - -#' Get gene counts correlations across all samples of an ods object. -#' samples from an ods object. Genes are then clusterized. -#' `get_gene_sample_correlations` Extracts the correlation between counts across -#' all samples and clusterizes it. -#' @inheritParams .estimateThetaWithoutAutoCorrect -#' @param normalized A boolean. -#' * `TRUE` : raw counts. -#' * `FALSE`: normalized counts. -#' @returns A data frame containing the correlations between the log2 of counts -#' calculated with the spearman method. It constains a column of clusters -#' (calculated with euclidean distance) and a row of EXTERNAL metadata ('yes' -#' for imported counts and 'no' for locally processed counts). -#' @export - -get_gene_sample_correlations <- function(ods, normalized = TRUE, nGenes = 500, - rowCentered = TRUE, bcvQuantile = 0.9) { - bcv <- 1/sqrt(OUTRIDER::theta(ods)) - SummarizedExperiment::rowData(ods)$BCV <- bcv - ods_sub <- ods[!is.na(bcv) & bcv > stats::quantile(bcv, probs=bcvQuantile, - na.rm=TRUE),] - if(!is.null(nGenes)){ - ods_sub <- ods_sub[BiocGenerics::rank( - SummarizedExperiment::rowData(ods_sub)$BCV) <= nGenes,] - } - if(normalized){ - fc_mat <- as.matrix(log2(OUTRIDER::counts( - ods_sub, normalized=TRUE) + 1)) - } else { - fc_mat <- as.matrix(log2(OUTRIDER::counts( - ods_sub, normalized=FALSE) + 1)) - fc_mat <- t(t(fc_mat)/ - DESeq2::estimateSizeFactorsForMatrix(fc_mat)) - } - if(rowCentered) { - fc_mat <- fc_mat - rowMeans(fc_mat) - } - cluster_col <- .get_clusters_df(ods = ods_sub, matrix = fc_mat, - nClust = 4, type = "gene") - external_row <- .get_clusters_df(ods = ods_sub, matrix = t(fc_mat), - groups = "EXTERNAL", nClust = 4)["EXTERNAL"] - external_row <- t(rbind(NA, external_row)) - colnames(external_row)[1] <- "nClust" - res <- cbind(cluster_col, fc_mat) - res <- rbind(external_row, res) - return(res) -} - -.extract_coverage_info <- function(bam_coverage, ods) { - cnts_mtx <- OUTRIDER::counts(ods, normalized = F) - local_columns <- SummarizedExperiment::colData(ods)$EXTERNAL == "no" - cnts_mtx_local <- cnts_mtx[, local_columns] - rownames(bam_coverage) <- bam_coverage$sampleID - coverage_df <- data.frame(sampleID = colnames(ods), - read_count = colSums(cnts_mtx)) - coverage_df <- merge(bam_coverage, coverage_df, by = "sampleID", sort = FALSE) - coverage_dt <- data.table::data.table(coverage_df) - data.table::setorder(coverage_dt, read_count) - coverage_dt[, count_rank := .I] - coverage_dt[, counted_frac := read_count/record_count] - data.table::setorder(coverage_dt, counted_frac) - coverage_dt[, frac_rank := .I] - size_factors <- OUTRIDER::sizeFactors(ods) - locals <- names(size_factors) %in% colnames(cnts_mtx_local) - local_size_factors <- size_factors[locals] - coverage_dt[, size_factors := local_size_factors] - data.table::setorder(coverage_dt, size_factors) - coverage_dt[, sf_rank := 1:.N] - coverage_df <- as.data.frame(coverage_dt) - return(list(coverage_df = coverage_df, counts_matrix = cnts_mtx, - local_counts_matrix = cnts_mtx_local, size_factors = size_factors, - local_size_factors = local_size_factors)) -} - -filter_matrix <- function(ods, cnts_mtx, cnts_mtx_local, has_external) { - if(has_external){ - filter_mtx <- list( - local = cnts_mtx_local, - all = cnts_mtx, - `passed FPKM` = cnts_mtx[SummarizedExperiment::rowData(ods)$passedFilter,], - `min 1 read` = cnts_mtx[MatrixGenerics::rowQuantiles(cnts_mtx, probs = 0.95) > 1, ], - `min 10 reads` = cnts_mtx[MatrixGenerics::rowQuantiles(cnts_mtx, probs = 0.95) > 10, ] - ) - filter_dt <- lapply(names(filter_mtx), function(filter_name) { - mtx <- filter_mtx[[filter_name]] - data.table::data.table(gene_ID = rownames(mtx), median_counts = rowMeans(mtx), filter = filter_name) - }) |> data.table::rbindlist() - filter_dt[, filter := factor(filter, levels = c('local', 'all', 'passed FPKM', 'min 1 read', 'min 10 reads'))] - } else { - filter_mtx <- list( - all = cnts_mtx, - `passed FPKM` = cnts_mtx[SummarizedExperiment::rowData(ods)$passedFilter,], - `min 1 read` = cnts_mtx[MatrixGenerics::rowQuantiles(cnts_mtx, probs = 0.95) > 1, ], - `min 10 reads` = cnts_mtx[MatrixGenerics::rowQuantiles(cnts_mtx, probs = 0.95) > 10, ] - ) - filter_dt <- lapply(names(filter_mtx), function(filter_name) { - mtx <- filter_mtx[[filter_name]] - data.table::data.table(gene_ID = rownames(mtx), median_counts = rowMeans(mtx), filter = filter_name) - }) |> data.table::rbindlist() - filter_dt[, filter := factor(filter, levels = c('all', 'passed FPKM', 'min 1 read', 'min 10 reads'))] - } - return(as.data.frame(filter_dt)) -} - - -# Data manipulation extracted from OUTRIDER::plotExpressedGenes, WITHOUT -# the plotting part. -get_expressed_genes <- function(ods) { - exp_genes_cols <- c(sampleID = "sampleID", Rank = "expressedGenesRank", - Expressed = "expressedGenes", - Union = "unionExpressedGenes", - Intersection = "intersectionExpressedGenes", - Passed = "passedFilterGenes") - col_data <- SummarizedExperiment::colData(ods) - if(!all(exp_genes_cols %in% names(col_data))) { - stop("Compute expressed genes first by executing \n\tods <- ", - "filterExpression(ods, addExpressedGenes=TRUE)") - } - dt <- data.table::as.data.table(col_data[, exp_genes_cols]) - colnames(dt) <- names(exp_genes_cols) - df <- as.data.frame(dt) - new_col <- data.frame(Sample = rownames(df)) - new_df <- cbind(new_col, df) - rownames(new_df) <- NULL - return(df[order(df$Rank), ]) -} diff --git a/R/main_abgenes_Hunter.R b/R/main_abgenes_Hunter.R deleted file mode 100644 index 18410c4..0000000 --- a/R/main_abgenes_Hunter.R +++ /dev/null @@ -1,148 +0,0 @@ -#' Main Aberrant Expression function -#' -#' `main_abgenes_Hunter` runs DROP OUTRIDER analysis. -#' @param sample_annotation Path to sample annotation table. See documentation. -#' @param anno_database TxDb annotation database created by preprocess_gtf -#' function. -#' @param count_ranges Count ranges file generated by preprocess_gtf function. -#' @param gene_mapping_file Gene mapping file generated by preprocess_gtf -#' function. -#' @param count_files Count tables to process. Can be in rds or table format. -#' @param dataset Name of the dataset. -#' @param cpu Amount of CPUs available for analysis. Default 1. -#' @param fpkm_cutoff Min FPKM value for read filtering. Default 3. -#' @param implementation Method for sample covariation removal in OUTRIDER. -#' Possible values: "autoencoder" (default), "pca" or "peer". -#' @param z_score_cutoff Cutoff that gene zScore must surpass in order to be -#' considered aberrantly expressed. -#' @param p_adj_cutoff Adjusted p-value cutoff. Gene adjusted p-value must be -#' lower than this number in order to be considered aberrantly expressed. -#' @param max_dim_proportion Maximum value for autoencoder encoding -#' dimension. Default 3, optimum for aberrant expression. -#' @param hpo_file File containing HPO terms associated to genes. Optional. -#' @param stats_path Bam stats directories of each sample. Stats generated -#' with samtools idxstats (see Aberrant_Expression workflow for how to generate -#' this file). -#' @param top_N Top N genes by adjusted p-value to select for OUTRIDER -#' overview. Default 10. -#' @return Aberrant expression object containing all analysis results. -#' @importFrom data.table fread -#' @importFrom dplyr left_join -#' @importFrom utils write.table -#' @importFrom SummarizedExperiment SummarizedExperiment colData rowData -#' rowRanges mcols -#' @importFrom OUTRIDER OutriderDataSet filterExpression estimateSizeFactors -#' findEncodingDim OUTRIDER results plotEncDimSearch plotAberrantPerSample -#' plotCountCorHeatmap plotCountGeneSampleHeatmap -#' @importFrom AnnotationDbi loadDb -#' @importFrom S4Vectors endoapply -#' @importFrom ggplot2 labs scale_color_brewer aes geom_boxplot theme_bw -#' geom_point ylim geom_histogram scale_x_log10 facet_wrap guides guide_legend -#' theme element_rect -#' @importFrom cowplot theme_cowplot background_grid plot_grid -#' @importFrom MatrixGenerics rowQuantiles -#' @export - -main_abgenes_Hunter <- function(sample_annotation = NULL, anno_database = NULL, - count_ranges = NULL, gene_mapping_file = NULL, - count_files = NULL, dataset = NULL, cpu = 1, - fpkm_cutoff = 1, implementation = "autoencoder", - max_dim_proportion = 3, p_adj_cutoff = 0.05, - z_score_cutoff = 0.2, hpo_file = NULL, - stats_path = NULL, top_N = 10) { - sample_anno <- data.table::fread(sample_annotation, - colClasses = c(RNA_ID = 'character', - DNA_ID = 'character')) - has_external <- any(sample_anno$EXTERNAL == "external") - txdb <- AnnotationDbi::loadDb(anno_database) - counts <- merge_counts(cpu = cpu, sample_anno = sample_anno, - count_files = count_files, - count_ranges = count_ranges) - ods_unfitted <- filter_counts(counts = counts, txdb = txdb, - fpkm_cutoff = fpkm_cutoff) - ods <- run_outrider(ods_unfitted = ods_unfitted, - implementation = implementation, - max_dim_proportion = max_dim_proportion) - outrider_results <- get_ods_results(ods = ods, p_adj_cutoff = p_adj_cutoff, - z_score_cutoff = z_score_cutoff, - gene_mapping_file = gene_mapping_file, - sample_anno = sample_anno) - bcv_dt <- get_bcv(ods) - merged_bam_stats <- merge_bam_stats(ods = ods, stats_path = stats_path) - formatted <- format_for_report(outrider_results$all, - z_score_cutoff, p_adj_cutoff) - raw_sample_cors <- get_counts_correlation(ods, FALSE) - norm_sample_cors <- get_counts_correlation(ods, TRUE) - raw_gene_cors <- get_gene_sample_correlations(ods, FALSE) - norm_gene_cors <- get_gene_sample_correlations(ods, TRUE) - filter_df <- filter_matrix(cnts_mtx = merged_bam_stats$counts_matrix, - cnts_mtx_local = merged_bam_stats$local_counts_matrix, - has_external = has_external, ods = ods) - expressed_genes <- get_expressed_genes(ods) - final_results <- list() - final_results$counts <- counts - final_results$ods_unfitted <- ods_unfitted - final_results$ods <- ods - final_results$outrider_res_all <- outrider_results$all - final_results$outrider_res_table <- outrider_results$table - final_results$coverage_df <- merged_bam_stats$coverage_df - final_results$formatted <- formatted - final_results$bcv_dt <- bcv_dt - final_results$raw_sample_cors <- raw_sample_cors - final_results$norm_sample_cors <- norm_sample_cors - final_results$raw_gene_cors <- raw_gene_cors - final_results$norm_gene_cors <- norm_gene_cors - final_results$filter_df <- filter_df - final_results$expressed_genes <- expressed_genes - return(final_results) -} - -write_abgenes_results <- function(final_results, output_dir) { - return("WIP") -} - -write_abgenes_report <- function(final_results, output_dir = getwd(), - template_folder = NULL, source_folder = "none", - p_adj_cutoff = 0.05, z_score_cutoff = 3, - top_N = 10){ - if(is.null(template_folder)) { - stop("No template folder was provided.") - } - if(!file.exists(source_folder)) { - stop(paste0("Source folder not found. Was ", source_folder)) - } - if(any(is.null(final_results))) { - stop("ERROR: final_results object contains NULL fields. Analysis - is not complete.") - } - template <- file.path(template_folder, "abgenes_template.txt") - tmp_folder <- "tmp_lib" - out_file <- paste0(output_dir, "/abgenes_report.html") - container <- list(counts = final_results$counts, ods = final_results$ods, - ods_unfitted = final_results$ods_unfitted, - outrider_res_all = final_results$outrider_res_all, - outrider_res_table = final_results$outrider_res_table, - coverage_df = final_results$coverage_df, - counts_matrix = final_results$counts_matrix, - local_counts_matrix = final_results$local_counts_matrix, - formatted = final_results$formatted, - bcv_dt = final_results$bcv_dt, - p_adj_cutoff = p_adj_cutoff, - z_score_cutoff = z_score_cutoff, - top_N = top_N, - raw_sample_cors = final_results$raw_sample_cors, - norm_sample_cors = final_results$norm_sample_cors, - raw_gene_cors = final_results$raw_gene_cors, - norm_gene_cors = final_results$norm_gene_cors, - filter_df = final_results$filter_df, - expressed_genes = final_results$expressed_genes) - plotter <- htmlreportR:::htmlReport$new(title_doc = "Aberrant Expression - report", - container = container, - tmp_folder = tmp_folder, - src = source_folder, - compress_obj = TRUE, - type_index = "menu") - plotter$build(template) - plotter$write_report(out_file) -} diff --git a/R/main_degenes_Hunter.R b/R/main_degenes_Hunter.R index e4c4e66..ed20101 100644 --- a/R/main_degenes_Hunter.R +++ b/R/main_degenes_Hunter.R @@ -17,6 +17,7 @@ #' @param minpack_common minimum of pack that must be significant to tag #' a gene as significant #' @param model_variables custom model +#' @param pseudocounts boolean, activate if the input contains pseudocounts #' @param numerics_as_factors transform numeric values to factors. Default: TRUE #' @param string_factors string factors for WGCNA #' @param numeric_factors numeric factors for WGCNA @@ -46,6 +47,7 @@ main_degenes_Hunter <- function( raw = NULL, + pseudocounts = FALSE, target = NULL, count_var_quantile = 0, external_DEA_data = NULL, @@ -78,7 +80,7 @@ main_degenes_Hunter <- function( library_sizes = NULL ){ modified_input_args <- check_input_main_degenes_Hunter(raw, - minlibraries, reads, external_DEA_data, modules, model_variables, + minlibraries, pseudocounts,reads, external_DEA_data, modules, model_variables, active_modules, WGCNA_all, minpack_common, target, string_factors, numeric_factors, multifactorial) modules <- modified_input_args[['modules']] @@ -119,6 +121,7 @@ main_degenes_Hunter <- function( if (sum(numeric_factors == "") >= 1) numeric_factors <- NULL #esto esta para controlar que no haya elementos vacios string_factors <- split_str(string_factors, ",") + string_factors <- c("treat", string_factors) @@ -238,14 +241,16 @@ main_degenes_Hunter <- function( #computing PCA for PREVALENT DEG prevalent_degs <- rownames(DE_all_genes[DE_all_genes$genes_tag == "PREVALENT_DEG",]) - pca_deg_data <- default_norm$default - pca_deg_data <- pca_deg_data[rownames(pca_deg_data) %in% prevalent_degs,] - - PCA_res[["DEGs"]] <- compute_pca(pca_data = pca_deg_data, - target = target, - string_factors = string_factors, - numeric_factors = numeric_factors) + if (length(prevalent_degs) > 1) { + pca_deg_data <- default_norm$default + pca_deg_data <- pca_deg_data[rownames(pca_deg_data) %in% prevalent_degs,] + DEG_pca <- compute_pca(pca_data = pca_deg_data, + target = target, + string_factors = string_factors, + numeric_factors = numeric_factors) + } } + PCA_res[["DEGs"]] <- DEG_pca if(grepl("W", modules)) { # Check WGCNA was run and returned proper results DE_all_genes <- merge(by.x=0, by.y="ENSEMBL_ID", x= DE_all_genes, @@ -324,6 +329,7 @@ main_degenes_Hunter <- function( check_input_main_degenes_Hunter <- function(raw, minlibraries, + pseudocounts = FALSE, reads, external_DEA_data, modules, @@ -335,6 +341,10 @@ check_input_main_degenes_Hunter <- function(raw, string_factors, numeric_factors, multifactorial){ + + if (pseudocounts) { + raw <- round(raw) + } if (minlibraries < 1){ stop(cat(paste0("Minimum library number to check minimum read counts", " cannot be less than 1.\nIf you want to avoid filtering, set", diff --git a/inst/extData/testdata/gencode.v45.toy.annotation.gtf b/inst/extData/testdata/gencode.v45.toy.annotation.gtf deleted file mode 100644 index e199a44..0000000 --- a/inst/extData/testdata/gencode.v45.toy.annotation.gtf +++ /dev/null @@ -1,50 +0,0 @@ -##description: evidence-based annotation of the human genome (GRCh38), version 45 (Ensembl 111) -##provider: GENCODE -##contact: gencode-help@ebi.ac.uk -##format: gtf -##date: 2023-09-19 -chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000290825.1"; gene_type "lncRNA"; gene_name "DDX11L2"; level 2; tag "overlaps_pseudogene"; -chr1 HAVANA transcript 11869 14409 . + . gene_id "ENSG00000290825.1"; transcript_id "ENST00000456328.2"; gene_type "lncRNA"; gene_name "DDX11L2"; transcript_type "lncRNA"; transcript_name "DDX11L2-202"; level 2; transcript_support_level "1"; tag "basic"; tag "Ensembl_canonical"; havana_transcript "OTTHUMT00000362751.1"; -chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000290825.1"; transcript_id "ENST00000456328.2"; gene_type "lncRNA"; gene_name "DDX11L2"; transcript_type "lncRNA"; transcript_name "DDX11L2-202"; exon_number 1; exon_id "ENSE00002234944.1"; level 2; transcript_support_level "1"; tag "basic"; tag "Ensembl_canonical"; havana_transcript "OTTHUMT00000362751.1"; -chr1 HAVANA exon 12613 12721 . + . gene_id "ENSG00000290825.1"; transcript_id "ENST00000456328.2"; gene_type "lncRNA"; gene_name "DDX11L2"; transcript_type "lncRNA"; transcript_name "DDX11L2-202"; exon_number 2; exon_id "ENSE00003582793.1"; level 2; transcript_support_level "1"; tag "basic"; tag "Ensembl_canonical"; havana_transcript "OTTHUMT00000362751.1"; -chr1 HAVANA exon 13221 14409 . + . gene_id "ENSG00000290825.1"; transcript_id "ENST00000456328.2"; gene_type "lncRNA"; gene_name "DDX11L2"; transcript_type "lncRNA"; transcript_name "DDX11L2-202"; exon_number 3; exon_id "ENSE00002312635.1"; level 2; transcript_support_level "1"; tag "basic"; tag "Ensembl_canonical"; havana_transcript "OTTHUMT00000362751.1"; -chr1 HAVANA gene 12010 13670 . + . gene_id "ENSG00000223972.6"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_gene "OTTHUMG00000000961.2"; -chr1 HAVANA transcript 12010 13670 . + . gene_id "ENSG00000223972.6"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:37102"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; -chr1 HAVANA exon 12010 12057 . + . gene_id "ENSG00000223972.6"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; exon_number 1; exon_id "ENSE00001948541.1"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:37102"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; -chr1 HAVANA exon 12179 12227 . + . gene_id "ENSG00000223972.6"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; exon_number 2; exon_id "ENSE00001671638.2"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:37102"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; -chr1 HAVANA exon 12613 12697 . + . gene_id "ENSG00000223972.6"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; exon_number 3; exon_id "ENSE00001758273.2"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:37102"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; -chr1 HAVANA exon 12975 13052 . + . gene_id "ENSG00000223972.6"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; exon_number 4; exon_id "ENSE00001799933.2"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:37102"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; -chr1 HAVANA exon 13221 13374 . + . gene_id "ENSG00000223972.6"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; exon_number 5; exon_id "ENSE00001746346.2"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:37102"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; -chr1 HAVANA exon 13453 13670 . + . gene_id "ENSG00000223972.6"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; exon_number 6; exon_id "ENSE00001863096.1"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:37102"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; -chr1 HAVANA gene 14696 24886 . - . gene_id "ENSG00000227232.6"; gene_type "unprocessed_pseudogene"; gene_name "WASH7P"; level 2; hgnc_id "HGNC:38034"; havana_gene "OTTHUMG00000000958.1"; -chr1 HAVANA transcript 14696 24886 . - . gene_id "ENSG00000227232.6"; transcript_id "ENST00000488147.2"; gene_type "unprocessed_pseudogene"; gene_name "WASH7P"; transcript_type "unprocessed_pseudogene"; transcript_name "WASH7P-201"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:38034"; ont "PGO:0000005"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000958.1"; havana_transcript "OTTHUMT00000002839.1"; -chr1 HAVANA exon 24738 24886 . - . gene_id "ENSG00000227232.6"; transcript_id "ENST00000488147.2"; gene_type "unprocessed_pseudogene"; gene_name "WASH7P"; transcript_type "unprocessed_pseudogene"; transcript_name "WASH7P-201"; exon_number 1; exon_id "ENSE00003507205.2"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:38034"; ont "PGO:0000005"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000958.1"; havana_transcript "OTTHUMT00000002839.1"; -chr1 HAVANA exon 18268 18366 . - . gene_id "ENSG00000227232.6"; transcript_id "ENST00000488147.2"; gene_type "unprocessed_pseudogene"; gene_name "WASH7P"; transcript_type "unprocessed_pseudogene"; transcript_name "WASH7P-201"; exon_number 2; exon_id "ENSE00003477500.1"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:38034"; ont "PGO:0000005"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000958.1"; havana_transcript "OTTHUMT00000002839.1"; -chr1 HAVANA exon 17915 18061 . - . gene_id "ENSG00000227232.6"; transcript_id "ENST00000488147.2"; gene_type "unprocessed_pseudogene"; gene_name "WASH7P"; transcript_type "unprocessed_pseudogene"; transcript_name "WASH7P-201"; exon_number 3; exon_id "ENSE00003565697.1"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:38034"; ont "PGO:0000005"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000958.1"; havana_transcript "OTTHUMT00000002839.1"; -chr1 HAVANA exon 17606 17742 . - . gene_id "ENSG00000227232.6"; transcript_id "ENST00000488147.2"; gene_type "unprocessed_pseudogene"; gene_name "WASH7P"; transcript_type "unprocessed_pseudogene"; transcript_name "WASH7P-201"; exon_number 4; exon_id "ENSE00003475637.1"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:38034"; ont "PGO:0000005"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000958.1"; havana_transcript "OTTHUMT00000002839.1"; -chr1 HAVANA exon 17233 17368 . - . gene_id "ENSG00000227232.6"; transcript_id "ENST00000488147.2"; gene_type "unprocessed_pseudogene"; gene_name "WASH7P"; transcript_type "unprocessed_pseudogene"; transcript_name "WASH7P-201"; exon_number 5; exon_id "ENSE00003502542.1"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:38034"; ont "PGO:0000005"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000958.1"; havana_transcript "OTTHUMT00000002839.1"; -chr1 HAVANA exon 16858 17055 . - . gene_id "ENSG00000227232.6"; transcript_id "ENST00000488147.2"; gene_type "unprocessed_pseudogene"; gene_name "WASH7P"; transcript_type "unprocessed_pseudogene"; transcript_name "WASH7P-201"; exon_number 6; exon_id "ENSE00003553898.1"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:38034"; ont "PGO:0000005"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000958.1"; havana_transcript "OTTHUMT00000002839.1"; -chr1 HAVANA exon 16607 16765 . - . gene_id "ENSG00000227232.6"; transcript_id "ENST00000488147.2"; gene_type "unprocessed_pseudogene"; gene_name "WASH7P"; transcript_type "unprocessed_pseudogene"; transcript_name "WASH7P-201"; exon_number 7; exon_id "ENSE00003621279.1"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:38034"; ont "PGO:0000005"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000958.1"; havana_transcript "OTTHUMT00000002839.1"; -chr1 HAVANA exon 15796 15947 . - . gene_id "ENSG00000227232.6"; transcript_id "ENST00000488147.2"; gene_type "unprocessed_pseudogene"; gene_name "WASH7P"; transcript_type "unprocessed_pseudogene"; transcript_name "WASH7P-201"; exon_number 8; exon_id "ENSE00002030414.1"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:38034"; ont "PGO:0000005"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000958.1"; havana_transcript "OTTHUMT00000002839.1"; -chr1 HAVANA exon 14970 15038 . - . gene_id "ENSG00000227232.6"; transcript_id "ENST00000488147.2"; gene_type "unprocessed_pseudogene"; gene_name "WASH7P"; transcript_type "unprocessed_pseudogene"; transcript_name "WASH7P-201"; exon_number 9; exon_id "ENSE00001935574.2"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:38034"; ont "PGO:0000005"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000958.1"; havana_transcript "OTTHUMT00000002839.1"; -chr1 HAVANA exon 14696 14829 . - . gene_id "ENSG00000227232.6"; transcript_id "ENST00000488147.2"; gene_type "unprocessed_pseudogene"; gene_name "WASH7P"; transcript_type "unprocessed_pseudogene"; transcript_name "WASH7P-201"; exon_number 10; exon_id "ENSE00004020410.1"; level 2; transcript_support_level "NA"; hgnc_id "HGNC:38034"; ont "PGO:0000005"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000958.1"; havana_transcript "OTTHUMT00000002839.1"; -chr1 ENSEMBL gene 17369 17436 . - . gene_id "ENSG00000278267.1"; gene_type "miRNA"; gene_name "MIR6859-1"; level 3; hgnc_id "HGNC:50039"; -chr1 ENSEMBL transcript 17369 17436 . - . gene_id "ENSG00000278267.1"; transcript_id "ENST00000619216.1"; gene_type "miRNA"; gene_name "MIR6859-1"; transcript_type "miRNA"; transcript_name "MIR6859-1-201"; level 3; transcript_support_level "NA"; hgnc_id "HGNC:50039"; tag "basic"; tag "Ensembl_canonical"; -chr1 ENSEMBL exon 17369 17436 . - . gene_id "ENSG00000278267.1"; transcript_id "ENST00000619216.1"; gene_type "miRNA"; gene_name "MIR6859-1"; transcript_type "miRNA"; transcript_name "MIR6859-1-201"; exon_number 1; exon_id "ENSE00003746039.1"; level 3; transcript_support_level "NA"; hgnc_id "HGNC:50039"; tag "basic"; tag "Ensembl_canonical"; -chr1 HAVANA gene 29554 31109 . + . gene_id "ENSG00000243485.5"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; level 2; hgnc_id "HGNC:52482"; tag "ncRNA_host"; havana_gene "OTTHUMG00000000959.2"; -chr1 HAVANA transcript 29554 31097 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000473358.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-202"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "dotter_confirmed"; tag "not_best_in_genome_evidence"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1"; -chr1 HAVANA exon 29554 30039 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000473358.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-202"; exon_number 1; exon_id "ENSE00001947070.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "dotter_confirmed"; tag "not_best_in_genome_evidence"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1"; -chr1 HAVANA exon 30564 30667 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000473358.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-202"; exon_number 2; exon_id "ENSE00001922571.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "dotter_confirmed"; tag "not_best_in_genome_evidence"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1"; -chr1 HAVANA exon 30976 31097 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000473358.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-202"; exon_number 3; exon_id "ENSE00001827679.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "dotter_confirmed"; tag "not_best_in_genome_evidence"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1"; -chr1 HAVANA transcript 30267 31109 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000469289.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-201"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; -chr1 HAVANA exon 30267 30667 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000469289.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-201"; exon_number 1; exon_id "ENSE00001841699.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; -chr1 HAVANA exon 30976 31109 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000469289.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-201"; exon_number 2; exon_id "ENSE00001890064.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; -chr1 ENSEMBL gene 30366 30503 . + . gene_id "ENSG00000284332.1"; gene_type "miRNA"; gene_name "MIR1302-2"; level 3; hgnc_id "HGNC:35294"; -chr1 ENSEMBL transcript 30366 30503 . + . gene_id "ENSG00000284332.1"; transcript_id "ENST00000607096.1"; gene_type "miRNA"; gene_name "MIR1302-2"; transcript_type "miRNA"; transcript_name "MIR1302-2-201"; level 3; transcript_support_level "NA"; hgnc_id "HGNC:35294"; tag "basic"; tag "Ensembl_canonical"; -chr1 ENSEMBL exon 30366 30503 . + . gene_id "ENSG00000284332.1"; transcript_id "ENST00000607096.1"; gene_type "miRNA"; gene_name "MIR1302-2"; transcript_type "miRNA"; transcript_name "MIR1302-2-201"; exon_number 1; exon_id "ENSE00003695741.1"; level 3; transcript_support_level "NA"; hgnc_id "HGNC:35294"; tag "basic"; tag "Ensembl_canonical"; -chr1 HAVANA gene 34554 36081 . - . gene_id "ENSG00000237613.2"; gene_type "lncRNA"; gene_name "FAM138A"; level 2; hgnc_id "HGNC:32334"; havana_gene "OTTHUMG00000000960.1"; -chr1 HAVANA transcript 34554 36081 . - . gene_id "ENSG00000237613.2"; transcript_id "ENST00000417324.1"; gene_type "lncRNA"; gene_name "FAM138A"; transcript_type "lncRNA"; transcript_name "FAM138A-201"; level 2; transcript_support_level "1"; hgnc_id "HGNC:32334"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000960.1"; havana_transcript "OTTHUMT00000002842.1"; -chr1 HAVANA exon 35721 36081 . - . gene_id "ENSG00000237613.2"; transcript_id "ENST00000417324.1"; gene_type "lncRNA"; gene_name "FAM138A"; transcript_type "lncRNA"; transcript_name "FAM138A-201"; exon_number 1; exon_id "ENSE00001656588.1"; level 2; transcript_support_level "1"; hgnc_id "HGNC:32334"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000960.1"; havana_transcript "OTTHUMT00000002842.1"; -chr1 HAVANA exon 35277 35481 . - . gene_id "ENSG00000237613.2"; transcript_id "ENST00000417324.1"; gene_type "lncRNA"; gene_name "FAM138A"; transcript_type "lncRNA"; transcript_name "FAM138A-201"; exon_number 2; exon_id "ENSE00001669267.1"; level 2; transcript_support_level "1"; hgnc_id "HGNC:32334"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000960.1"; havana_transcript "OTTHUMT00000002842.1"; -chr1 HAVANA exon 34554 35174 . - . gene_id "ENSG00000237613.2"; transcript_id "ENST00000417324.1"; gene_type "lncRNA"; gene_name "FAM138A"; transcript_type "lncRNA"; transcript_name "FAM138A-201"; exon_number 3; exon_id "ENSE00001727627.1"; level 2; transcript_support_level "1"; hgnc_id "HGNC:32334"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000960.1"; havana_transcript "OTTHUMT00000002842.1"; -chr1 HAVANA transcript 35245 36073 . - . gene_id "ENSG00000237613.2"; transcript_id "ENST00000461467.1"; gene_type "lncRNA"; gene_name "FAM138A"; transcript_type "lncRNA"; transcript_name "FAM138A-202"; level 2; transcript_support_level "3"; hgnc_id "HGNC:32334"; havana_gene "OTTHUMG00000000960.1"; havana_transcript "OTTHUMT00000002843.1"; diff --git a/inst/scripts/DROP_Aberrant_Expression.R b/inst/scripts/DROP_Aberrant_Expression.R deleted file mode 100755 index 7f6a591..0000000 --- a/inst/scripts/DROP_Aberrant_Expression.R +++ /dev/null @@ -1,95 +0,0 @@ -#! /usr/bin/env Rscript - - -########################################## -## LOAD LIBRARIES -########################################## - -if( Sys.getenv('DEGHUNTER_MODE') == 'DEVELOPMENT' ){ - # Obtain this script directory - full.fpath <- normalizePath(unlist(strsplit(commandArgs()[grep('^--file=', - commandArgs())], '='))[2]) - main_path_script <- dirname(full.fpath) - root_path <- file.path(main_path_script, '..', '..') - # Load custom libraries - custom_libraries <- c('main_abgenes_Hunter.R', 'DROP_functions.R') - for (lib in custom_libraries){ - source(file.path(root_path, 'R', lib)) - } - template_folder <- file.path(root_path, 'inst', 'templates') -} else { - require('ExpHunterSuite') - root_path <- find.package('ExpHunterSuite') - template_folder <- file.path(root_path, 'templates') -} - -devtools::load_all('~/dev_R/htmlreportR') -source_folder <- file.path(find.package("htmlreportR"), "inst") - -########################################## -## OPTPARSE -########################################## - -option_list <- list( - optparse::make_option(c("-s", "--sample_annotation"), type="character", default=NULL, - help="Sample annotation table in tsv format."), - optparse::make_option(c("-a", "--anno_database"), type="character", default=NULL, - help="TxDb annotation database in db format."), - optparse::make_option(c("-r", "--count_ranges"), type="character", default=NULL, - help="Count ranges R object in rds format."), - optparse::make_option(c("-g", "--gene_mapping_file"), type="character", default=NULL, - help="Gene mapping file in tsv format."), - optparse::make_option(c("-c", "--count_files"), type="character", default=NULL, - help="Count tables, can be rds objects or text files."), - optparse::make_option(c("-d", "--dataset"), type="character", default=NULL, - help="Dataset name."), - optparse::make_option(c("-n", "--cpu"), type="integer", default=NULL, - help="Number of CPUs provided to job."), - optparse::make_option(c("--fpkm_cutoff"), type="numeric", default=1, - help="FPKM cutoff to filter expressed genes."), - optparse::make_option(c("--implementation"), type="character", default="autoencoder", - help="Implementation for sample covariation removal."), - optparse::make_option(c("--max_tested_dimension_proportion"), type="numeric", default=3, - help="Quotient by which to divide the number of samples to calculate maximum value for encoding dimension."), - optparse::make_option(c("--p_adj_cutoff"), type="numeric", default=NULL, - help="adjusted P-value to use as cutoff to mark a gene as aberrantly expressed."), - optparse::make_option(c("--z_score_cutoff"), type="numeric", default=NULL, - help="z score value to use as cutoff to mark a gene as aberrantly expressed."), - optparse::make_option(c("-f", "--hpo_file"), type="character", default=NULL, - help="Genes and associated HPO terms in compressed tsv format."), - optparse::make_option(c("-b", "--stats_path"), type="character", default=NULL, - help="Path to directory containing bam stats."), - optparse::make_option(c("-t", "--top_N"), type="integer", default=10, - help="Top N genes by adjusted p-value to be selected."), - optparse::make_option(c("-o", "--report_dir"), type="character", default=getwd(), - help="Directory where report will be generated.") - ) - -opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list)) - -########################################## -## MAIN -########################################## - -BiocParallel::register(BiocParallel::MulticoreParam(opt$cpu)) - -final_results <- main_abgenes_Hunter( - sample_annotation = opt$sample_annotation, - anno_database = opt$anno_database, - count_ranges = opt$count_ranges, - gene_mapping_file = opt$gene_mapping_file, - count_files = opt$count_files, - dataset = opt$dataset, - cpu = opt$cpu, - fpkm_cutoff = opt$fpkm_cutoff, - implementation = opt$implementation, - max_dim_proportion = opt$max_tested_dimension_proportion, - p_adj_cutoff = opt$p_adj_cutoff, - z_score_cutoff = opt$z_score_cutoff, - hpo_file = opt$hpo_file, - stats_path = opt$stats_path, - top_N = opt$top_N) - -write_abgenes_report(final_results = final_results, template_folder = template_folder, output_dir = opt$report_dir, - source_folder = source_folder, p_adj_cutoff = opt$p_adj_cutoff, z_score_cutoff = opt$z_score_cutoff, - top_N = opt$top_N) diff --git a/inst/scripts/DROP_preprocess_GTF.R b/inst/scripts/DROP_preprocess_GTF.R deleted file mode 100755 index 76bed55..0000000 --- a/inst/scripts/DROP_preprocess_GTF.R +++ /dev/null @@ -1,50 +0,0 @@ -#! /usr/bin/env Rscript - -########################################## -## LOAD LIBRARIES -########################################## - -if( Sys.getenv('ABGHUNTER_MODE') == 'DEVELOPMENT' ){ - # Obtain this script directory - full.fpath <- normalizePath(unlist(strsplit(commandArgs()[grep('^--file=', - commandArgs())], '='))[2]) - main_path_script <- dirname(full.fpath) - root_path <- file.path(main_path_script, '..', '..') - # Load custom libraries - custom_libraries <- 'DROP_functions.R' - for (lib in custom_libraries){ - source(file.path(root_path, 'R', lib)) - } - template_folder <- file.path(root_path, 'inst', 'templates') -} else { - require('ExpHunterSuite') - root_path <- find.package('ExpHunterSuite') - template_folder <- file.path(root_path, 'templates') -} - -option_list <- list( - optparse::make_option(c("-g", "--gtf"), type="character", default=NULL, - help="GTF file for txdb."), - optparse::make_option(c("-o", "--output_dir"), type="character", default=NULL, - help="Path to output directory.") - ) -opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list)) - -preprocessing_results <- preprocess_gtf(opt$gtf) - -if(is.null(opt$output_dir)) { - ### I want this to point do inst/DROP_data, or root/DROP_data in installed package - output_dir <- system.file("DROP_data", package= "ExpHunterSuite") -} else { - output_dir <- opt$output_dir -} - -gtf_name <- basename(tools::file_path_sans_ext(opt$gtf)) -db_file <- file.path(outdir, paste0(gtf_name, "_txdb.db")) -cr_file <- file.path(outdir, paste0(gtf_name, "_count_ranges.rds")) -map_file <- file.path(outdir, paste0(gtf_name, "_gene_name_mapping.tsv")) - -AnnotationDbi::saveDb(preprocessing_results$txdb, db_file) -saveRDS(preprocessing_results$count_ranges, cr_file) -write.table(preprocessing_results$gene_name_mapping, file = map_file, sep = "\t", quote = FALSE, - row.names = FALSE) diff --git a/inst/templates/DROP_template.txt b/inst/templates/DROP_template.txt deleted file mode 100644 index cf19a58..0000000 --- a/inst/templates/DROP_template.txt +++ /dev/null @@ -1,86 +0,0 @@ - -