From 0c429a37eecab4ec6beff207ccfa91b913c564ed Mon Sep 17 00:00:00 2001 From: Christopher Barrington Date: Tue, 12 Oct 2021 13:23:47 +0100 Subject: [PATCH] added read_msigdb --- DESCRIPTION | 1 + NAMESPACE | 5 +++++ R/babs.R | 38 ++++++++++++++++++++++++++++++++++++++ man/read_msigdb.Rd | 31 +++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+) create mode 100644 man/read_msigdb.Rd diff --git a/DESCRIPTION b/DESCRIPTION index f1a9c1b..d25e495 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,6 +23,7 @@ Imports: biomaRt, datarepository, devtools, + fs, gtools, kableExtra, plyr, diff --git a/NAMESPACE b/NAMESPACE index e3e8f30..a49cc32 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,7 @@ export(open_x11) export(preferred_choice) export(print_object_size) export(read_dotbabs) +export(read_msigdb) export(resize_and_show) export(show_newpage_grid) export(wideScreen) @@ -42,10 +43,12 @@ importFrom(devtools,session_info) importFrom(dplyr,mutate) importFrom(dplyr,n) importFrom(dplyr,select) +importFrom(fs,path) importFrom(kableExtra,kable) importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(magrittr,extract) +importFrom(plyr,dlply) importFrom(plyr,llply) importFrom(purrr,discard) importFrom(purrr,keep) @@ -62,9 +65,11 @@ importFrom(stringr,str_pad) importFrom(stringr,str_remove) importFrom(stringr,str_replace_all) importFrom(stringr,str_split) +importFrom(stringr,str_subset) importFrom(stringr,str_to_lower) importFrom(stringr,str_to_upper) importFrom(tibble,deframe) importFrom(tibble,is_tibble) +importFrom(utils,count.fields) importFrom(utils,page) importFrom(yaml,read_yaml) diff --git a/R/babs.R b/R/babs.R index 8bad149..1efb61a 100644 --- a/R/babs.R +++ b/R/babs.R @@ -28,3 +28,41 @@ get_project_root <- function() unlist() %>% head(n=10) %>% str_c(collapse='/') + +#' Read files from my MSigDB cache +#' +#' Reads an MSigDB `gmt` file and returns a list of genes in pathways. 
+#'
+#' @param collection Name of the collection
+#' @param pathways Character vector of pathways in `collection` to keep
+#' @param version Release version
+#' @param identifier Identifier type: `symbols` or `entrez` (`entrez` may not exist!!)
+#' @param dbroot Path to the root of the MSigDB files
+#'
+#' @return
+#' A list of genes in pathways, with pathway name as the key and a character vector of gene identifiers (`identifier`).
+#'
+#' @importFrom fs path
+#' @importFrom dplyr select
+#' @importFrom plyr dlply
+#' @importFrom stringr str_c str_subset
+#' @importFrom utils count.fields
+#' @export
+read_msigdb <- function(collection='h.all', pathways=NULL, version='7.4', identifier='symbols', dbroot='/camp/stp/babs/working/barrinc/db/msigdb') {
+  # the file is expected at <dbroot>/<collection>.v<version>.<identifier>.gmt; fail early when it is absent
+  gmt <- path(dbroot, sprintf(fmt='%s.v%s.%s.gmt', collection, version, identifier))
+  if(!file.exists(gmt)) stop('MSigDB gmt file does not exist!', call.=FALSE)
+
+  # rows are ragged (pathway, url, then identifiers); size the table from the widest row, keeping at least one id column
+  n_ids <- max(3L, count.fields(gmt, sep='\t', quote='', comment.char='')) - 2L
+  # quoting, comment characters and type conversion are switched off so identifiers are returned exactly as written
+  gene_sets <- read.table(file=gmt, sep='\t', fill=TRUE, header=FALSE, quote='', comment.char='', colClasses='character',
+                          col.names=c('pathway', 'url', str_c('id.', seq_len(n_ids)))) %>%
+    dlply(~pathway, function(x)
+      select(x, starts_with('id.')) %>% unlist(use.names=FALSE) %>% na.omit() %>% str_subset('^$', negate=TRUE))
+
+  # unknown pathway names would otherwise come back silently as NULL entries; the result is unchanged but now reported
+  absent <- if(is.character(pathways)) setdiff(pathways, names(gene_sets))
+  if(length(absent) > 0) warning('Pathways not found in MSigDB gmt file: ', str_c(absent, collapse=', '), call.=FALSE)
+  if(is.null(pathways)) gene_sets[names(gene_sets)] else gene_sets[pathways]
+}
diff --git a/man/read_msigdb.Rd b/man/read_msigdb.Rd
new file mode 100644
index 0000000..41c4731
--- /dev/null
+++ b/man/read_msigdb.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/babs.R
+\name{read_msigdb}
+\alias{read_msigdb}
+\title{Read files from my MSigDB cache}
+\usage{
+read_msigdb(
+  collection = "h.all",
+  pathways = NULL,
+  version = "7.4",
+  identifier = "symbols",
+  dbroot = "/camp/stp/babs/working/barrinc/db/msigdb"
+)
+}
+\arguments{
+\item{collection}{Name of the collection}
+
+\item{pathways}{Character vector of pathways in \code{collection} to keep}
+
+\item{version}{Release version} + +\item{identifier}{Identifier type: \code{symbols} or \code{entrez} (\code{entrez} may not exist!!)} + +\item{dbroot}{Path to the root of the MSigDB files} +} +\value{ +A list of genes in pathways, with pathway name as the key and a character vector of gene identifiers (\code{identifier}). +} +\description{ +Reads an MSigDB \code{gmt} file and returns a list of genes in pathways. +}