Skip to content

Commit

Permalink
Gene2symbol (#39)
Browse files Browse the repository at this point in the history
* Added sorting for Sample names to avoid conflicts

* Clean

* Added rule to annotate ensemblID with gene symbol

* fmt

* added env

* Update deseq2-init.R

* more variable test data

* more variable test data

Co-authored-by: jafors <[email protected]>
  • Loading branch information
jafors and jafors authored Aug 24, 2021
1 parent 8706d31 commit dfd0b9c
Show file tree
Hide file tree
Showing 8 changed files with 103 additions and 9 deletions.
4 changes: 2 additions & 2 deletions .test/config/samples.tsv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
sample_name condition
A treated
B untreated
A1 treated
B1 untreated
A2 treated
B2 untreated
4 changes: 2 additions & 2 deletions .test/config/units.tsv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
sample_name unit_name fq1 fq2 sra adapters strandedness
A1 1 ngs-test-data/reads/a.scerevisiae.1.fq ngs-test-data/reads/a.scerevisiae.2.fq
B1 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq
A2 1 ngs-test-data/reads/a.scerevisiae.1.fq ngs-test-data/reads/a.scerevisiae.2.fq
B2 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq
A2 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq
B2 1 ngs-test-data/reads/b.scerevisiae.1.fq ngs-test-data/reads/b.scerevisiae.2.fq
8 changes: 4 additions & 4 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@ pca:
diffexp:
# contrasts for the deseq2 results method
contrasts:
treated-vs-untreated:
- treated
- untreated
A-vs-B:
- A
- B
model: ~condition

params:
cutadapt-pe: ""
cutadapt-se: ""
star: ""
star: "--outSAMtype BAM Unsorted"
6 changes: 6 additions & 0 deletions workflow/envs/biomart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- bioconda
- conda-forge
dependencies:
- bioconductor-biomart =2.46
- r-tidyverse =1.3
10 changes: 9 additions & 1 deletion workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@ samples = (

def get_final_output():
final_output = expand(
"results/diffexp/{contrast}.diffexp.tsv",
"results/diffexp/{contrast}.diffexp.symbol.tsv",
contrast=config["diffexp"]["contrasts"],
)
final_output.append("results/deseq2/normcounts.symbol.tsv")
final_output.append("results/counts/all.symbol.tsv")
return final_output


Expand Down Expand Up @@ -144,6 +146,12 @@ def is_activated(xpath):
return bool(c.get("activate", False))


def get_bioc_species_name():
first_letter = config["ref"]["species"][0]
subspecies = config["ref"]["species"].split("_")[1]
return first_letter + subspecies


def get_fastqs(wc):
if config["trimming"]["activate"]:
return expand(
Expand Down
15 changes: 15 additions & 0 deletions workflow/rules/diffexp.smk
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,21 @@ rule count_matrix:
"../scripts/count-matrix.py"


rule gene_2_symbol:
input:
counts="{prefix}.tsv",
output:
symbol="{prefix}.symbol.tsv",
params:
species=get_bioc_species_name(),
log:
"logs/gene2symbol/{prefix}.log",
conda:
"../envs/biomart.yaml"
script:
"../scripts/gene2symbol.R"


rule deseq2_init:
input:
counts="results/counts/all.tsv",
Expand Down
3 changes: 3 additions & 0 deletions workflow/scripts/deseq2-init.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ if (snakemake@threads > 1) {
# colData and countData must have the same sample order, but this is ensured
# by the way we create the count matrix
cts <- read.table(snakemake@input[["counts"]], header=TRUE, row.names="gene", check.names=FALSE)
cts <- cts[ , order(names(cts))]

coldata <- read.table(snakemake@params[["samples"]], header=TRUE, row.names="sample_name", check.names=FALSE)
coldata <- coldata[order(row.names(coldata)), , drop=F]

dds <- DESeqDataSetFromMatrix(countData=cts,
colData=coldata,
Expand Down
62 changes: 62 additions & 0 deletions workflow/scripts/gene2symbol.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
library(biomaRt)
library(tidyverse)

# this variable holds a mirror name until
# useEnsembl succeeds ("www" is last, because
# of very frequent "Internal Server Error"s)
mart <- "useast"
rounds <- 0
while ( class(mart)[[1]] != "Mart" ) {
mart <- tryCatch(
{
# done here, because error function does not
# modify outer scope variables, I tried
if (mart == "www") rounds <- rounds + 1
# equivalent to useMart, but you can choose
# the mirror instead of specifying a host
biomaRt::useEnsembl(
biomart = "ENSEMBL_MART_ENSEMBL",
dataset = str_c(snakemake@params[["species"]], "_gene_ensembl"),
mirror = mart
)
},
error = function(e) {
# change or make configurable if you want more or
# less rounds of tries of all the mirrors
if (rounds >= 3) {
stop(
)
}
# hop to next mirror
mart <- switch(mart,
useast = "uswest",
uswest = "asia",
asia = "www",
www = {
# wait before starting another round through the mirrors,
# hoping that intermittent problems disappear
Sys.sleep(30)
"useast"
}
)
}
)
}


df <- read.table(snakemake@input[["counts"]], sep='\t', header=1)

g2g <- biomaRt::getBM(
attributes = c( "ensembl_gene_id",
"external_gene_name"),
filters = "ensembl_gene_id",
values = df$gene,
mart = mart,
)

annotated <- merge(df, g2g, by.x="gene", by.y="ensembl_gene_id")
annotated$gene <- ifelse(annotated$external_gene_name == '', annotated$gene, annotated$external_gene_name)
annotated$external_gene_name <- NULL
write.table(annotated, snakemake@output[["symbol"]], sep='\t', row.names=F)


0 comments on commit dfd0b9c

Please sign in to comment.