Skip to content

Commit

Permalink
QC (#14)
Browse files Browse the repository at this point in the history
* Added qc ruleset. Do QC based on rseqc and multiqc.

* Changed qc.smk to make rules not dependent on touched files but real outputs; improved readability.

* Working but still with custom script for convreting gtf to bed12

* Fixed multiqc rule to include rseqc_junction_annotation logs.

* Updated qc.smk to make it standardized.; removed multiqc env; removed scritps/gtf2bed.pl; added scripts/gtf2bed.py for converting gtf to bed12.

* Reworked gtf2bed.py tp make it snakemake centric.

* Update rseqc.yaml

* Update gffutils.yaml

* Update rseqc.yaml

* Use output file names in script.

* Use context for file obj.
  • Loading branch information
sschmeier authored and johanneskoester committed Mar 13, 2019
1 parent c0a1740 commit e078b1a
Show file tree
Hide file tree
Showing 6 changed files with 195 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ This workflow performs a differential expression analysis with STAR and Deseq2.
## Authors

* Johannes Köster (@johanneskoester), https://koesterlab.github.io
* Sebastian Schmeier (@sschmeier), https://sschmeier.com

## Usage

Expand Down
4 changes: 3 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ rule all:
expand(["results/diffexp/{contrast}.diffexp.tsv",
"results/diffexp/{contrast}.ma-plot.svg"],
contrast=config["diffexp"]["contrasts"]),
"results/pca.svg"
"results/pca.svg",
"qc/multiqc_report.html"


##### setup singularity #####
Expand All @@ -45,3 +46,4 @@ include: "rules/common.smk"
include: "rules/trim.smk"
include: "rules/align.smk"
include: "rules/diffexp.smk"
include: "rules/qc.smk"
5 changes: 5 additions & 0 deletions envs/gffutils.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
channels:
- conda-forge
- bioconda
dependencies:
- bioconda::gffutils=0.9
5 changes: 5 additions & 0 deletions envs/rseqc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
channels:
- conda-forge
- bioconda
dependencies:
- bioconda::rseqc=2.6.4
165 changes: 165 additions & 0 deletions rules/qc.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
## RSEQC

rule rseqc_gtf2bed:
input:
config["ref"]["annotation"]
output:
bed="qc/rseqc/annotation.bed",
db=temp("qc/rseqc/annotation.db")
log:
"logs/rseqc_gtf2bed.log"
conda:
"../envs/gffutils.yaml"
script:
"../scripts/gtf2bed.py"


rule rseqc_junction_annotation:
input:
bam="star/{sample}-{unit}/Aligned.out.bam",
bed="qc/rseqc/annotation.bed"
output:
"qc/rseqc/{sample}-{unit}.junctionanno.junction.bed"
priority: 1
log:
"logs/rseqc/rseqc_junction_annotation/{sample}-{unit}.log"
params:
extra=r"-q 255", # STAR uses 255 as a score for unique mappers
prefix="qc/rseqc/{sample}-{unit}.junctionanno"
conda:
"../envs/rseqc.yaml"
shell:
"junction_annotation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} "
"> {log[0]} 2>&1"


rule rseqc_junction_saturation:
input:
bam="star/{sample}-{unit}/Aligned.out.bam",
bed="qc/rseqc/annotation.bed"
output:
"qc/rseqc/{sample}-{unit}.junctionsat.junctionSaturation_plot.pdf"
priority: 1
log:
"logs/rseqc/rseqc_junction_saturation/{sample}-{unit}.log"
params:
extra=r"-q 255",
prefix="qc/rseqc/{sample}-{unit}.junctionsat"
conda:
"../envs/rseqc.yaml"
shell:
"junction_saturation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} "
"> {log} 2>&1"


rule rseqc_stat:
input:
"star/{sample}-{unit}/Aligned.out.bam",
output:
"qc/rseqc/{sample}-{unit}.stats.txt"
priority: 1
log:
"logs/rseqc/rseqc_stat/{sample}-{unit}.log"
conda:
"../envs/rseqc.yaml"
shell:
"bam_stat.py -i {input} > {output} 2> {log}"


rule rseqc_infer:
input:
bam="star/{sample}-{unit}/Aligned.out.bam",
bed="qc/rseqc/annotation.bed"
output:
"qc/rseqc/{sample}-{unit}.infer_experiment.txt"
priority: 1
log:
"logs/rseqc/rseqc_infer/{sample}-{unit}.log"
conda:
"../envs/rseqc.yaml"
shell:
"infer_experiment.py -r {input.bed} -i {input.bam} > {output} 2> {log}"


rule rseqc_innerdis:
input:
bam="star/{sample}-{unit}/Aligned.out.bam",
bed="qc/rseqc/annotation.bed"
output:
"qc/rseqc/{sample}-{unit}.inner_distance_freq.inner_distance.txt"
priority: 1
log:
"logs/rseqc/rseqc_innerdis/{sample}-{unit}.log"
params:
prefix="qc/rseqc/{sample}-{unit}.inner_distance_freq"
conda:
"../envs/rseqc.yaml"
shell:
"inner_distance.py -r {input.bed} -i {input.bam} -o {params.prefix} > {log} 2>&1"


rule rseqc_readdis:
input:
bam="star/{sample}-{unit}/Aligned.out.bam",
bed="qc/rseqc/annotation.bed"
output:
"qc/rseqc/{sample}-{unit}.readdistribution.txt"
priority: 1
log:
"logs/rseqc/rseqc_readdis/{sample}-{unit}.log"
conda:
"../envs/rseqc.yaml"
shell:
"read_distribution.py -r {input.bed} -i {input.bam} > {output} 2> {log}"


rule rseqc_readdup:
input:
"star/{sample}-{unit}/Aligned.out.bam"
output:
"qc/rseqc/{sample}-{unit}.readdup.DupRate_plot.pdf"
priority: 1
log:
"logs/rseqc/rseqc_readdup/{sample}-{unit}.log"
params:
prefix="qc/rseqc/{sample}-{unit}.readdup"
conda:
"../envs/rseqc.yaml"
shell:
"read_duplication.py -i {input} -o {params.prefix} > {log} 2>&1"


rule rseqc_readgc:
input:
"star/{sample}-{unit}/Aligned.out.bam"
output:
"qc/rseqc/{sample}-{unit}.readgc.GC_plot.pdf"
priority: 1
log:
"logs/rseqc/rseqc_readgc/{sample}-{unit}.log"
params:
prefix="qc/rseqc/{sample}-{unit}.readgc"
conda:
"../envs/rseqc.yaml"
shell:
"read_GC.py -i {input} -o {params.prefix} > {log} 2>&1"


rule multiqc:
input:
expand("star/{unit.sample}-{unit.unit}/Aligned.out.bam", unit=units.itertuples()),
expand("qc/rseqc/{unit.sample}-{unit.unit}.junctionanno.junction.bed", unit=units.itertuples()),
expand("qc/rseqc/{unit.sample}-{unit.unit}.junctionsat.junctionSaturation_plot.pdf", unit=units.itertuples()),
expand("qc/rseqc/{unit.sample}-{unit.unit}.infer_experiment.txt", unit=units.itertuples()),
expand("qc/rseqc/{unit.sample}-{unit.unit}.stats.txt", unit=units.itertuples()),
expand("qc/rseqc/{unit.sample}-{unit.unit}.inner_distance_freq.inner_distance.txt", unit=units.itertuples()),
expand("qc/rseqc/{unit.sample}-{unit.unit}.readdistribution.txt", unit=units.itertuples()),
expand("qc/rseqc/{unit.sample}-{unit.unit}.readdup.DupRate_plot.pdf", unit=units.itertuples()),
expand("qc/rseqc/{unit.sample}-{unit.unit}.readgc.GC_plot.pdf", unit=units.itertuples()),
expand("logs/rseqc/rseqc_junction_annotation/{unit.sample}-{unit.unit}.log", unit=units.itertuples())
output:
"qc/multiqc_report.html"
log:
"logs/multiqc.log"
wrapper:
"0.31.1/bio/multiqc"
16 changes: 16 additions & 0 deletions scripts/gtf2bed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import gffutils

db = gffutils.create_db(snakemake.input[0],
dbfn=snakemake.output.db,
force=True,
keep_order=True,
merge_strategy='merge',
sort_attribute_values=True,
disable_infer_genes=True,
disable_infer_transcripts=True)

with open(snakemake.output.bed, 'w') as outfileobj:
for tx in db.features_of_type('transcript', order_by='start'):
bed = [s.strip() for s in db.bed12(tx).split('\t')]
bed[3] = tx.id
outfileobj.write('{}\n'.format('\t'.join(bed)))

0 comments on commit e078b1a

Please sign in to comment.