import os

# exist_ok avoids a crash on reruns; os is also needed for os.listdir() below.
os.makedirs("data", exist_ok=True)
# Specify NCBI's forward and reverse fastq file naming scheme for snakemake
DIRECTION = ["1", "2"]
#TODO: Fix this so that SAMPLES is taken from the list of samples with both forward and
#      reverse reads. This currently breaks rule 'srrMunch' when ANY sample in the accession
#      list has no "_1.fastq" or "_2.fastq" output files. Better to run srrMunch first (maybe
#      not even as a snakemake rule, given its non-deterministic outcomes), and then establish
#      the SAMPLES variable from a subset of the files in "data", like maybe all _1.fastq SRRs?
#
#NOTE: Okay, now we're getting somewhere. split-files works for importing, but will mess up
#      vsearch join_pairs down the line ("more forward than reverse reads" errors). BUT, if
#      the only SRRs included have both _1 and _2 under the old option (the split-3 default,
#      i.e. no "-S" flag), THEN everything will work. So...the initial assumption is probably
#      correct: run the SrrMunch script first, then get the SAMPLES var from the resulting files.
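# A Snakemake-native alternative (sketch, not wired in): make srrMunch a
# checkpoint so the DAG is re-evaluated after the downloads land, then derive
# the sample list from whatever actually hit the disk:
#    checkpoint srrMunch: ...
#    def paired_samples(wildcards):
#        checkpoints.srrMunch.get()  # blocks until the checkpoint has run
#        return sorted({f.split("_")[0] for f in os.listdir("data")
#                       if f.endswith("_2.fastq")})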
#
#NOTE: Update this for use with ENA via fastq-dl (relies on metadata tsv rather than accession
# list, though you *can* just give it the PRJNA number!)
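# Rough sketch of the ENA route (assumption: the metadata TSV is an ENA run
# report with a 'run_accession' column):
#    import csv
#    with open("ena_metadata.tsv") as f:
#        ALL_SRRs = [row["run_accession"] for row in csv.DictReader(f, delimiter="\t")]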
# Generate the sample list by reading the NCBI accession list file line by line
with open("SRR_Acc_List.txt") as f:
    ALL_SRRs = [line.rstrip("\n") for line in f]
#configfile: "config.yaml"
rule all:
    input:
        "table-dn-99/feature-table.tsv"
    threads: 7
# Download fastqs from NCBI, reading from SRR_Acc_List.txt
# TODO: Modify to a single tar file as output
rule srrMunch:
    input: "SRR_Acc_List.txt"
    output: "data/fastqs.tar"
    log: "logs/srrMunch/output.log"
    shell:
        # NOTE: Could just trash the non-biological reads (i.e. the third fastq
        # per SRR, the one without the _1 or _2 suffix); see the sketch below.
        """
        (scripts/SrrMunch2.sh {input} data) > {log} 2>&1
        """
# Generate a list of only the SRRs that have paired data
ALL_FASTQS = os.listdir("data")
SAMPLES = [x.split("_")[0] for x in ALL_FASTQS if x.endswith("_2.fastq")]
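# A stricter variant (sketch, not wired in): keep only SRRs where BOTH mates
# exist, which is what vsearch join-pairs ultimately requires.
#    SAMPLES = sorted(
#        x.split("_")[0] for x in ALL_FASTQS
#        if x.endswith("_1.fastq") and x.split("_")[0] + "_2.fastq" in ALL_FASTQS
#    )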
#ruleorder: generate_manifest_paired > generate_manifest_merged
#TODO: Create an input function to determine whether you're using
# "data/{sample}_{direction}.fastq" or "data/{sample}.fastq"
#
#TODO: Modify manifest_gen.py to take an input list (in this case, {wildcards.sample}).
#      Or alternatively, filter the list of SAMPLES vs ALL_SRRs, and place those files
#      in a new directory to use as the input to the current version of manifest_gen.py
# Filter SRRs by
# Create qiime2 manifest
# TODO: modify to expect the data/fastqs.tar output of srrMunch
rule generate_manifest:
    input:
        expand("data/{sample}_{direction}.fastq", sample=SAMPLES, direction=DIRECTION)
    output: "manifest.tsv"
    log: "logs/generate_manifest/output.log"
    shell:
        "(python scripts/manifest_gen.py -i data -o {output}) > {log} 2>&1"
# Import seqs via manifest
rule import:
    input: "manifest.tsv"
    output: "test-paired-end-demux.qza"
    log: "logs/import/output.log"
    shell:
        "(scripts/import-paired.sh) > {log} 2>&1"
#        """
#        (qiime tools import \
#            --type 'SampleData[PairedEndSequencesWithQuality]' \
#            --input-path {input} \
#            --output-path {output} \
#            --input-format PairedEndFastqManifestPhred33V2) > {log} 2>&1
#        """
# Merge pairs using q2-vsearch join-pairs
rule merge_pairs:
    input: "test-paired-end-demux.qza"
    output: temp("test-merged.qza")
    log: "logs/merge_pairs/output.log"
    shell:
        "(scripts/join_pairs.sh) > {log} 2>&1"
#        """
#        (qiime vsearch join-pairs \
#            --i-demultiplexed-seqs {input} \
#            --o-joined-sequences {output}) > {log} 2>&1
#        """
# Trim Primers
#rule primer_trim:
#    input: "test-merged.qza"
#    output: "trimmed_demux.qza"
#    log: "logs/primer_trim/output.log"
#    params:
#        fwd=config["primers"]["FWD"],
#        rev=config["primers"]["REV"]
#    shell:
#        """
#        (qiime cutadapt trim-paired \
#            --i-demultiplexed-sequences {input} \
#            --p-adapter-f {params.fwd} \
#            --p-adapter-r {params.rev} \
#            --p-discard-untrimmed \
#            --o-trimmed-sequences {output}) > {log} 2>&1
#        """
#
#rule offset:
#    input: "trimmed_demux.qza"
#    output: "offset_demux.qza"
#    log: "logs/offset_trim/output.log"
#    params:
#        # NOTE: currently a copy of primer_trim; per the TODO below, these
#        # should come from config["offset"] once the offset logic is settled.
#        fwd=config["primers"]["FWD"],
#        rev=config["primers"]["REV"]
#    shell:
#        """
#        (qiime cutadapt trim-paired \
#            --i-demultiplexed-sequences {input} \
#            --p-adapter-f {params.fwd} \
#            --p-adapter-r {params.rev} \
#            --p-discard-untrimmed \
#            --o-trimmed-sequences {output}) > {log} 2>&1
#        """
# TODO: Add input function to output "trimmed_demux.qza" or "offset_demux.qza",
# depending on whether config["offset"]["FWD"] or config["offset"]["REV"]
# are non-zero. Let this function also determine parameters in trim and
# offset rules if possible. Otherwise have 4 separate rules for 5' and 3'
# cutadapt trim and offset.
#def offset_needed(wildcards):  # input functions must accept a wildcards arg
#    if config["offset"]["FWD"] or config["offset"]["REV"]:
#        return "offset_demux.qza"
#    else:
#        return "trimmed_demux.qza"
rule derep:
    input: "test-merged.qza"
    output:
        temp("table.qza"),
        temp("rep-seqs.qza")
    log: "logs/derep/output.log"
    shell:
        "(scripts/derep.sh) > {log} 2>&1"
rule de_novo:
    input:
        table="table.qza",
        seqs="rep-seqs.qza"
    output:
        temp("table-dn-99.qza"),
        temp("rep-seqs-dn-99.qza")
    log: "logs/de_novo/output.log"
    shell:
        "(scripts/de-novo.sh) > {log} 2>&1"
rule q2_export:
    input:
        "table-dn-99.qza"
    output:
        out_dir=directory("table-dn-99"),
        biom=temp("table-dn-99/feature-table.biom")
    shell:
        """
        scripts/q2_export.sh {input} {output.out_dir}
        """
rule biom_convert:
    input:
        "table-dn-99/feature-table.biom"
    output:
        "table-dn-99/feature-table.tsv"
    log: "logs/biom_convert/output.log"
    shell:
        """
        (biom convert -i {input} -o {output} --to-tsv) > {log} 2>&1
        """