forked from ParkinsonLab/MetaPro
-
Notifications
You must be signed in to change notification settings - Fork 0
/
MetaPro.py
250 lines (181 loc) · 8.81 KB
/
MetaPro.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#You should have received a copy of the GNU General Public License
#along with this program. If not, see <https://www.gnu.org/licenses/>.
#!/usr/bin/env python
from curses import meta
import sys
import os
import os.path
from argparse import ArgumentParser
from configparser import ConfigParser, ExtendedInterpolation
import multiprocessing as mp
import MetaPro_stages as mps
import time
import zipfile
import pandas as pd
import shutil
from datetime import datetime as dt
import psutil as psu
import threading as th
import queue as q
def debug_stop_check(self, stop_flag, signal):
if(stop_flag == signal):
sys.exit("paused at:", stop_flag)
def main(config_path, pair_1_path, pair_2_path, single_path, contig_path, output_folder_path, args_pack, tutorial_mode):
metapro_stage_obj = mps.mp_stage(config_path, pair_1_path, pair_2_path, single_path, contig_path, output_folder_path, args_pack, tutorial_mode)
# This is the format we use to launch each stage of the pipeline.
# We start a multiprocess that starts a subprocess.
# The subprocess is created from the commands object
# The quality filter stage
metapro_stage_obj.mp_quality_filter()
# The host read filter stage
metapro_stage_obj.mp_host_filter()
# The vector contaminant filter stage
metapro_stage_obj.mp_vector_filter()
# rRNA removal stage
metapro_stage_obj.mp_rRNA_filter()
# Duplicate repopulation
metapro_stage_obj.mp_repop()
# Assemble contigs
metapro_stage_obj.mp_assemble()
#metapro_stage_obj.mp_TA()
if(metapro_stage_obj.paths.DNA_DB_mode == "chocophlan"):
metapro_stage_obj.mp_GA_pre_scan()
#sys.exit("paused")
# GA split
metapro_stage_obj.mp_GA_split()
# GA lib check
metapro_stage_obj.mp_GA_lib_check()
# BWA gene annotation
metapro_stage_obj.mp_GA_BWA()
metapro_stage_obj.mp_GA_BWA_pp()
if(metapro_stage_obj.GA_DB_mode == "multi"):
metapro_stage_obj.mp_GA_BWA_merge()
# BLAT gene annotation
metapro_stage_obj.mp_GA_BLAT()
metapro_stage_obj.mp_GA_BLAT_pp()
metapro_stage_obj.mp_GA_BLAT_merge()
#DIAMOND gene annotation
metapro_stage_obj.mp_GA_dmd()
metapro_stage_obj.mp_GA_dmd_pp()
# final GA merge()
metapro_stage_obj.mp_GA_final_merge()
# Taxonomic annotation
metapro_stage_obj.mp_TA()
# Detect EC annotation
metapro_stage_obj.mp_EC()
# RPKM Table and Cytoscape Network
metapro_stage_obj.mp_output()
def tutorial_main(config_file, pair_1, pair_2, single, contig, output_folder, args_pack, tutorial_mode):
metapro_stage_obj = mps.mp_stage(config_file, pair_1, pair_2, single, contig, output_folder, args_pack, tutorial_mode)
if(tutorial_mode == "quality"):
# The quality filter stage
metapro_stage_obj.mp_quality_filter()
elif(tutorial_mode == "host"):
# The host read filter stage
metapro_stage_obj.mp_host_filter()
elif(tutorial_mode == "vector"):
# The vector contaminant filter stage
metapro_stage_obj.mp_vector_filter()
elif(tutorial_mode == "rRNA"):
# rRNA removal stage
metapro_stage_obj.mp_rRNA_filter()
elif(tutorial_mode == "repop"):
# Duplicate repopulation
metapro_stage_obj.mp_repop()
elif(tutorial_mode == "contigs"):
# Assemble contigs
metapro_stage_obj.mp_assemble()
elif(tutorial_mode == "GA"):
#check the contig state
metapro_stage_obj.mp_contig_statecheck()
# GA split
metapro_stage_obj.mp_GA_split()
# BWA gene annotation
metapro_stage_obj.mp_GA_BWA()
metapro_stage_obj.mp_GA_BWA_pp()
metapro_stage_obj.mp_GA_BWA_merge()
# BLAT gene annotation
metapro_stage_obj.mp_GA_BLAT()
metapro_stage_obj.mp_GA_BLAT_pp()
metapro_stage_obj.mp_GA_BLAT_merge()
#DIAMOND gene annotation
metapro_stage_obj.mp_GA_dmd()
metapro_stage_obj.mp_GA_dmd_pp()
# final GA merge()
metapro_stage_obj.mp_GA_final_merge()
elif(tutorial_mode == "TA"):
# Taxonomic annotation
metapro_stage_obj.mp_TA()
elif(tutorial_mode == "EC"):
# Detect EC annotation
metapro_stage_obj.mp_EC()
elif(tutorial_mode == "output"):
#check the contig state
metapro_stage_obj.mp_contig_statecheck()
# RPKM Table and Cytoscape Network
metapro_stage_obj.mp_output()
if __name__ == "__main__":
print("METAPRO metatranscriptomic analysis pipeline")
# This is where the code starts
# There's a few operating modes, mainly "docker", and "singularity". These modes edit the pipeline filepaths
parser = ArgumentParser(description="MetaPro - Meta-omic sequence processing and analysis pipeline"
"Version 2.2.0 © 2023")
parser.add_argument("-c", "--config", type=str, help="Path to the configureation file")
parser.add_argument("-1", "--pair1", type=str, help="Path to the file containing the forward paired-end reads in fastq format")
parser.add_argument("-2", "--pair2", type=str, help="Path to the file containing the reverse paired-end reads in fastq format")
parser.add_argument("-s", "--single", type=str, help="Path to the file containing the single-end reads in fastq format")
parser.add_argument("-con", "--contig", type=str, help="Tutorial use only: Path to the file containing the contig reads in fastq format")
parser.add_argument("-o", "--output_folder", type=str, required=True, help="Path of the folder for the output of the pipeline")
parser.add_argument("--nhost", "--no-host", action='store_true', help="Skip the host read removal step of the pipeline")
parser.add_argument("--verbose_mode", type=str, help = "Decide how to handle the interim files, Compress them, or leave them alone. Values are: keep, compress, quiet")
parser.add_argument("--tutorial", type = str, help = "tutorial operating mode for MetaPro")
args = parser.parse_args()
config_file = args.config if args.config else ""
contig = args.contig if args.contig else "None"
pair_1 = args.pair1 if args.pair1 else ""
pair_2 = args.pair2 if args.pair2 else ""
single = args.single if args.single else ""
output_folder = args.output_folder
no_host = args.nhost if args.nhost else False
verbose_mode = args.verbose_mode if args.verbose_mode else "quiet"
tutorial_mode = args.tutorial if args.tutorial else "none"
if(tutorial_mode == "none"):
if (args.pair1 and not args.pair2) or (args.pair2 and not args.pair1):
print("You must specify both forward and reverse reads for a paired-end run")
sys.exit()
elif args.single and (args.pair1 or args.pair2):
print("You cannot specify both paired-end and single-end reads in a single run.")
sys.exit()
if not (os.path.exists(output_folder)):
print("output folder does not exist. Now building directory.")
os.makedirs(output_folder)
os.chdir(output_folder)
config = ConfigParser(interpolation = ExtendedInterpolation())
if args.config:
config.read(config_file)
if not args.pair1 and not args.pair2 and not args.single:
pair_1 = config["Sequences"]["pair1"] if config["Sequences"]["pair1"] else ""
pair_2 = config["Sequences"]["pair2"] if config["Sequences"]["pair2"] else ""
single = config["Sequences"]["single"] if config["Sequences"]["single"] else ""
if pair_1 == "" and pair_2 == "" and single == "":
print("You must specify paired-end or single-end reads as input for the pipeline.")
sys.exit()
args_pack = dict()
args_pack["no_host"] = no_host
args_pack["verbose_mode"] = verbose_mode
print("=====================================")
print("no-host:", no_host)
print("verbose_mode:", verbose_mode)
if (tutorial_mode != "none"):
print("working in tutorial mode:", tutorial_mode)
tutorial_main(config_file, pair_1, pair_2, single, contig, output_folder, args_pack, tutorial_mode)
else:
main(config_file, pair_1, pair_2, single, contig, output_folder, args_pack, tutorial_mode)