-
Notifications
You must be signed in to change notification settings - Fork 8
/
ditaxa.py
163 lines (134 loc) · 8.19 KB
/
ditaxa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
__author__ = "Ehsaneddin Asgari"
__license__ = "Apache 2"
__version__ = "1.0.0"
__maintainer__ = "Ehsaneddin Asgari"
__email__ = "[email protected]"
__project__ = "LLP - DiTaxa"
__website__ = "https://llp.berkeley.edu/ditaxa/"
import argparse
import os
import os.path
import sys
from main.DiTaxa import DiTaxaWorkflow
from utility.file_utility import FileUtility
import warnings
def _build_parser():
    '''Build and return the argparse parser for the DiTaxa CLI.'''
    parser = argparse.ArgumentParser()
    # input directory #################################################################################################
    parser.add_argument('--indir', action='store', dest='input_dir', default=False, type=str,
                        help='directory of 16S rRNA samples')
    # file type #######################################################################################################
    parser.add_argument('--ext', action='store', dest='filetype', default='fastq', type=str,
                        help='extension of the sample files, the default is fastq')
    # to override the previous files or to continue ####################################################################
    parser.add_argument('--override', action='store', dest='override', default=1, type=int,
                        help='Override the existing files?')
    # output directory #################################################################################################
    parser.add_argument('--outdir', action='store', dest='output_dir', default=False, type=str,
                        help="directory for storing the output files, if doesn't exist will be created.")
    # dbname ################################################################################################
    parser.add_argument('--dbname', action='store', dest='dbname', default=False, type=str,
                        help='dataset name: to be used for figures and output creation!')
    # cores ################################################################################################
    parser.add_argument('--cores', action='store', dest='cores', default=4, type=int,
                        help='Number of cores to be used, default is 4')
    # label filename #################################################################################################
    parser.add_argument('--fast2label', action='store', dest='fast2label', default=False, type=str,
                        help='tabular mapping between fatsa/fastq file names and their labels')
    # blast path #################################################################################################
    parser.add_argument('--blastn', action='store', dest='blastn', default="ncbi-blast/bin/", type=str,
                        help='path to the bin directory of blastn; If you run build.sh the default parameter works. Otherwise get the latest from ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/')
    # label values ##################################################################################
    parser.add_argument('--phenomap', action='store', dest='phenomap', default=None, type=str,
                        help='pair of comma separated label:[0 or 1]. e.g., untreated_disease:1, treated_diesease:0, healthy:0, ..')
    # label values ##################################################################################
    parser.add_argument('--phenoname', action='store', dest='phenoname', default=None, type=str,
                        help='Phenotype setting name, if not given the labeling scheme will be used.')
    # generate heatmap ###############################################################################
    parser.add_argument('--heatmap', action='store', dest='heatmap', default="positive_samples:negative_samples", type=str,
                        help='to generate heatmap of top 100 markers: positive_title:negative_title')
    # generate excel file ###############################################################################
    parser.add_argument('--excel', action='store', dest='excel', default=1, type=int,
                        help='to generate excel output')
    ######################################################################################################
    # NOTE: with type=str the literal False in `choices` can never be produced
    # from the command line; it only serves as the "not requested" default
    # (argparse does not validate the default against `choices`).
    parser.add_argument('--classify', action='store', dest='classify', type=str, default=False,
                        choices=[False, 'RF', 'SVM', 'DNN', 'LR'],
                        help='train_predictor: choice of classifier from RF, SVM, LR, DNN')
    parser.add_argument('--batchsize', action='store', dest='batch_size', type=int, default=10,
                        help='train_predictor-model/DNN: batch size for deep learning')
    parser.add_argument('--gpu_id', action='store', dest='gpu_id', type=str, default='0',
                        help='train_predictor-model/DNN: GPU id for deep learning')
    parser.add_argument('--epochs', action='store', dest='epochs', type=int, default=100,
                        help='train_predictor-model/DNN: number of epochs for deep learning')
    parser.add_argument('--arch', action='store', dest='dnn_arch', type=str, default='1024,0.2,512',
                        help='train_predictor-model/DNN: The comma separated definition of neural network layers connected to eahc other, you do not need to specify the input and output layers, values between 0 and 1 will be considered as dropouts')
    return parser


def checkArgs(args):
    '''
    Validate the command-line arguments and, when they are valid, run the
    whole DiTaxa workflow (NPE training, representation, biomarker
    extraction and optional classification).

    Parameters
    ----------
    args : list
        Unused; kept for backward compatibility with existing callers.
        argparse reads sys.argv directly.

    Returns
    -------
    str or None
        A non-empty error message when validation fails (truthy, so the
        caller can test `if err:`); None after a successful run.
    '''
    err = ""
    parser = _build_parser()
    parsedArgs = parser.parse_args()

    # the input directory and the label file must both exist / be accessible
    if not os.access(parsedArgs.input_dir, os.F_OK):
        return err + "\nError: Permission denied or could not find the directory!"
    if not os.access(parsedArgs.fast2label, os.F_OK):
        return err + "\nError: Permission to the label file is denied!"

    # --phenomap is mandatory for the rest of the pipeline; previously a
    # missing value surfaced as an AttributeError hidden by a bare except.
    if not parsedArgs.phenomap:
        return err + "\nWrong format for labels!"
    try:
        label_dict = dict()
        pheno_temp = []
        # each entry is "label:value" with an integer value (0 or 1)
        for entry in parsedArgs.phenomap.split(','):
            label, value = entry.split(':')
            value = int(value)
            label_dict[label] = value
            pheno_temp.append('@'.join([label, str(value)]))
        pheno_temp = '#'.join(pheno_temp)
        # fall back to the labeling scheme when no explicit name was given
        phenoname = parsedArgs.phenoname if parsedArgs.phenoname else pheno_temp
    except ValueError:
        # raised by a malformed "label:value" pair or a non-integer value
        return err + "\nWrong format for labels!"

    if parsedArgs.heatmap:
        # the heatmap spec must be exactly "positive_title:negative_title"
        if not len(parsedArgs.heatmap.split(':')) == 2:
            return err + "\nThe heatmap inputs is incorrect!"

    if not os.access(parsedArgs.blastn, os.F_OK):
        print('The blast path is incorrect..')
        sys.exit(1)

    # build and run the workflow; 50000/5000/-1 are the project's fixed
    # NPE vocabulary/sample-size/onehot settings
    Pipeline = DiTaxaWorkflow(parsedArgs.input_dir,
                              parsedArgs.filetype, parsedArgs.output_dir, parsedArgs.dbname,
                              50000, 5000, -1, parsedArgs.blastn,
                              num_p=parsedArgs.cores, override=parsedArgs.override)
    Pipeline.train_npe()
    Pipeline.representation_npe()

    # map each sample file name (basename of the first column) to its label
    labels = {line.split()[0].split('/')[-1]: line.split()[1]
              for line in FileUtility.load_list(parsedArgs.fast2label)}
    if parsedArgs.heatmap:
        pos_label, neg_label = parsedArgs.heatmap.split(':')
        Pipeline.biomarker_extraction(labels, label_dict, phenoname, excel=parsedArgs.excel,
                                      pos_label=pos_label, neg_label=neg_label)
    else:
        Pipeline.biomarker_extraction(labels, label_dict, phenoname, excel=parsedArgs.excel)

    if parsedArgs.classify:
        print('Classification requested..')
        if parsedArgs.classify == 'DNN':
            # deep learning: layer sizes > 1 are units, values in (0, 1) are dropouts
            arch = [int(layer) if float(layer) > 1 else float(layer)
                    for layer in parsedArgs.dnn_arch.split(',')]
            Pipeline.classify_DNN(phenoname, arch, parsedArgs.gpu_id,
                                  parsedArgs.batch_size, parsedArgs.epochs)
        elif parsedArgs.classify in ['SVM', 'RF', 'LR']:
            # classical models: SVM, Random Forest, Logistic Regression
            Pipeline.classify_classic(phenoname, parsedArgs.classify, parsedArgs.cores)
        else:
            return "\nNot able to recognize the model!"
if __name__ == '__main__':
    # silence library warnings for command-line usage
    warnings.filterwarnings('ignore')
    err = checkArgs(sys.argv)
    if err:
        print(err)
        # exit with a non-zero status so shell callers can detect failure
        # (the original called exit() which reported success)
        sys.exit(1)