-
Notifications
You must be signed in to change notification settings - Fork 0
/
filter_RNAseq_samples.py
170 lines (145 loc) · 4.95 KB
/
filter_RNAseq_samples.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
### Boas Pucker ###
### [email protected] ###
### v0.4 ###
__usage__ = """
python3 filter_RNAseq_samples.py
--tpms <TPM_FILE>
--counts <COUNT_FILE>
--out <OUTPUT_FILE>
--min <MIN_PERCENT_EXPRESSION_ON_TOP100>
--max <MAX_PERCENT_EXPRESSION_ON_TOP100>
optional:
--mincounts <MIN_READ_NUMBER>[1000000]
--black <ID_BLACK_LIST>
"""
import os, sys, glob
import matplotlib.pyplot as plt
# --- end of imports --- #
def load_all_TPMs( exp_file ):
"""! @brief load all values from given TPM file """
data = {}
genes = []
with open( exp_file, "r" ) as f:
headers = f.readline().strip()
if "\t" in headers:
headers = headers.split('\t')
else:
headers = [ headers ]
if headers[0] == "gene":
headers = headers[1:]
elif headers[0] == exp_file.split('/')[-1][:10]:
headers = headers[1:]
for header in headers:
data.update( { header: [] } )
line = f.readline()
while line:
parts = line.strip().split('\t')
genes.append( parts[0] )
for idx, val in enumerate( parts[1:] ):
data[ headers[ idx ] ].append( float( val ) )
line = f.readline()
return data, genes
def load_black_IDs( black_list_file ):
"""! @brief load IDs from given black list """
black_list = {}
with open( black_list_file, "r" ) as f:
line = f.readline()
while line:
if len( line ) > 3:
black_list.update( { line.strip(): None } )
line = f.readline()
return black_list
def main( arguments ):
"""! @brief run everything """
tpm_file = arguments[ arguments.index('--tpms')+1 ]
count_file = arguments[ arguments.index('--counts')+1 ]
output_file = arguments[ arguments.index('--out')+1 ]
if '--min' in arguments:
min_cutoff = int( arguments[ arguments.index('--min')+1 ] )
else:
min_cutoff = 10 #value in percent
if '--max' in arguments:
max_cutoff = int( arguments[ arguments.index('--max')+1 ] )
else:
max_cutoff = 80
if '--mincounts' in arguments:
min_counts = int( arguments[ arguments.index('--mincounts')+1 ] )
else:
min_counts = 1000000
if '--black' in arguments:
black_list_file = arguments[ arguments.index('--black')+1 ]
black_list = load_black_IDs( black_list_file )
else:
black_list = {}
# --- run analysis of all data in folder/file --- #
doc_file = output_file + ".doc"
valid_samples = []
with open( doc_file, "w" ) as out:
out.write( "SampleName\tPercentageOfTop100\tPercentageOfTop500\tPercentageOfTop1000\n" )
TPM_data, genes = load_all_TPMs( tpm_file )
count_data, genes = load_all_TPMs( count_file )
for key in sorted( list( TPM_data.keys() ) ):
new_line = [ key ]
selection = sorted( TPM_data[ key ] )
counts = sum( count_data[ key ] ) #calculate counts per library
if counts >= min_counts: #check for sufficient library size
try: #check for ID presence on black list
black_list[ key ]
new_line.append( "ID on black list" )
out.write( "\t".join( list( map( str, new_line ) ) ) + "\n" )
except KeyError:
try:
val = 100.0 * sum( selection[-100:] ) / sum( selection )
except ZeroDivisionError:
val = 0
new_line.append( val )
if min_cutoff < val < max_cutoff:
valid_samples.append( key )
if len( selection ) > 500 and val > 0:
new_line.append( 100.0 * sum( selection[-500:] ) / sum( selection ) )
else:
new_line.append( "n/a" )
if len( selection ) > 1000 and val > 0:
new_line.append( 100.0 * sum( selection[-1000:] ) / sum( selection ) )
else:
new_line.append( "n/a" )
out.write( "\t".join( list( map( str, new_line ) ) ) + "\n" )
else:
new_line.append( "insufficient counts: " + str( counts ) )
out.write( "\t".join( list( map( str, new_line ) ) ) + "\n" )
print ( "number of valid sample: " + str( len( valid_samples ) ) )
print ( "number of invalid sample: " + str( len( TPM_data.keys() ) - len( valid_samples ) ) )
# --- generate output file --- #
if len( valid_samples ) > 0:
with open( output_file, "w" ) as out:
out.write( "gene\t" + "\t".join( valid_samples )+ "\n" )
for idx, gene in enumerate( genes ):
new_line = [ gene ]
for sample in valid_samples:
new_line.append( TPM_data[ sample ][ idx ] )
out.write( "\t".join( list( map( str, new_line ) ) ) + "\n" )
else:
print ( "WARNING: no valid samples in data set!" )
# --- generate figure --- #
fig_file = output_file + ".pdf"
values = []
with open( doc_file, "r" ) as f:
f.readline() #remove header
line = f.readline()
while line:
parts = line.strip().split('\t')
try:
values.append( float( parts[1] ) )
except ValueError:
pass
line = f.readline()
values = [ x for x in values if str(x) != 'nan' ]
fig, ax = plt.subplots()
ax.hist( values, bins=100, color="green" )
ax.set_xlabel( "Percentage of expression on top100 genes" )
ax.set_ylabel( "Number of analyzed samples" )
fig.savefig( fig_file )
if '--tpms' in sys.argv and '--counts' in sys.argv and '--out' in sys.argv:
main( sys.argv )
else:
sys.exit( __usage__ )