-
Notifications
You must be signed in to change notification settings - Fork 0
/
blobblurb.py
126 lines (101 loc) · 5.56 KB
/
blobblurb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/python
import json
import pandas as pd
import argparse
import numpy as np, scipy.stats as st
import os
import re
parser = argparse.ArgumentParser()
parser.add_argument("blobdir", help="The blob directory containing bestsumorder and identifiers files in .json format")
parser.add_argument("buscoTable", help="The full table output of BUSCO (usually called: full_table.tsv for BUSCO5 or full_table_{outprefix}.tsv for BUSCO3)")
args = parser.parse_args()

# BUSCO full table: first three columns are busco id / status / contig name.
df_buscos = pd.read_csv(args.buscoTable, sep='\t', comment='#', usecols=[0, 1, 2], header=None)
df_buscos.columns = ["busco id", "status", "# record"]

# Locate the coverage, identifier and length files with a single directory
# scan (the original scanned the directory three times). If several files
# match a pattern, the last one in listing order wins, as before.
readCov = None
identifiers = None
lengths = None
for filename in os.listdir(args.blobdir):
    filepath = os.path.join(args.blobdir, filename)
    if re.match(r".*reads_cov\.json", filename):
        with open(filepath, 'r') as f:
            readCov = json.load(f)
    elif re.match(r"identifiers\.json", filename):
        with open(filepath, 'r') as f:
            identifiers = json.load(f)
    elif re.match(r"length\.json", filename):
        with open(filepath, 'r') as f:
            lengths = json.load(f)

coverage = pd.json_normalize(readCov, record_path=['values'])
records = pd.json_normalize(identifiers, record_path=['values'])
record_lengths = pd.json_normalize(lengths, record_path=['values'])
# BUSCO contig names must be strings to merge against the blobdir records.
df_buscos["# record"] = df_buscos["# record"].apply(str)

# Load the bestsumorder taxonomy files for each rank. Insertion order matters:
# it fixes the order in which taxon columns are joined onto the output table.
taxa = {}
for rank in ("phylum", "class", "order", "family", "genus", "species"):
    with open(os.path.join(args.blobdir, "bestsumorder_" + rank + ".json"), 'r') as f:
        taxa[rank] = json.load(f)
path = args.blobdir
def assignContaminantstoSpecies(taxa_dict, ctgNames, ctgLengths, cov):
    """Build a per-contig table with taxonomy columns and a coverage class.

    Parameters
    ----------
    taxa_dict : dict mapping rank name (e.g. "phylum") to a blobdir category
        dict with "keys" (taxon names) and "values" (per-contig indexes into
        "keys").
    ctgNames, ctgLengths, cov : single-column DataFrames (as produced by
        pd.json_normalize on the blobdir files), aligned by position.

    Returns
    -------
    DataFrame with columns "# record", "length", "coverage", one column per
    rank in taxa_dict, and "coverage class" ('low'/'medium'/'high' relative
    to a 95% t-interval on mean coverage).
    """
    df_id = pd.concat([ctgNames, ctgLengths, cov], axis=1)
    df_id.columns = ["# record", "length", "coverage"]
    for taxon, file in taxa_dict.items():
        # BUG FIX: the original wrapped this body in `for key in file:` without
        # using `key`, so it ran once per JSON key ("keys" and "values") and the
        # second `df_id.join(df_vk)` raised ValueError on the duplicate column.
        # Each rank only needs to be decoded and joined once.
        values = pd.json_normalize(file, record_path=['values'])
        values = values.set_axis(['value'], axis=1)
        keys = pd.json_normalize(file, record_path=['keys'])
        keys = keys.set_axis([taxon], axis=1)
        keys['value'] = keys.index
        # Map each contig's integer code to its taxon name via the keys index.
        df_vk = values.join(keys, lsuffix='_value', rsuffix='_key', on='value')[taxon]
        df_id = df_id.join(df_vk)
    # 95% confidence interval around mean coverage; contigs far outside it are
    # flagged as suspiciously low/high covered.
    low, high = st.t.interval(0.95, len(df_id["coverage"]) - 1,
                              loc=np.mean(df_id["coverage"]),
                              scale=st.sem(df_id["coverage"]))
    lessthanhalflow = low / 2
    morethantwicehigh = high * 2
    conditions = [
        (df_id['coverage'] <= lessthanhalflow),
        (df_id['coverage'] > lessthanhalflow) & (df_id['coverage'] <= morethantwicehigh),
        (df_id['coverage'] > morethantwicehigh)
    ]
    values = ['low', 'medium', 'high']
    df_id['coverage class'] = np.select(conditions, values)
    df_id["# record"] = df_id["# record"].apply(str)
    return df_id
def addBUSCOs(df, buscos):
    """Append per-contig counts of Complete and Duplicated BUSCOs to df.

    Parameters
    ----------
    df : DataFrame with a "# record" column of contig names (strings).
    buscos : DataFrame with "status" and "# record" columns (the parsed
        BUSCO full table).

    Returns
    -------
    A new DataFrame (the input is not mutated, unlike the original which
    reset df's index in place) with integer "complete BUSCOs" and
    "duplicated BUSCOs" columns; contigs with no BUSCO hits get 0.
    """
    result = df.reset_index(drop=True)
    for status, col in (("Complete", "complete BUSCOs"),
                        ("Duplicated", "duplicated BUSCOs")):
        # Count BUSCOs of this status per contig, then left-merge so contigs
        # without any hits are kept and filled with 0.
        counts = (buscos[buscos["status"] == status]
                  .groupby("# record").size()
                  .rename(col).reset_index())
        result = pd.merge(left=result, right=counts, how="left", on="# record")
        result[col] = result[col].fillna(0).astype(int)
    return result
out1 = assignContaminantstoSpecies(taxa, records, record_lengths, coverage)
out2 = addBUSCOs(out1, df_buscos)
# Fixed column order for the output table.
speciesTbl = out2[["# record", "length", "coverage", "coverage class", "complete BUSCOs", "duplicated BUSCOs", "phylum", "class", "order", "family", "genus", "species"]]
# BUG FIX: split('/')[-1] returns '' when the blobdir path has a trailing
# slash, which produced the file "_blobblurbout.tsv"; normpath+basename
# handles both forms.
prefix = os.path.basename(os.path.normpath(args.blobdir))
species_outfile = prefix + '_blobblurbout.tsv'
speciesTbl.to_csv(species_outfile, sep='\t', index=False, header=True)
print("Blobblurb finished blurbing.")