Skip to content

Commit

Permalink
bug: wrong copy of assemble_libs uploaded
Browse files Browse the repository at this point in the history
correct code added
  • Loading branch information
billytaj committed Apr 17, 2023
1 parent bd77db3 commit 5ddd9d9
Showing 1 changed file with 66 additions and 120 deletions.
186 changes: 66 additions & 120 deletions Scripts/ga_pre_scan_assemble_libs.py
Original file line number Diff line number Diff line change
@@ -1,137 +1,83 @@
#parse the wevote results. get the taxa. get the class. pull the libs from our chocophlan
#march 10, 2023: install a threshold for reads. There is noise in taxa.
#only include taxa for which there is a 1%< occurence
#this isn't the final design. this is just to assemble the pieces to test the efficiency of the new GA lib-maker
#new thing: copies all files, and their index files.
#there are a handful of exceptions
#actually, this file should never be used. we shouldn't copy. this should be code inside the pipe to just generate commands using the names_list

#note: only the class-level split has a size problem. everyone else is under 4.5GB



import os
import sys
import time
from datetime import datetime as dt
import shutil as sh

def import_nodes(nodes_file):
nodes_dict = dict()
with open(nodes_file, "r") as nodes_in:
for line in nodes_in:
cleaned_line = line.strip("\n").split("\t")
taxid = cleaned_line[0]
rank = cleaned_line[4]
nodes_dict[taxid] = rank

return nodes_dict

def import_wevote(wevote_file, existence_percent):
#filter the taxa. only include taxa that reps 1% or more
read_counter = 0
taxa_tally_dict = dict()
unique_taxa = set()
tally_dict = dict()

#read the file
with open(wevote_file, "r") as wevote_in:
for line in wevote_in:
read_counter += 1
cleaned_line = line.strip("\n").split("\t")
final_taxa = cleaned_line[-1]
#unique_taxa.add(final_taxa)
if(final_taxa in taxa_tally_dict):
taxa_tally_dict[final_taxa] += 1
def import_lib_names(names_file):
files_list = []
with open(names_file, "r") as names_in:
for line in names_in:
if("can't" in line):
continue
else:
taxa_tally_dict[final_taxa] = 1


#figure out percentages
for taxa in taxa_tally_dict:
rep_val = taxa_tally_dict[taxa] * 100/ read_counter
if(rep_val >= existence_percent):
unique_taxa.add(taxa)
tally_dict[taxa] = rep_val
return unique_taxa, tally_dict

def import_taxa_class_map(taxa_class_map_path):
taxa_class_dict = dict()
cleaned_line = line.strip("\n")

src_path = cleaned_line.split("|")[3]
if(os.path.exists(src_path)):
files_list.append(src_path)
return files_list

with open(taxa_class_map_path, "r") as taxa_class_in:
for line in taxa_class_in:
cleaned_line = line.strip("\n").split("\t")
taxa = cleaned_line[0]
class_level = cleaned_line[1].split("|")
class_taxa = class_level[1]
taxa_class_dict[taxa] = class_taxa
return taxa_class_dict
def copy_all_files(root_name, lib_dir, dest_path):
amb_path = os.path.join(lib_dir, root_name + ".amb")
ann_path = os.path.join(lib_dir, root_name + ".ann")
bwt_path = os.path.join(lib_dir, root_name + ".bwt")
pac_path = os.path.join(lib_dir, root_name + ".pac")
sa_path = os.path.join(lib_dir, root_name + ".sa")
root_path = os.path.join(lib_dir, root_name)

def export_lines(lib_file_path, nodes_dict, class_item, yes_count, no_count):
if("1236" in lib_file_path):
print(lib_file_path)
exist_flag = "no"
if(os.path.exists(lib_file_path)):
exist_flag = "yes"
yes_count += 1
else:
no_count += 1
sh.copy(root_path, dest_path)
sh.copy(amb_path, dest_path)
sh.copy(ann_path, dest_path)
sh.copy(bwt_path, dest_path)
sh.copy(pac_path, dest_path)
sh.copy(sa_path, dest_path)

def copy_fasta(root_name, lib_dir, dest_path):
root_path = os.path.join(lib_dir, root_name)
sh.copy(root_path, dest_path)

out_file.write(exist_flag +"|"+nodes_dict[class_item] + "|" + class_item + ".fasta" + "|" + lib_file_path + "\n")




if __name__ == "__main__":
wevote_file_path = sys.argv[1]
taxa_class_map_path = sys.argv[2]
nodes_file = sys.argv[3]
export_lib_file = sys.argv[4]
reject_lib_file = sys.argv[5]
lib_root_path = sys.argv[6]
exist_percent = float(sys.argv[7])

unique_taxa, tally_dict = import_wevote(wevote_file_path, exist_percent)
taxa_class_dict = import_taxa_class_map(taxa_class_map_path)
nodes_dict = import_nodes(nodes_file)

for item in tally_dict:
print(item, tally_dict[item])
names_file = sys.argv[1]
lib_dir = sys.argv[2]
names_list = import_lib_names(names_file)
dest_dir = sys.argv[3]
mode = sys.argv[4]

class_set = set()
reject_set = set()
for item in unique_taxa:
try:
class_set.add(taxa_class_dict[item])
except KeyError:
reject_set.add(item)

yes_count = 0
no_count = 0
no_find_count = 0
with open(export_lib_file, "w") as out_file:
for class_taxa_item in sorted([int(i) for i in class_set]):
try:
class_item = str(class_taxa_item)
lib_file_path = os.path.join(lib_root_path, class_item + ".fasta")
if(class_item == "1236" or class_item == "1760" or class_item == "28211"):
#bypasser for our specific split libs for these taxa
for i in range(0, 3):
lib_file_path = os.path.join(lib_root_path, class_item + "_" + str(i) + ".fasta")
export_lines(lib_file_path, nodes_dict, class_item, yes_count, no_count)

elif(class_item == "91061"):
#bypasser for our specific split libs for these taxa
for i in range(0, 2):
lib_file_path = os.path.join(lib_root_path, class_item + "_" + str(i) + ".fasta")
export_lines(lib_file_path, nodes_dict, class_item, yes_count, no_count)
for item in names_list:

if(item == "1236.fasta" or item == "1760.fasta" or item == "28211.fasta"):
root_name = item.split(".")[0]
for i in range(0, 3):
real_name = root_name + "_" + str(i) + ".fasta"
if(mode == "all"):
copy_all_files(real_name, lib_dir, dest_dir)
else:
export_lines(lib_file_path, nodes_dict, class_item, yes_count, no_count)



except KeyError:
no_find_count += 1
out_file.write("can't find" + "|" + class_item + ".fasta" + "\n")

with open(reject_lib_file, "w") as out_file:
copy_fasta(real_name, lib_dir, dest_dir)

for reject_taxa_item in sorted([int(i) for i in reject_set]):
reject_item = str(reject_taxa_item)
try:
out_file.write(nodes_dict[reject_item] + "|" + reject_item + "\n")
except KeyError:
out_file.write("can't find" + "|" + reject_item + "\n")

print("no:", no_count, "| yes:", yes_count, "| no-find:", no_find_count)
elif(item == "91061.fasta"):
for i in range(0, 2):
root_name = item.split(".")[0]
real_name = root_name + "_" + str(i) + ".fasta"
if(mode == "all"):
copy_all_files(real_name, lib_dir, dest_dir)
else:
copy_fasta(real_name, lib_dir, dest_dir)
else:
if(mode == "all"):
copy_all_files(item, lib_dir, dest_dir)
else:
copy_fasta(item, lib_dir, dest_dir)

0 comments on commit 5ddd9d9

Please sign in to comment.