From 5ddd9d95860ab7bbd716ac331bf69be47bd17e8c Mon Sep 17 00:00:00 2001 From: Billy Taj Date: Mon, 17 Apr 2023 16:16:15 -0400 Subject: [PATCH] bug: wrong copy of assemble_libs uploaded correct code added --- Scripts/ga_pre_scan_assemble_libs.py | 186 ++++++++++----------------- 1 file changed, 66 insertions(+), 120 deletions(-) diff --git a/Scripts/ga_pre_scan_assemble_libs.py b/Scripts/ga_pre_scan_assemble_libs.py index 99c9141..18c4e16 100644 --- a/Scripts/ga_pre_scan_assemble_libs.py +++ b/Scripts/ga_pre_scan_assemble_libs.py @@ -1,137 +1,83 @@ -#parse the wevote results. get the taxa. get the class. pull the libs from our chocophlan -#march 10, 2023: install a threshold for reads. There is noise in taxa. -#only include taxa for which there is a 1%< occurence +#this isn't the final design. this is just to assemble the pieces to test the efficiency of the new GA lib-maker +#new thing: copies all files, and their index files. +#there are a handful of exceptions +#actually, this file should never be used. we shouldn't copy. this should be code inside the pipe to just generate commands using the names_list + +#note: only the class-level split has a size problem. everyone else is under 4.5GB + + import os import sys import time from datetime import datetime as dt +import shutil as sh -def import_nodes(nodes_file): - nodes_dict = dict() - with open(nodes_file, "r") as nodes_in: - for line in nodes_in: - cleaned_line = line.strip("\n").split("\t") - taxid = cleaned_line[0] - rank = cleaned_line[4] - nodes_dict[taxid] = rank - - return nodes_dict - -def import_wevote(wevote_file, existence_percent): - #filter the taxa. only include taxa that reps 1% or more - read_counter = 0 - taxa_tally_dict = dict() - unique_taxa = set() - tally_dict = dict() - - #read the file - with open(wevote_file, "r") as wevote_in: - for line in wevote_in: - read_counter += 1 - cleaned_line = line.strip("\n").split("\t") - final_taxa = cleaned_line[-1] - #unique_taxa.add(final_taxa) - if(final_taxa in taxa_tally_dict): - taxa_tally_dict[final_taxa] += 1 +def import_lib_names(names_file): + files_list = [] + with open(names_file, "r") as names_in: + for line in names_in: + if("can't" in line): + continue else: - taxa_tally_dict[final_taxa] = 1 - - #figure out percentages - for taxa in taxa_tally_dict: - rep_val = taxa_tally_dict[taxa] * 100/ read_counter - if(rep_val >= existence_percent): - unique_taxa.add(taxa) - tally_dict[taxa] = rep_val - return unique_taxa, tally_dict - -def import_taxa_class_map(taxa_class_map_path): - taxa_class_dict = dict() + cleaned_line = line.strip("\n") + + src_path = cleaned_line.split("|")[3] + if(os.path.exists(src_path)): + files_list.append(src_path) + return files_list - with open(taxa_class_map_path, "r") as taxa_class_in: - for line in taxa_class_in: - cleaned_line = line.strip("\n").split("\t") - taxa = cleaned_line[0] - class_level = cleaned_line[1].split("|") - class_taxa = class_level[1] - taxa_class_dict[taxa] = class_taxa - return taxa_class_dict +def copy_all_files(root_name, lib_dir, dest_path): + amb_path = os.path.join(lib_dir, root_name + ".amb") + ann_path = os.path.join(lib_dir, root_name + ".ann") + bwt_path = os.path.join(lib_dir, root_name + ".bwt") + pac_path = os.path.join(lib_dir, root_name + ".pac") + sa_path = os.path.join(lib_dir, root_name + ".sa") + root_path = os.path.join(lib_dir, root_name) -def export_lines(lib_file_path, nodes_dict, class_item, yes_count, no_count): - if("1236" in lib_file_path): - print(lib_file_path) - exist_flag = "no" - if(os.path.exists(lib_file_path)): - exist_flag = "yes" - yes_count += 1 - else: - no_count += 1 + sh.copy(root_path, dest_path) + sh.copy(amb_path, dest_path) + sh.copy(ann_path, dest_path) + sh.copy(bwt_path, dest_path) + sh.copy(pac_path, dest_path) + sh.copy(sa_path, dest_path) + +def copy_fasta(root_name, lib_dir, dest_path): + root_path = os.path.join(lib_dir, root_name) + sh.copy(root_path, dest_path) - out_file.write(exist_flag +"|"+nodes_dict[class_item] + "|" + class_item + ".fasta" + "|" + lib_file_path + "\n") - - + if __name__ == "__main__": - wevote_file_path = sys.argv[1] - taxa_class_map_path = sys.argv[2] - nodes_file = sys.argv[3] - export_lib_file = sys.argv[4] - reject_lib_file = sys.argv[5] - lib_root_path = sys.argv[6] - exist_percent = float(sys.argv[7]) - - unique_taxa, tally_dict = import_wevote(wevote_file_path, exist_percent) - taxa_class_dict = import_taxa_class_map(taxa_class_map_path) - nodes_dict = import_nodes(nodes_file) - - for item in tally_dict: - print(item, tally_dict[item]) + names_file = sys.argv[1] + lib_dir = sys.argv[2] + names_list = import_lib_names(names_file) + dest_dir = sys.argv[3] + mode = sys.argv[4] - class_set = set() - reject_set = set() - for item in unique_taxa: - try: - class_set.add(taxa_class_dict[item]) - except KeyError: - reject_set.add(item) - - yes_count = 0 - no_count = 0 - no_find_count = 0 - with open(export_lib_file, "w") as out_file: - for class_taxa_item in sorted([int(i) for i in class_set]): - try: - class_item = str(class_taxa_item) - lib_file_path = os.path.join(lib_root_path, class_item + ".fasta") - if(class_item == "1236" or class_item == "1760" or class_item == "28211"): - #bypasser for our specific split libs for these taxa - for i in range(0, 3): - lib_file_path = os.path.join(lib_root_path, class_item + "_" + str(i) + ".fasta") - export_lines(lib_file_path, nodes_dict, class_item, yes_count, no_count) - - elif(class_item == "91061"): - #bypasser for our specific split libs for these taxa - for i in range(0, 2): - lib_file_path = os.path.join(lib_root_path, class_item + "_" + str(i) + ".fasta") - export_lines(lib_file_path, nodes_dict, class_item, yes_count, no_count) + for item in names_list: + + if(item == "1236.fasta" or item == "1760.fasta" or item == "28211.fasta"): + root_name = item.split(".")[0] + for i in range(0, 3): + real_name = root_name + "_" + str(i) + ".fasta" + if(mode == "all"): + copy_all_files(real_name, lib_dir, dest_dir) else: - export_lines(lib_file_path, nodes_dict, class_item, yes_count, no_count) - - - - except KeyError: - no_find_count += 1 - out_file.write("can't find" + "|" + class_item + ".fasta" + "\n") - - with open(reject_lib_file, "w") as out_file: + copy_fasta(real_name, lib_dir, dest_dir) - for reject_taxa_item in sorted([int(i) for i in reject_set]): - reject_item = str(reject_taxa_item) - try: - out_file.write(nodes_dict[reject_item] + "|" + reject_item + "\n") - except KeyError: - out_file.write("can't find" + "|" + reject_item + "\n") - - print("no:", no_count, "| yes:", yes_count, "| no-find:", no_find_count) + elif(item == "91061.fasta"): + for i in range(0, 2): + root_name = item.split(".")[0] + real_name = root_name + "_" + str(i) + ".fasta" + if(mode == "all"): + copy_all_files(real_name, lib_dir, dest_dir) + else: + copy_fasta(real_name, lib_dir, dest_dir) + else: + if(mode == "all"): + copy_all_files(item, lib_dir, dest_dir) + else: + copy_fasta(item, lib_dir, dest_dir) \ No newline at end of file