From 5ddd9d95860ab7bbd716ac331bf69be47bd17e8c Mon Sep 17 00:00:00 2001
From: Billy Taj <billyc59@gmail.com>
Date: Mon, 17 Apr 2023 16:16:15 -0400
Subject: [PATCH] bug: wrong copy of assemble_libs uploaded

correct code added
---
 Scripts/ga_pre_scan_assemble_libs.py | 186 ++++++++++-----------------
 1 file changed, 66 insertions(+), 120 deletions(-)

diff --git a/Scripts/ga_pre_scan_assemble_libs.py b/Scripts/ga_pre_scan_assemble_libs.py
index 99c9141..18c4e16 100644
--- a/Scripts/ga_pre_scan_assemble_libs.py
+++ b/Scripts/ga_pre_scan_assemble_libs.py
@@ -1,137 +1,83 @@
-#parse the wevote results. get the taxa. get the class. pull the libs from our chocophlan
-#march 10, 2023:  install a threshold for reads.  There is noise in taxa.
-#only include taxa for which there is a 1%< occurence
+# This isn't the final design; it only assembles the pieces needed to test the efficiency of the new GA lib-maker.
+# New: copies every listed library file and, in "all" mode, its index files (.amb, .ann, .bwt, .pac, .sa).
+# There are a handful of exceptions: 1236, 1760 and 28211 are split into 3 parts each, and 91061 into 2.
+# Actually, this file should never be used: we shouldn't copy. This logic belongs inside the pipeline, which should just generate commands from names_list.
+
+# Note: only the class-level split has a size problem; everything else is under 4.5 GB.
+
+
 
 import os
 import sys
 import time 
 from datetime import datetime as dt
+import shutil as sh
 
-def import_nodes(nodes_file):
-    nodes_dict = dict()
-    with open(nodes_file, "r") as nodes_in:
-        for line in nodes_in:
-            cleaned_line = line.strip("\n").split("\t")
-            taxid = cleaned_line[0]
-            rank = cleaned_line[4]
-            nodes_dict[taxid] = rank
-            
-    return nodes_dict
-
-def import_wevote(wevote_file, existence_percent):
-    #filter the taxa. only include taxa that reps 1% or more
-    read_counter = 0
-    taxa_tally_dict = dict()
-    unique_taxa = set()
-    tally_dict = dict()
-    
-    #read the file
-    with open(wevote_file, "r") as wevote_in:
-        for line in wevote_in:
-            read_counter += 1
-            cleaned_line = line.strip("\n").split("\t")
-            final_taxa = cleaned_line[-1]
-            #unique_taxa.add(final_taxa)
-            if(final_taxa in taxa_tally_dict):
-                taxa_tally_dict[final_taxa] += 1
+def import_lib_names(names_file):
+    files_list = []
+    with open(names_file, "r") as names_in:
+        for line in names_in:
+            if("can't" in line):
+                continue
             else:
-                taxa_tally_dict[final_taxa] = 1
                 
-
-    #figure out percentages
-    for taxa in taxa_tally_dict:
-        rep_val = taxa_tally_dict[taxa] * 100/ read_counter
-        if(rep_val >= existence_percent):
-            unique_taxa.add(taxa)
-            tally_dict[taxa] = rep_val
-    return unique_taxa, tally_dict
-    
-def import_taxa_class_map(taxa_class_map_path):
-    taxa_class_dict = dict()
+                cleaned_line = line.strip("\n")
+                
+                src_path = cleaned_line.split("|")[3]
+                if(os.path.exists(src_path)):
+                    files_list.append(src_path)
+    return files_list
     
-    with open(taxa_class_map_path, "r") as taxa_class_in:
-        for line in taxa_class_in:
-            cleaned_line = line.strip("\n").split("\t")
-            taxa = cleaned_line[0]
-            class_level = cleaned_line[1].split("|")
-            class_taxa = class_level[1]
-            taxa_class_dict[taxa] = class_taxa
-    return taxa_class_dict
+def copy_all_files(root_name, lib_dir, dest_path):
+    amb_path = os.path.join(lib_dir, root_name + ".amb")
+    ann_path = os.path.join(lib_dir, root_name + ".ann")
+    bwt_path = os.path.join(lib_dir, root_name + ".bwt")
+    pac_path = os.path.join(lib_dir, root_name + ".pac")
+    sa_path = os.path.join(lib_dir, root_name + ".sa")
+    root_path = os.path.join(lib_dir, root_name)
 
-def export_lines(lib_file_path, nodes_dict, class_item, yes_count, no_count):
-    if("1236" in lib_file_path):
-        print(lib_file_path)
-    exist_flag = "no"
-    if(os.path.exists(lib_file_path)):
-        exist_flag = "yes"
-        yes_count += 1
-    else:
-        no_count += 1
+    sh.copy(root_path, dest_path)
+    sh.copy(amb_path, dest_path)
+    sh.copy(ann_path, dest_path)
+    sh.copy(bwt_path, dest_path)
+    sh.copy(pac_path, dest_path)
+    sh.copy(sa_path, dest_path)
+    
+def copy_fasta(root_name, lib_dir, dest_path):
+    root_path = os.path.join(lib_dir, root_name)
+    sh.copy(root_path, dest_path)
         
-    out_file.write(exist_flag +"|"+nodes_dict[class_item] + "|" +  class_item + ".fasta" + "|" + lib_file_path + "\n")
-
-
+    
 
 if __name__ == "__main__":
-    wevote_file_path = sys.argv[1]
-    taxa_class_map_path = sys.argv[2]
-    nodes_file = sys.argv[3]
-    export_lib_file = sys.argv[4]
-    reject_lib_file = sys.argv[5]
-    lib_root_path = sys.argv[6]
-    exist_percent = float(sys.argv[7])
-    
-    unique_taxa, tally_dict = import_wevote(wevote_file_path, exist_percent)
-    taxa_class_dict = import_taxa_class_map(taxa_class_map_path)
-    nodes_dict = import_nodes(nodes_file)
-    
-    for item in tally_dict:
-        print(item, tally_dict[item])
+    names_file = sys.argv[1]
+    lib_dir = sys.argv[2]
+    names_list = import_lib_names(names_file)
+    dest_dir = sys.argv[3]
+    mode = sys.argv[4]
     
-    class_set = set()
-    reject_set = set()
-    for item in unique_taxa:
-        try:
-            class_set.add(taxa_class_dict[item])
-        except KeyError:
-            reject_set.add(item)
-            
-    yes_count = 0
-    no_count = 0
-    no_find_count = 0
-    with open(export_lib_file, "w") as out_file:
-        for class_taxa_item in sorted([int(i) for i in class_set]):
-            try:
-                class_item = str(class_taxa_item)
-                lib_file_path = os.path.join(lib_root_path, class_item + ".fasta")
-                if(class_item == "1236" or class_item == "1760" or class_item == "28211"):
-                    #bypasser for our specific split libs for these taxa
-                    for i in range(0, 3):
-                        lib_file_path = os.path.join(lib_root_path, class_item + "_" + str(i) + ".fasta")
-                        export_lines(lib_file_path, nodes_dict, class_item, yes_count, no_count)
-                        
-                elif(class_item == "91061"):
-                    #bypasser for our specific split libs for these taxa
-                    for i in range(0, 2):
-                        lib_file_path = os.path.join(lib_root_path, class_item + "_" + str(i) + ".fasta")
-                        export_lines(lib_file_path, nodes_dict, class_item, yes_count, no_count)
+    for item in names_list:
+        
+        if(item == "1236.fasta" or item  == "1760.fasta" or item == "28211.fasta"):
+            root_name = item.split(".")[0]
+            for i in range(0, 3):
+                real_name = root_name + "_" + str(i) + ".fasta"
+                if(mode == "all"):
+                    copy_all_files(real_name, lib_dir, dest_dir)
                 else:
-                    export_lines(lib_file_path, nodes_dict, class_item, yes_count, no_count)
-                        
-                
-                
-            except KeyError:
-                no_find_count += 1
-                out_file.write("can't find" + "|" + class_item + ".fasta" + "\n")
-    
-    with open(reject_lib_file, "w") as out_file:
+                    copy_fasta(real_name, lib_dir, dest_dir)
         
-        for reject_taxa_item in sorted([int(i) for i in reject_set]):
-            reject_item = str(reject_taxa_item)
-            try:
-                out_file.write(nodes_dict[reject_item] + "|" + reject_item + "\n")
-            except KeyError:
-                out_file.write("can't find" + "|" + reject_item + "\n")
-                
-    print("no:", no_count, "| yes:", yes_count, "| no-find:", no_find_count)
+        elif(item == "91061.fasta"):
+            for i in range(0, 2):
+                root_name = item.split(".")[0]
+                real_name = root_name + "_" + str(i) + ".fasta"
+                if(mode == "all"):
+                    copy_all_files(real_name, lib_dir, dest_dir)
+                else:
+                    copy_fasta(real_name, lib_dir, dest_dir)
+        else:
+            if(mode == "all"):
+                copy_all_files(item, lib_dir, dest_dir)
+            else:
+                copy_fasta(item, lib_dir, dest_dir)
     
\ No newline at end of file