Updated code for preparing protein library

salilab · Jun 23, 2024 · d409045 · d409045
1 parent 9553929
commit d409045
Show file tree

Hide file tree

Showing 3 changed files with 53 additions and 12 deletions.
diff --git a/modules/spatiotemporal/README.md b/modules/spatiotemporal/README.md
@@ -282,6 +282,8 @@ expected_subcomplexes - list of all possible subcomplex strings in the model. Sh
 
 nmodels - int, number of models with different protein copy numbers to generate at each time point.
 
+output_dir - string, directory where the output will be written. Empty string assumes the current working directory. (default: '')
+
 template_topology: string, name of the topology file for the complete complex (default: '', no topology files are output)
 
 template_dict: dictionary for connecting the spatiotemporal model to the topology file. The keys (string) are the names of the proteins, defined by the expected_complexes variable. The values (list) are the names of all proteins in the topology file that should have the same copy number as the labeled protein, specifically the "molecule_name." (default: {}, no topology files are output)

diff --git a/modules/spatiotemporal/pyext/src/prepare_protein_library.py b/modules/spatiotemporal/pyext/src/prepare_protein_library.py
@@ -5,8 +5,9 @@
 import itertools
 import pandas as pd
 from IMP.spatiotemporal import composition_scoring
+import os
 
-def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, template_topology='', template_dict={}, match_final_state=True):
+def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels, output_dir='', template_topology='', template_dict={}, match_final_state=True):
     """
         Function that reads in experimental stoichiometry data and calculates which compositions and location
         assignments should be sampled for spatiotemporal modeling, which are saved as config files. Optionally, a PMI
@@ -34,6 +35,8 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels,
            in the model. Should be a list without duplicates of
            all components in the subcomplex configuration files.
         @param nmodels: int, number of models with different protein copy numbers to generate at each time point.
+        @param output_dir: string, directory where the output will be written.
+           Empty string assumes the current working directory. (default: '')
         @param template_topology: string, name of the topology file for the complete complex.
             (default: '', no topology files are output)
         @param template_dict: dictionary for connecting the spatiotemporal model to the topology file.
@@ -50,19 +53,30 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels,
         raise TypeError("times should be of type list")
     if not isinstance(exp_comp_map, dict):
         raise TypeError("times should be of type dict")
+    if not isinstance(expected_subcomplexes, list):
+        raise TypeError("nmodels should be of type list")
     if not isinstance(nmodels, int):
         raise TypeError("nmodels should be of type int")
+    if not isinstance(output_dir, str):
+        raise TypeError("output_dir should be of type str")
     if not isinstance(template_topology, str):
         raise TypeError("template_topology should be of type str")
     if not isinstance(template_dict, dict):
         raise TypeError("template_dict should be of type dict")
     if not isinstance(match_final_state, bool):
         raise TypeError("match_final_state should be of type bool")
+    # make output_dir if necessary
+    if len(output_dir) > 0:
+        if os.path.exists(output_dir):
+            os.chdir(output_dir)
+        else:
+            os.mkdir(output_dir)
+            os.chdir(output_dir)
     # Whether or not topology files should be written
     include_topology = False
     # calculate final copy numbers based on the expected complexes
-    final_CN=np.zeros(len(exp_comp.keys()),dtype=int)
-    for i, key in enumerate(exp_comp.keys()):
+    final_CN=np.zeros(len(exp_comp_map.keys()),dtype=int)
+    for i, key in enumerate(exp_comp_map.keys()):
         for subcomplex in expected_subcomplexes:
             if key in subcomplex:
                 final_CN[i] += 1
@@ -95,9 +109,6 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels,
             for state in all_library:
                 unnormalized_weights.append(composition_scoring.calc_likelihood_state(exp_comp_map,time,state))
             unw = np.array(unnormalized_weights)
-            print(time)
-            print(all_library)
-            print(unw)
             # get top scoring nmodels
             mindx = np.argsort(unw)[0:nmodels]
             # write out library with the top scoring models
@@ -141,13 +152,13 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels,
         # write top "scoring" compositions to file
         oary = np.array(olist, dtype=int)
         header=''
-        for prot_name in exp_comp.keys():
+        for prot_name in exp_comp_map.keys():
             header=header+str(prot_name)+'\t\t\t\t'
-        np.savetxt( time + ".txt", oary,header=header)
+        np.savetxt(time + ".txt", oary, header=header)
 
         # write protein config library to file
         for indx,prot_list in enumerate(state_list):
-            with open(str(indx+1) + "_" + time + ".config", "w") as fh:
+            with open(str(indx + 1) + "_" + time + ".config", "w") as fh:
                 for prot in prot_list:
                     fh.write(prot +"\n")
 
@@ -164,7 +175,7 @@ def prepare_protein_library(times, exp_comp_map, expected_subcomplexes, nmodels,
                     else:
                         raise Exception("Protein " + prot + ' does not exist in template_dict\nClosing...')
                 # open new topology file
-                with open(str(indx+1) + "_" + time + "_topol.txt", "w") as fh:
+                with open(str(indx + 1) + "_" + time + "_topol.txt", "w") as fh:
                     old=open(template_topology,'r')
                     line=old.readline()
                     while line:

diff --git a/modules/spatiotemporal/test/test_make_graph.py b/modules/spatiotemporal/test/test_make_graph.py
@@ -6,6 +6,7 @@
 import IMP.test
 import IMP.spatiotemporal as spatiotemporal
 import IMP.spatiotemporal.graphNode as graphNode
+import IMP.spatiotemporal.prepare_protein_library as prepare_protein_library
 import shutil
 import os
 import sys
@@ -17,14 +18,41 @@ def setup_system():
     Function to set up initial variables
     """
     # Input variables.
-    dict = {'0min': 2, '5min': 3, '10min': 2}
+    time_dict = {'0min': 2, '5min': 3, '10min': 2}
     subcomplexes = ['A1', 'A2', 'B1', 'B2']
     # exp_comp_map is a dictionary that describes protein stoichiometry. The key describes the protein, which should correspond to names within the expected_subcomplexes. For each of these proteins, a csv file should be provided with protein copy number data.
     exp_comp = {'A': 'exp_comp_A.csv', 'B': 'exp_comp_B.csv'}
-    return dict, subcomplexes, exp_comp
+    return time_dict, subcomplexes, exp_comp
 
 class Tests(IMP.test.TestCase):
 
+    def test_prepare_protein_library(self):
+        """
+        Test preparing a protein library for spatiotemporal modeling
+        """
+        # set input dir
+        state_dict, expected_subcomplexes, exp_comp_map = setup_system()
+        with IMP.test.temporary_directory() as tmpdir:
+            input = os.path.join(tmpdir, 'data/')
+            shutil.copytree(self.get_input_file_name('data/'), input)
+            # set output dir
+            output = self.get_tmp_file_name('output')
+            # run code
+            exp_comp_map = {'A': input+'exp_comp_A.csv', 'B': input+'exp_comp_B.csv'}
+            prepare_protein_library.prepare_protein_library(list(state_dict.keys()), exp_comp_map, expected_subcomplexes, 2, output_dir=output)
+            # check copy numbers
+            CN_0min=np.loadtxt(output+'/0min.txt')
+            self.assertAlmostEqual(np.sum(CN_0min[0][:]), 1.0, delta=1e-4)
+            self.assertAlmostEqual(CN_0min[0][0], 1.0, delta=1e-4)
+            # check configuration file
+            check_config=open(output+'/4_0min.config','r')
+            line1=check_config.readline()
+            line2=check_config.readline()
+            check_config.close()
+            self.assertEqual(line1[0:2], 'A1')
+            self.assertEqual(line2[0:2], 'B2')
+
+
     def test_graph_setup(self):
         """
         Test setting up a graph. Tests functionality of graphNode.py