From 729a9091211bc28aa37c50d298e73549a28bfdc9 Mon Sep 17 00:00:00 2001
From: tahorst <thorst@stanford.edu>
Date: Mon, 7 Jun 2021 15:15:12 -0400
Subject: [PATCH 1/5] Add new charging reaction for Sel

---
 .../ecoli/dataclasses/getter_functions.py     |  9 --------
 .../dataclasses/process/transcription.py      |  8 -------
 .../flat/trna_charging_reactions_added.tsv    |  2 ++
 .../flat/trna_charging_reactions_removed.tsv  | 11 +---------
 reconstruction/ecoli/knowledge_base_raw.py    | 21 +++++++++++++++++++
 5 files changed, 24 insertions(+), 27 deletions(-)
 create mode 100644 reconstruction/ecoli/flat/trna_charging_reactions_added.tsv

diff --git a/reconstruction/ecoli/dataclasses/getter_functions.py b/reconstruction/ecoli/dataclasses/getter_functions.py
index e2ee1f9f3c..cd6cf25b4a 100755
--- a/reconstruction/ecoli/dataclasses/getter_functions.py
+++ b/reconstruction/ecoli/dataclasses/getter_functions.py
@@ -397,17 +397,8 @@ def _build_modified_rna_masses(self, raw_data):
 			modified_rna_id for rna in raw_data.rnas
 			for modified_rna_id in rna['modified_forms']}
 
-		# Get IDs of charging reactions that should be removed
-		removed_charging_reaction_ids = {
-			rxn['id'] for rxn in raw_data.trna_charging_reactions_removed
-			}
-
 		# Loop through each charging reaction
 		for rxn in raw_data.trna_charging_reactions:
-			# Skip removed reactions
-			if rxn['id'] in removed_charging_reaction_ids:
-				continue
-
 			# Find molecule IDs whose masses are still unknown
 			unknown_mol_ids = [
 				mol_id for mol_id in rxn['stoichiometry'].keys()
diff --git a/reconstruction/ecoli/dataclasses/process/transcription.py b/reconstruction/ecoli/dataclasses/process/transcription.py
index c0647b2cf3..7a560402ea 100755
--- a/reconstruction/ecoli/dataclasses/process/transcription.py
+++ b/reconstruction/ecoli/dataclasses/process/transcription.py
@@ -540,19 +540,11 @@ def _build_charged_trna(self, raw_data, sim_data):
 		synthetase_names = []
 		synthetase_mapping_aa = []
 		synthetase_mapping_syn = []
-
-		# Get IDs of charging reactions that should be removed
-		removed_reaction_ids = {
-			rxn['id'] for rxn in raw_data.trna_charging_reactions_removed}
-
 		# Get IDs of all metabolites
 		metabolite_ids = {met['id'] for met in raw_data.metabolites}
 
 		# Create stoichiometry matrix for charging reactions
 		for reaction in raw_data.trna_charging_reactions:
-			if reaction['id'] in removed_reaction_ids:
-				continue
-
 			# Get uncharged tRNA name for the given reaction
 			trna = None
 			for mol_id in reaction['stoichiometry'].keys():
diff --git a/reconstruction/ecoli/flat/trna_charging_reactions_added.tsv b/reconstruction/ecoli/flat/trna_charging_reactions_added.tsv
new file mode 100644
index 0000000000..0d780a2aa4
--- /dev/null
+++ b/reconstruction/ecoli/flat/trna_charging_reactions_added.tsv
@@ -0,0 +1,2 @@
+"id"	"stoichiometry"	"catalyzed_by"	"common_name"	"_notes"
+"SELENOCYSTEINE--TRNA-LIGASE-RXN_selC-tRNA"	{"L-SELENOCYSTEINE":-1,"ATP":-1,"selC-tRNA":-1,"PPI":1,"AMP":1,"charged-selC-tRNA":1}	["CPLX0-1141"]	"SELENOCYSTEIN SYNTHASE"	"Simplified selenocysteine charging from two steps (RXN0-2161 and 2.9.1.1-RXN) into one to match modeling framework of other charging reactions"
diff --git a/reconstruction/ecoli/flat/trna_charging_reactions_removed.tsv b/reconstruction/ecoli/flat/trna_charging_reactions_removed.tsv
index f38014a0e8..46f725abe4 100644
--- a/reconstruction/ecoli/flat/trna_charging_reactions_removed.tsv
+++ b/reconstruction/ecoli/flat/trna_charging_reactions_removed.tsv
@@ -1,11 +1,2 @@
 "id"	"_comments"
-"GLUTRNAREDUCT-RXN"	"Charging reaction does not use canonical amino acid (uses GLUTAMATE-1-SEMIALDEHYDE)"
-"GLUTRNAREDUCT-R_1"	"Charging reaction does not use canonical amino acid (uses GLUTAMATE-1-SEMIALDEHYDE)"
-"GLUTRNAREDUCT-R_0"	"Charging reaction does not use canonical amino acid (uses GLUTAMATE-1-SEMIALDEHYDE)"
-"GLUTRNAREDUCT-R_2"	"Charging reaction does not use canonical amino acid (uses GLUTAMATE-1-SEMIALDEHYDE)"
-"RX_WC_METHIONYL-TRNA-FORMYLTRANSFERASE-RXN1"	"Removed to allow only one charged tRNA per uncharged tRNA"
-"RX_WC_METHIONYL-TRNA-FORMYLTRANSFERASE-RXN2"	"Removed to allow only one charged tRNA per uncharged tRNA"
-"RX_WC_METHIONYL-TRNA-FORMYLTRANSFERASE-RXN3"	"Removed to allow only one charged tRNA per uncharged tRNA"
-"RX_WC_METHIONYL-TRNA-FORMYLTRANSFERASE-RXN4"	"Removed to allow only one charged tRNA per uncharged tRNA"
-"RXN0-6434"	"Reaction does not have both an uncharged and charged tRNA"
-"2.9.1.1-R_0"	"Reaction does not have both an uncharged and charged tRNA"
+"RXN0-2161_selC-tRNA"	"Ignored because two step Sel charging (Ser charging, Ser -> Sel) does not fit with modeling framework of other charging reactions. Replaced by SELENOCYSTEINE--TRNA-LIGASE-RXN_selC-tRNA."
diff --git a/reconstruction/ecoli/knowledge_base_raw.py b/reconstruction/ecoli/knowledge_base_raw.py
index 6564e9264a..188a1a94b8 100644
--- a/reconstruction/ecoli/knowledge_base_raw.py
+++ b/reconstruction/ecoli/knowledge_base_raw.py
@@ -58,6 +58,7 @@
 	"tf_one_component_bound.tsv",
 	"translation_efficiency.tsv",
 	"trna_charging_reactions.tsv",
+	"trna_charging_reactions_added.tsv",
 	"trna_charging_reactions_removed.tsv",
 	"two_component_systems.tsv",
 	"two_component_system_templates.tsv",
@@ -108,6 +109,11 @@
 # TODO: add other removed files here and not handle removing in scripts
 REMOVED_DATA = {
 	'metabolite_concentrations': 'metabolite_concentrations_removed',
+	'trna_charging_reactions': 'trna_charging_reactions_removed',
+	}
+# TODO: move added rows from some flat files to new files and add here
+ADDED_DATA = {
+	'trna_charging_reactions': 'trna_charging_reactions_added',
 	}
 
 class DataStore(object):
@@ -128,6 +134,7 @@ def __init__(self):
 			self._load_parameters(os.path.join(FLAT_DIR, filename))
 
 		self._prune_data()
+		self._join_data()
 
 		self.genome_sequence = self._load_sequence(os.path.join(FLAT_DIR, SEQUENCE_FILE))
 
@@ -193,3 +200,17 @@ def _prune_data(self):
 				checked_id = tuple([row[col] for col in removed_cols])
 				if checked_id in removed_ids:
 					attr_data.pop(n_entries - i - 1)
+
+	def _join_data(self):
+		"""
+		Add rows that are specified in additional files.
+
+		TODO: add check that columns match up
+		"""
+
+		# Join data for each file with data to be added
+		for data_attr, attr_to_add in ADDED_DATA.items():
+			data = getattr(self, data_attr)
+			added_data = getattr(self, attr_to_add)
+			for row in added_data:
+				data.append(row)

From 52e185e21fb85eb4f54ca35d4738c5105919bb7e Mon Sep 17 00:00:00 2001
From: tahorst <thorst@stanford.edu>
Date: Mon, 7 Jun 2021 15:19:57 -0400
Subject: [PATCH 2/5] Update variable names for clarity and consistency

---
 reconstruction/ecoli/knowledge_base_raw.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/reconstruction/ecoli/knowledge_base_raw.py b/reconstruction/ecoli/knowledge_base_raw.py
index 188a1a94b8..89ef0e5cef 100644
--- a/reconstruction/ecoli/knowledge_base_raw.py
+++ b/reconstruction/ecoli/knowledge_base_raw.py
@@ -185,21 +185,21 @@ def _prune_data(self):
 		"""
 
 		# Check each pair of files to be removed
-		for data, to_remove in REMOVED_DATA.items():
+		for data_attr, attr_to_remove in REMOVED_DATA.items():
 			# Build the set of data to identify rows to be removed
-			attr_removed = getattr(self, to_remove)
-			removed_cols = list(attr_removed[0].keys())
+			data_to_remove = getattr(self, attr_to_remove)
+			removed_cols = list(data_to_remove[0].keys())
 			removed_ids = set()
-			for row in attr_removed:
+			for row in data_to_remove:
 				removed_ids.add(tuple([row[col] for col in removed_cols]))
 
 			# Remove any matching rows
-			attr_data = getattr(self, data)
-			n_entries = len(attr_data)
-			for i, row in enumerate(attr_data[::-1]):
+			data = getattr(self, data_attr)
+			n_entries = len(data)
+			for i, row in enumerate(data[::-1]):
 				checked_id = tuple([row[col] for col in removed_cols])
 				if checked_id in removed_ids:
-					attr_data.pop(n_entries - i - 1)
+					data.pop(n_entries - i - 1)
 
 	def _join_data(self):
 		"""

From 121b65ece63430fea29e7dd0b9aa59fbcb5f8ba3 Mon Sep 17 00:00:00 2001
From: tahorst <thorst@stanford.edu>
Date: Mon, 7 Jun 2021 15:25:44 -0400
Subject: [PATCH 3/5] Check joined flat file columns match

---
 reconstruction/ecoli/knowledge_base_raw.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/reconstruction/ecoli/knowledge_base_raw.py b/reconstruction/ecoli/knowledge_base_raw.py
index 89ef0e5cef..a26db0c8af 100644
--- a/reconstruction/ecoli/knowledge_base_raw.py
+++ b/reconstruction/ecoli/knowledge_base_raw.py
@@ -203,14 +203,22 @@ def _prune_data(self):
 
 	def _join_data(self):
 		"""
-		Add rows that are specified in additional files.
-
-		TODO: add check that columns match up
+		Add rows that are specified in additional files. Raises a ValueError
+		if the loaded columns of the two datasets do not match.
 		"""
 
 		# Join data for each file with data to be added
 		for data_attr, attr_to_add in ADDED_DATA.items():
+			# Get datasets to join
 			data = getattr(self, data_attr)
 			added_data = getattr(self, attr_to_add)
+
+			# Check columns are the same for each dataset
+			col_diff = set(data[0].keys()).symmetric_difference(added_data[0].keys())
+			if col_diff:
+				raise ValueError(f'Could not join datasets {data_attr} and {attr_to_add} '
+					f'because columns do not match (different columns: {col_diff}).')
+
+			# Join datasets
 			for row in added_data:
 				data.append(row)

From 3ed7cf56e65467ea3d12a5e039f5bed65e5cc4b9 Mon Sep 17 00:00:00 2001
From: tahorst <thorst@stanford.edu>
Date: Mon, 7 Jun 2021 15:29:54 -0400
Subject: [PATCH 4/5] Remove empty field name columns

---
 reconstruction/spreadsheets.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/reconstruction/spreadsheets.py b/reconstruction/spreadsheets.py
index 62a18fadb0..4fde162eed 100755
--- a/reconstruction/spreadsheets.py
+++ b/reconstruction/spreadsheets.py
@@ -140,10 +140,10 @@ def __init__(self, *args, **kwargs):
 		fieldnames = [fieldname.strip('"') for fieldname in fieldnames]
 		self.tsv_dict_reader.fieldnames = fieldnames
 
-		# Discard private field names that begin with underscore
+		# Discard private field names that begin with underscore and empty filed names
 		self._fieldnames = [
 			fieldname for fieldname in fieldnames
-			if not fieldname.startswith('_')]
+			if not fieldname.startswith('_') and fieldname != '']
 
 	def __iter__(self):
 		return self

From 05cced8dc4ba08e4a3a8fa960e108699fdf2dcfe Mon Sep 17 00:00:00 2001
From: Gwanggyu Sun <32276711+ggsun@users.noreply.github.com>
Date: Tue, 8 Jun 2021 16:25:51 -0700
Subject: [PATCH 5/5] Fix comment typo in spreadsheets.py

---
 reconstruction/spreadsheets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/reconstruction/spreadsheets.py b/reconstruction/spreadsheets.py
index 4fde162eed..177edf05bd 100755
--- a/reconstruction/spreadsheets.py
+++ b/reconstruction/spreadsheets.py
@@ -140,7 +140,7 @@ def __init__(self, *args, **kwargs):
 		fieldnames = [fieldname.strip('"') for fieldname in fieldnames]
 		self.tsv_dict_reader.fieldnames = fieldnames
 
-		# Discard private field names that begin with underscore and empty filed names
+		# Discard private field names that begin with underscore and empty field names
 		self._fieldnames = [
 			fieldname for fieldname in fieldnames
 			if not fieldname.startswith('_') and fieldname != '']