From 729a9091211bc28aa37c50d298e73549a28bfdc9 Mon Sep 17 00:00:00 2001 From: tahorst Date: Mon, 7 Jun 2021 15:15:12 -0400 Subject: [PATCH 1/5] Add new charging reaction for Sel --- .../ecoli/dataclasses/getter_functions.py | 9 -------- .../dataclasses/process/transcription.py | 8 ------- .../flat/trna_charging_reactions_added.tsv | 2 ++ .../flat/trna_charging_reactions_removed.tsv | 11 +--------- reconstruction/ecoli/knowledge_base_raw.py | 21 +++++++++++++++++++ 5 files changed, 24 insertions(+), 27 deletions(-) create mode 100644 reconstruction/ecoli/flat/trna_charging_reactions_added.tsv diff --git a/reconstruction/ecoli/dataclasses/getter_functions.py b/reconstruction/ecoli/dataclasses/getter_functions.py index e2ee1f9f3c..cd6cf25b4a 100755 --- a/reconstruction/ecoli/dataclasses/getter_functions.py +++ b/reconstruction/ecoli/dataclasses/getter_functions.py @@ -397,17 +397,8 @@ def _build_modified_rna_masses(self, raw_data): modified_rna_id for rna in raw_data.rnas for modified_rna_id in rna['modified_forms']} - # Get IDs of charging reactions that should be removed - removed_charging_reaction_ids = { - rxn['id'] for rxn in raw_data.trna_charging_reactions_removed - } - # Loop through each charging reaction for rxn in raw_data.trna_charging_reactions: - # Skip removed reactions - if rxn['id'] in removed_charging_reaction_ids: - continue - # Find molecule IDs whose masses are still unknown unknown_mol_ids = [ mol_id for mol_id in rxn['stoichiometry'].keys() diff --git a/reconstruction/ecoli/dataclasses/process/transcription.py b/reconstruction/ecoli/dataclasses/process/transcription.py index c0647b2cf3..7a560402ea 100755 --- a/reconstruction/ecoli/dataclasses/process/transcription.py +++ b/reconstruction/ecoli/dataclasses/process/transcription.py @@ -540,19 +540,11 @@ def _build_charged_trna(self, raw_data, sim_data): synthetase_names = [] synthetase_mapping_aa = [] synthetase_mapping_syn = [] - - # Get IDs of charging reactions that should be removed - removed_reaction_ids = { - rxn['id'] for rxn in raw_data.trna_charging_reactions_removed} - # Get IDs of all metabolites metabolite_ids = {met['id'] for met in raw_data.metabolites} # Create stoichiometry matrix for charging reactions for reaction in raw_data.trna_charging_reactions: - if reaction['id'] in removed_reaction_ids: - continue - # Get uncharged tRNA name for the given reaction trna = None for mol_id in reaction['stoichiometry'].keys(): diff --git a/reconstruction/ecoli/flat/trna_charging_reactions_added.tsv b/reconstruction/ecoli/flat/trna_charging_reactions_added.tsv new file mode 100644 index 0000000000..0d780a2aa4 --- /dev/null +++ b/reconstruction/ecoli/flat/trna_charging_reactions_added.tsv @@ -0,0 +1,2 @@ +"id" "stoichiometry" "catalyzed_by" "common_name" "_notes" +"SELENOCYSTEINE--TRNA-LIGASE-RXN_selC-tRNA" {"L-SELENOCYSTEINE":-1,"ATP":-1,"selC-tRNA":-1,"PPI":1,"AMP":1,"charged-selC-tRNA":1} ["CPLX0-1141"] "SELENOCYSTEIN SYNTHASE" "Simplified selenocysteine charging from two steps (RXN0-2161 and 2.9.1.1-RXN) into one to match modeling framework of other charging reactions" diff --git a/reconstruction/ecoli/flat/trna_charging_reactions_removed.tsv b/reconstruction/ecoli/flat/trna_charging_reactions_removed.tsv index f38014a0e8..46f725abe4 100644 --- a/reconstruction/ecoli/flat/trna_charging_reactions_removed.tsv +++ b/reconstruction/ecoli/flat/trna_charging_reactions_removed.tsv @@ -1,11 +1,2 @@ "id" "_comments" -"GLUTRNAREDUCT-RXN" "Charging reaction does not use canonical amino acid (uses GLUTAMATE-1-SEMIALDEHYDE)" -"GLUTRNAREDUCT-R_1" "Charging reaction does not use canonical amino acid (uses GLUTAMATE-1-SEMIALDEHYDE)" -"GLUTRNAREDUCT-R_0" "Charging reaction does not use canonical amino acid (uses GLUTAMATE-1-SEMIALDEHYDE)" -"GLUTRNAREDUCT-R_2" "Charging reaction does not use canonical amino acid (uses GLUTAMATE-1-SEMIALDEHYDE)" -"RX_WC_METHIONYL-TRNA-FORMYLTRANSFERASE-RXN1" "Removed to allow only one charged tRNA per uncharged tRNA" -"RX_WC_METHIONYL-TRNA-FORMYLTRANSFERASE-RXN2" "Removed to allow only one charged tRNA per uncharged tRNA" -"RX_WC_METHIONYL-TRNA-FORMYLTRANSFERASE-RXN3" "Removed to allow only one charged tRNA per uncharged tRNA" -"RX_WC_METHIONYL-TRNA-FORMYLTRANSFERASE-RXN4" "Removed to allow only one charged tRNA per uncharged tRNA" -"RXN0-6434" "Reaction does not have both an uncharged and charged tRNA" -"2.9.1.1-R_0" "Reaction does not have both an uncharged and charged tRNA" +"RXN0-2161_selC-tRNA" "Ignored because two step Sel charging (Ser charging, Ser -> Sel) does not fit with modeling framework of other charging reactions. Replaced by SELENOCYSTEINE--TRNA-LIGASE-RXN_selC-tRNA." diff --git a/reconstruction/ecoli/knowledge_base_raw.py b/reconstruction/ecoli/knowledge_base_raw.py index 6564e9264a..188a1a94b8 100644 --- a/reconstruction/ecoli/knowledge_base_raw.py +++ b/reconstruction/ecoli/knowledge_base_raw.py @@ -58,6 +58,7 @@ "tf_one_component_bound.tsv", "translation_efficiency.tsv", "trna_charging_reactions.tsv", + "trna_charging_reactions_added.tsv", "trna_charging_reactions_removed.tsv", "two_component_systems.tsv", "two_component_system_templates.tsv", @@ -108,6 +109,11 @@ # TODO: add other removed files here and not handle removing in scripts REMOVED_DATA = { 'metabolite_concentrations': 'metabolite_concentrations_removed', + 'trna_charging_reactions': 'trna_charging_reactions_removed', + } +# TODO: move added rows from some flat files to new files and add here +ADDED_DATA = { + 'trna_charging_reactions': 'trna_charging_reactions_added', } class DataStore(object): @@ -128,6 +134,7 @@ def __init__(self): self._load_parameters(os.path.join(FLAT_DIR, filename)) self._prune_data() + self._join_data() self.genome_sequence = self._load_sequence(os.path.join(FLAT_DIR, SEQUENCE_FILE)) @@ -193,3 +200,17 @@ def _prune_data(self): checked_id = tuple([row[col] for col in removed_cols]) if checked_id in removed_ids: attr_data.pop(n_entries - i - 1) + + def _join_data(self): + """ + Add rows that are specified in additional files. + + TODO: add check that columns match up + """ + + # Join data for each file with data to be added + for data_attr, attr_to_add in ADDED_DATA.items(): + data = getattr(self, data_attr) + added_data = getattr(self, attr_to_add) + for row in added_data: + data.append(row) From 52e185e21fb85eb4f54ca35d4738c5105919bb7e Mon Sep 17 00:00:00 2001 From: tahorst Date: Mon, 7 Jun 2021 15:19:57 -0400 Subject: [PATCH 2/5] Update variable names for clarity and consistency --- reconstruction/ecoli/knowledge_base_raw.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/reconstruction/ecoli/knowledge_base_raw.py b/reconstruction/ecoli/knowledge_base_raw.py index 188a1a94b8..89ef0e5cef 100644 --- a/reconstruction/ecoli/knowledge_base_raw.py +++ b/reconstruction/ecoli/knowledge_base_raw.py @@ -185,21 +185,21 @@ def _prune_data(self): """ # Check each pair of files to be removed - for data, to_remove in REMOVED_DATA.items(): + for data_attr, attr_to_remove in REMOVED_DATA.items(): # Build the set of data to identify rows to be removed - attr_removed = getattr(self, to_remove) - removed_cols = list(attr_removed[0].keys()) + data_to_remove = getattr(self, attr_to_remove) + removed_cols = list(data_to_remove[0].keys()) removed_ids = set() - for row in attr_removed: + for row in data_to_remove: removed_ids.add(tuple([row[col] for col in removed_cols])) # Remove any matching rows - attr_data = getattr(self, data) - n_entries = len(attr_data) - for i, row in enumerate(attr_data[::-1]): + data = getattr(self, data_attr) + n_entries = len(data) + for i, row in enumerate(data[::-1]): checked_id = tuple([row[col] for col in removed_cols]) if checked_id in removed_ids: - attr_data.pop(n_entries - i - 1) + data.pop(n_entries - i - 1) def _join_data(self): """ From 121b65ece63430fea29e7dd0b9aa59fbcb5f8ba3 Mon Sep 17 00:00:00 2001 From: tahorst Date: Mon, 7 Jun 2021 15:25:44 -0400 Subject: [PATCH 3/5] Check joined flat file columns match --- reconstruction/ecoli/knowledge_base_raw.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/reconstruction/ecoli/knowledge_base_raw.py b/reconstruction/ecoli/knowledge_base_raw.py index 89ef0e5cef..a26db0c8af 100644 --- a/reconstruction/ecoli/knowledge_base_raw.py +++ b/reconstruction/ecoli/knowledge_base_raw.py @@ -203,14 +203,22 @@ def _prune_data(self): def _join_data(self): """ - Add rows that are specified in additional files. - - TODO: add check that columns match up + Add rows that are specified in additional files. Data will only be added + if all the loaded columns from both datasets match. """ # Join data for each file with data to be added for data_attr, attr_to_add in ADDED_DATA.items(): + # Get datasets to join data = getattr(self, data_attr) added_data = getattr(self, attr_to_add) + + # Check columns are the same for each dataset + col_diff = set(data[0].keys()).symmetric_difference(added_data[0].keys()) + if col_diff: + raise ValueError(f'Could not join datasets {data_attr} and {attr_to_add} ' + f'because columns do not match (different columns: {col_diff}).') + + # Join datasets for row in added_data: data.append(row) From 3ed7cf56e65467ea3d12a5e039f5bed65e5cc4b9 Mon Sep 17 00:00:00 2001 From: tahorst Date: Mon, 7 Jun 2021 15:29:54 -0400 Subject: [PATCH 4/5] Remove empty field name columns --- reconstruction/spreadsheets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reconstruction/spreadsheets.py b/reconstruction/spreadsheets.py index 62a18fadb0..4fde162eed 100755 --- a/reconstruction/spreadsheets.py +++ b/reconstruction/spreadsheets.py @@ -140,10 +140,10 @@ def __init__(self, *args, **kwargs): fieldnames = [fieldname.strip('"') for fieldname in fieldnames] self.tsv_dict_reader.fieldnames = fieldnames - # Discard private field names that begin with underscore + # Discard private field names that begin with underscore and empty filed names self._fieldnames = [ fieldname for fieldname in fieldnames - if not fieldname.startswith('_')] + if not fieldname.startswith('_') and fieldname != ''] def __iter__(self): return self From 05cced8dc4ba08e4a3a8fa960e108699fdf2dcfe Mon Sep 17 00:00:00 2001 From: Gwanggyu Sun <32276711+ggsun@users.noreply.github.com> Date: Tue, 8 Jun 2021 16:25:51 -0700 Subject: [PATCH 5/5] Fix docstring typo in spreadsheets.py --- reconstruction/spreadsheets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reconstruction/spreadsheets.py b/reconstruction/spreadsheets.py index 4fde162eed..177edf05bd 100755 --- a/reconstruction/spreadsheets.py +++ b/reconstruction/spreadsheets.py @@ -140,7 +140,7 @@ def __init__(self, *args, **kwargs): fieldnames = [fieldname.strip('"') for fieldname in fieldnames] self.tsv_dict_reader.fieldnames = fieldnames - # Discard private field names that begin with underscore and empty filed names + # Discard private field names that begin with underscore and empty field names self._fieldnames = [ fieldname for fieldname in fieldnames if not fieldname.startswith('_') and fieldname != '']