From 61a29bfa77bce9b86c964ed3401ea09e34ac0a33 Mon Sep 17 00:00:00 2001 From: Ben Webb Date: Thu, 21 Sep 2023 10:54:57 -0700 Subject: [PATCH] Squashed 'modules/pmi/' changes from e610c5f057..e469c4f9fe e469c4f9fe Pass UniProt accession through to Molecule 91646dd985 Extract UniProt accessions from FASTA headers 4ece080c1b Add max version to placate CMake git-subtree-dir: modules/pmi git-subtree-split: e469c4f9fe2718fba6497ba53b06e6fa7ff2f341 --- modules/pmi/CMakeLists.txt | 2 +- modules/pmi/pyext/src/macros.py | 10 +++++--- modules/pmi/pyext/src/topology/__init__.py | 30 ++++++++++++++++++---- modules/pmi/test/input/uniprot.fasta | 8 ++++++ modules/pmi/test/medium_test_topology.py | 5 +++- modules/pmi/test/test_sequences.py | 9 +++++++ 6 files changed, 53 insertions(+), 11 deletions(-) create mode 100644 modules/pmi/test/input/uniprot.fasta diff --git a/modules/pmi/CMakeLists.txt b/modules/pmi/CMakeLists.txt index 2060c1ed91..6ed7c36895 100644 --- a/modules/pmi/CMakeLists.txt +++ b/modules/pmi/CMakeLists.txt @@ -1,6 +1,6 @@ # Are we running cmake from this directory (out of tree build) ? if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) - cmake_minimum_required(VERSION 2.8.12) + cmake_minimum_required(VERSION 2.8.12...3.6.0) project(imp_module) if(POLICY CMP0058) diff --git a/modules/pmi/pyext/src/macros.py b/modules/pmi/pyext/src/macros.py index cc106d52f4..85d27a2783 100644 --- a/modules/pmi/pyext/src/macros.py +++ b/modules/pmi/pyext/src/macros.py @@ -683,12 +683,14 @@ def add_state(self, reader, keep_chain_id=False, fasta_name_map=None, fasta_flag = copy[0].fasta_flag if fasta_flag in self._alphabets: alphabet = self._alphabets[fasta_flag] - seq = IMP.pmi.topology.Sequences( - copy[0].fasta_file, fasta_name_map)[copy[0].fasta_id] + seqs = IMP.pmi.topology.Sequences( + copy[0].fasta_file, fasta_name_map) + seq = seqs[copy[0].fasta_id] print("BuildSystem.add_state: molecule %s sequence has " "%s residues" % (molname, len(seq))) - orig_mol = state.create_molecule(molname, seq, chain_id, - alphabet=alphabet) + orig_mol = state.create_molecule( + molname, seq, chain_id, alphabet=alphabet, + uniprot=seqs.uniprot.get(copy[0].fasta_id)) mol = orig_mol numchain += 1 else: diff --git a/modules/pmi/pyext/src/topology/__init__.py b/modules/pmi/pyext/src/topology/__init__.py index 457dffa489..a67448ca9a 100644 --- a/modules/pmi/pyext/src/topology/__init__.py +++ b/modules/pmi/pyext/src/topology/__init__.py @@ -267,13 +267,15 @@ def get_molecule(self, name, copy_num=0): return self.molecules[name][copy_num] def create_molecule(self, name, sequence='', chain_id='', - alphabet=IMP.pmi.alphabets.amino_acid): + alphabet=IMP.pmi.alphabets.amino_acid, + uniprot=None): """Create a new Molecule within this State @param name the name of the molecule (string); it must not be already used @param sequence sequence (string) @param chain_id Chain ID to assign to this molecule @param alphabet Mapping from FASTA codes to residue types + @param uniprot UniProt accession, if available """ # check whether the molecule name is already assigned if name in self.molecules: @@ -290,7 +292,7 @@ def create_molecule(self, name, sequence='', chain_id='', "is desired." % name, IMP.pmi.ParameterWarning) mol = Molecule(self, name, sequence, chain_id, copy_num=0, - alphabet=alphabet) + alphabet=alphabet, uniprot=uniprot) self.molecules[name] = [mol] return mol @@ -401,7 +403,8 @@ class Molecule(_SystemBase): """ def __init__(self, state, name, sequence, chain_id, copy_num, - mol_to_clone=None, alphabet=IMP.pmi.alphabets.amino_acid): + mol_to_clone=None, alphabet=IMP.pmi.alphabets.amino_acid, + uniprot=None): """The user should not call this directly; instead call State.create_molecule() @@ -423,6 +426,7 @@ def __init__(self, state, name, sequence, chain_id, copy_num, self.alphabet = alphabet self.representations = [] # list of stuff to build self._pdb_elements = [] + self.uniprot = uniprot # residues with representation self._represented = IMP.pmi.tools.OrderedSet() # helps you place beads by storing structure @@ -439,6 +443,8 @@ def __init__(self, state, name, sequence, chain_id, copy_num, self.chain = IMP.atom.Chain.setup_particle(self.hier, chain_id) self.chain.set_sequence(self.sequence) self.chain.set_chain_type(alphabet.get_chain_type()) + if self.uniprot: + self.chain.set_uniprot_accession(self.uniprot) # create TempResidues from the sequence (if passed) self.residues = [] for ns, s in enumerate(sequence): @@ -997,13 +1003,23 @@ def find_nearest_coord(self, query): class Sequences(object): """A dictionary-like wrapper for reading and storing sequence data. Keys are FASTA sequence names, and each value a string of one-letter - codes.""" + codes. + + The FASTA header may contain multiple fields split by pipe (|) + characters. If so, the FASTA sequence name is the first field and + the second field (if present) is the UniProt accession. + For example, ">cop9|Q13098" yields a FASTA sequence name of "cop9" + and UniProt accession of "Q13098". + """ def __init__(self, fasta_fn, name_map=None): """Read a FASTA file and extract all the requested sequences @param fasta_fn sequence file @param name_map dictionary mapping the FASTA name to final stored name """ + # Mapping from sequence name to primary sequence self.sequences = IMP.pmi.tools.OrderedDict() + # Mapping from sequence name to UniProt accession, if available + self.uniprot = {} self.read_sequences(fasta_fn, name_map) def __len__(self): @@ -1040,13 +1056,17 @@ def read_sequences(self, fasta_fn, name_map=None): if line.startswith('>'): if seq is not None: self.sequences[code] = seq.strip('*') - code = line.rstrip()[1:] + spl = line[1:].split('|') + code = spl[0].strip() if name_map is not None: try: code = name_map[code] except KeyError: pass seq = '' + if len(spl) >= 2: + up_accession = spl[1].strip() + self.uniprot[code] = up_accession else: line = line.rstrip() if line: # Skip blank lines diff --git a/modules/pmi/test/input/uniprot.fasta b/modules/pmi/test/input/uniprot.fasta new file mode 100644 index 0000000000..a28a65a2af --- /dev/null +++ b/modules/pmi/test/input/uniprot.fasta @@ -0,0 +1,8 @@ +>1WCM:A +MVGQQYSS +>1WCM:B +AADESAPIT +>test1|acc1 +AADESAPIT +>test2|acc2 +AADESAPIT diff --git a/modules/pmi/test/medium_test_topology.py b/modules/pmi/test/medium_test_topology.py index 8f0a5f07b0..0d6ebce638 100644 --- a/modules/pmi/test/medium_test_topology.py +++ b/modules/pmi/test/medium_test_topology.py @@ -421,7 +421,8 @@ def test_create_molecules(self): m1 = st.create_molecule("Prot1", sequence=seqs["Prot1"]) self.assertRaises(ValueError, st.create_molecule, "Prot1", sequence=seqs["Prot1"]) - m2 = st.create_molecule("Prot2", sequence=seqs["Prot2"]) + m2 = st.create_molecule("Prot2", sequence=seqs["Prot2"], + uniprot="testup") self.assertEqual(m1.get_hierarchy().get_parent(), st.get_hierarchy()) self.assertEqual(m2.get_hierarchy().get_parent(), st.get_hierarchy()) self.assertEqual(m1.model, st.model) @@ -429,6 +430,8 @@ def test_create_molecules(self): self.assertEqual(m1.get_name(), "Prot1") self.assertEqual(m2.get_name(), "Prot2") self.assertEqual(len(st.get_hierarchy().get_children()), 2) + self.assertIsNone(m1.uniprot) + self.assertEqual(m2.uniprot, 'testup') # create state 2 with one molecule st2 = s.create_state() diff --git a/modules/pmi/test/test_sequences.py b/modules/pmi/test/test_sequences.py index 51f12128d3..4cb1f2d090 100644 --- a/modules/pmi/test/test_sequences.py +++ b/modules/pmi/test/test_sequences.py @@ -27,6 +27,15 @@ def test_sequences_name_map(self): self.assertNotIn('1WCM:A', s) self.assertEqual(s['foo'], 'MVGQQYSS') + def test_sequences_uniprot(self): + """Test reading sequences with UniProt info from a FASTA file""" + s = IMP.pmi.topology.Sequences( + self.get_input_file_name('uniprot.fasta')) + self.assertIn('test1', s) + self.assertIn('test2', s) + self.assertEqual(s.uniprot['test1'], 'acc1') + self.assertEqual(s.uniprot['test2'], 'acc2') + def test_sequences_bad(self): """Test reading sequences from an invalid FASTA file""" fname = 'test_sequences_bad.fasta'