From 61a29bfa77bce9b86c964ed3401ea09e34ac0a33 Mon Sep 17 00:00:00 2001
From: Ben Webb <ben@salilab.org>
Date: Thu, 21 Sep 2023 10:54:57 -0700
Subject: [PATCH] Squashed 'modules/pmi/' changes from e610c5f057..e469c4f9fe

e469c4f9fe Pass UniProt accession through to Molecule
91646dd985 Extract UniProt accessions from FASTA headers
4ece080c1b Add max version to placate CMake

git-subtree-dir: modules/pmi
git-subtree-split: e469c4f9fe2718fba6497ba53b06e6fa7ff2f341
---
 modules/pmi/CMakeLists.txt                 |  2 +-
 modules/pmi/pyext/src/macros.py            | 10 +++++---
 modules/pmi/pyext/src/topology/__init__.py | 30 ++++++++++++++++++----
 modules/pmi/test/input/uniprot.fasta       |  8 ++++++
 modules/pmi/test/medium_test_topology.py   |  5 +++-
 modules/pmi/test/test_sequences.py         |  9 +++++++
 6 files changed, 53 insertions(+), 11 deletions(-)
 create mode 100644 modules/pmi/test/input/uniprot.fasta

diff --git a/modules/pmi/CMakeLists.txt b/modules/pmi/CMakeLists.txt
index 2060c1ed91..6ed7c36895 100644
--- a/modules/pmi/CMakeLists.txt
+++ b/modules/pmi/CMakeLists.txt
@@ -1,6 +1,6 @@
 # Are we running cmake from this directory (out of tree build) ?
 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
-  cmake_minimum_required(VERSION 2.8.12)
+  cmake_minimum_required(VERSION 2.8.12...3.6.0)
   project(imp_module)
 
   if(POLICY CMP0058)
diff --git a/modules/pmi/pyext/src/macros.py b/modules/pmi/pyext/src/macros.py
index cc106d52f4..85d27a2783 100644
--- a/modules/pmi/pyext/src/macros.py
+++ b/modules/pmi/pyext/src/macros.py
@@ -683,12 +683,14 @@ def add_state(self, reader, keep_chain_id=False, fasta_name_map=None,
                     fasta_flag = copy[0].fasta_flag
                     if fasta_flag in self._alphabets:
                         alphabet = self._alphabets[fasta_flag]
-                    seq = IMP.pmi.topology.Sequences(
-                        copy[0].fasta_file, fasta_name_map)[copy[0].fasta_id]
+                    seqs = IMP.pmi.topology.Sequences(
+                        copy[0].fasta_file, fasta_name_map)
+                    seq = seqs[copy[0].fasta_id]
                     print("BuildSystem.add_state: molecule %s sequence has "
                           "%s residues" % (molname, len(seq)))
-                    orig_mol = state.create_molecule(molname, seq, chain_id,
-                                                     alphabet=alphabet)
+                    orig_mol = state.create_molecule(
+                        molname, seq, chain_id, alphabet=alphabet,
+                        uniprot=seqs.uniprot.get(copy[0].fasta_id))
                     mol = orig_mol
                     numchain += 1
                 else:
diff --git a/modules/pmi/pyext/src/topology/__init__.py b/modules/pmi/pyext/src/topology/__init__.py
index 457dffa489..a67448ca9a 100644
--- a/modules/pmi/pyext/src/topology/__init__.py
+++ b/modules/pmi/pyext/src/topology/__init__.py
@@ -267,13 +267,15 @@ def get_molecule(self, name, copy_num=0):
             return self.molecules[name][copy_num]
 
     def create_molecule(self, name, sequence='', chain_id='',
-                        alphabet=IMP.pmi.alphabets.amino_acid):
+                        alphabet=IMP.pmi.alphabets.amino_acid,
+                        uniprot=None):
         """Create a new Molecule within this State
         @param name                the name of the molecule (string);
                                    it must not be already used
         @param sequence            sequence (string)
         @param chain_id            Chain ID to assign to this molecule
         @param alphabet            Mapping from FASTA codes to residue types
+        @param uniprot             UniProt accession, if available
         """
         # check whether the molecule name is already assigned
         if name in self.molecules:
@@ -290,7 +292,7 @@ def create_molecule(self, name, sequence='', chain_id='',
                 "is desired." % name, IMP.pmi.ParameterWarning)
 
         mol = Molecule(self, name, sequence, chain_id, copy_num=0,
-                       alphabet=alphabet)
+                       alphabet=alphabet, uniprot=uniprot)
         self.molecules[name] = [mol]
         return mol
 
@@ -401,7 +403,8 @@ class Molecule(_SystemBase):
     """
 
     def __init__(self, state, name, sequence, chain_id, copy_num,
-                 mol_to_clone=None, alphabet=IMP.pmi.alphabets.amino_acid):
+                 mol_to_clone=None, alphabet=IMP.pmi.alphabets.amino_acid,
+                 uniprot=None):
         """The user should not call this directly; instead call
            State.create_molecule()
 
@@ -423,6 +426,7 @@ def __init__(self, state, name, sequence, chain_id, copy_num,
         self.alphabet = alphabet
         self.representations = []  # list of stuff to build
         self._pdb_elements = []
+        self.uniprot = uniprot
         # residues with representation
         self._represented = IMP.pmi.tools.OrderedSet()
         # helps you place beads by storing structure
@@ -439,6 +443,8 @@ def __init__(self, state, name, sequence, chain_id, copy_num,
         self.chain = IMP.atom.Chain.setup_particle(self.hier, chain_id)
         self.chain.set_sequence(self.sequence)
         self.chain.set_chain_type(alphabet.get_chain_type())
+        if self.uniprot:
+            self.chain.set_uniprot_accession(self.uniprot)
         # create TempResidues from the sequence (if passed)
         self.residues = []
         for ns, s in enumerate(sequence):
@@ -997,13 +1003,23 @@ def find_nearest_coord(self, query):
 class Sequences(object):
     """A dictionary-like wrapper for reading and storing sequence data.
        Keys are FASTA sequence names, and each value a string of one-letter
-       codes."""
+       codes.
+
+       The FASTA header may contain multiple fields split by pipe (|)
+       characters. If so, the FASTA sequence name is the first field and
+       the second field (if present) is the UniProt accession.
+       For example, ">cop9|Q13098" yields a FASTA sequence name of "cop9"
+       and UniProt accession of "Q13098".
+    """
     def __init__(self, fasta_fn, name_map=None):
         """Read a FASTA file and extract all the requested sequences
         @param fasta_fn sequence file
         @param name_map dictionary mapping the FASTA name to final stored name
         """
+        # Mapping from sequence name to primary sequence
         self.sequences = IMP.pmi.tools.OrderedDict()
+        # Mapping from sequence name to UniProt accession, if available
+        self.uniprot = {}
         self.read_sequences(fasta_fn, name_map)
 
     def __len__(self):
@@ -1040,13 +1056,17 @@ def read_sequences(self, fasta_fn, name_map=None):
                 if line.startswith('>'):
                     if seq is not None:
                         self.sequences[code] = seq.strip('*')
-                    code = line.rstrip()[1:]
+                    spl = line[1:].split('|')
+                    code = spl[0].strip()
                     if name_map is not None:
                         try:
                             code = name_map[code]
                         except KeyError:
                             pass
                     seq = ''
+                    if len(spl) >= 2:
+                        up_accession = spl[1].strip()
+                        self.uniprot[code] = up_accession
                 else:
                     line = line.rstrip()
                     if line:  # Skip blank lines
diff --git a/modules/pmi/test/input/uniprot.fasta b/modules/pmi/test/input/uniprot.fasta
new file mode 100644
index 0000000000..a28a65a2af
--- /dev/null
+++ b/modules/pmi/test/input/uniprot.fasta
@@ -0,0 +1,8 @@
+>1WCM:A
+MVGQQYSS
+>1WCM:B
+AADESAPIT
+>test1|acc1
+AADESAPIT
+>test2|acc2
+AADESAPIT
diff --git a/modules/pmi/test/medium_test_topology.py b/modules/pmi/test/medium_test_topology.py
index 8f0a5f07b0..0d6ebce638 100644
--- a/modules/pmi/test/medium_test_topology.py
+++ b/modules/pmi/test/medium_test_topology.py
@@ -421,7 +421,8 @@ def test_create_molecules(self):
         m1 = st.create_molecule("Prot1", sequence=seqs["Prot1"])
         self.assertRaises(ValueError, st.create_molecule, "Prot1",
                           sequence=seqs["Prot1"])
-        m2 = st.create_molecule("Prot2", sequence=seqs["Prot2"])
+        m2 = st.create_molecule("Prot2", sequence=seqs["Prot2"],
+                                uniprot="testup")
         self.assertEqual(m1.get_hierarchy().get_parent(), st.get_hierarchy())
         self.assertEqual(m2.get_hierarchy().get_parent(), st.get_hierarchy())
         self.assertEqual(m1.model, st.model)
@@ -429,6 +430,8 @@ def test_create_molecules(self):
         self.assertEqual(m1.get_name(), "Prot1")
         self.assertEqual(m2.get_name(), "Prot2")
         self.assertEqual(len(st.get_hierarchy().get_children()), 2)
+        self.assertIsNone(m1.uniprot)
+        self.assertEqual(m2.uniprot, 'testup')
 
         # create state 2 with one molecule
         st2 = s.create_state()
diff --git a/modules/pmi/test/test_sequences.py b/modules/pmi/test/test_sequences.py
index 51f12128d3..4cb1f2d090 100644
--- a/modules/pmi/test/test_sequences.py
+++ b/modules/pmi/test/test_sequences.py
@@ -27,6 +27,15 @@ def test_sequences_name_map(self):
         self.assertNotIn('1WCM:A', s)
         self.assertEqual(s['foo'], 'MVGQQYSS')
 
+    def test_sequences_uniprot(self):
+        """Test reading sequences with UniProt info from a FASTA file"""
+        s = IMP.pmi.topology.Sequences(
+            self.get_input_file_name('uniprot.fasta'))
+        self.assertIn('test1', s)
+        self.assertIn('test2', s)
+        self.assertEqual(s.uniprot['test1'], 'acc1')
+        self.assertEqual(s.uniprot['test2'], 'acc2')
+
     def test_sequences_bad(self):
         """Test reading sequences from an invalid FASTA file"""
         fname = 'test_sequences_bad.fasta'