From 86a8bbbb6b4502679d0d869952569245d0c5e1a3 Mon Sep 17 00:00:00 2001
From: Ben Webb <ben@salilab.org>
Date: Thu, 26 Sep 2024 17:23:18 -0700
Subject: [PATCH] Trim non-represented residues from IHM sequence

PDB policy is for amino acid Entity sequences to be polymers (so
they should include any loops or other gaps in the midst of the
sequence) but for the termini to be trimmed of any not-modeled
residues. Add a function to, after the PMI system is built,
modify the Entity sequence to remove any parts that are not
included in any representation. This will change the seq_id numbering
if any N-terminal residues are removed. If this happens, set the
existing PMI-to-IHM offset accordingly, and renumber any
existing objects.
---
 pyext/src/mmcif.py             | 88 ++++++++++++++++++++++++++++------
 pyext/src/topology/__init__.py |  2 +
 test/test_mmcif.py             | 39 +++++++++------
 test/test_mmcif_pmi2.py        | 76 +++++++++++++++++++++++++++--
 4 files changed, 172 insertions(+), 33 deletions(-)

diff --git a/pyext/src/mmcif.py b/pyext/src/mmcif.py
index c55f9bf9..8916af22 100644
--- a/pyext/src/mmcif.py
+++ b/pyext/src/mmcif.py
@@ -1186,10 +1186,12 @@ def __set_name(self, val):
 class Entity(ihm.Entity):
     """A single entity in the system.
        This functions identically to the base ihm.Entity class, but it
-       allows identifying residues by either the IHM numbering scheme
-       (seq_id, which is always contiguous starting at 1) or by the PMI
-       scheme (which is similar, but may not start at 1 if the sequence in
-       the FASTA file has one or more N-terminal gaps"""
+       allows identifying residues by either the PMI numbering scheme
+       (which is always contiguous starting at 1, covering the entire
+       sequence in the FASTA files, or the IHM scheme (seq_id, which also
+       starts at 1, but which only covers the modeled subset of the full
+       sequence, with non-modeled N-terminal or C-terminal residues
+       removed)."""
     def __init__(self, sequence, pmi_offset, *args, **keys):
         # Offset between PMI numbering and IHM; <pmi_#> = <ihm_#> + pmi_offset
         # (pmi_offset is also the number of N-terminal gaps in the FASTA file)
@@ -1209,10 +1211,12 @@ def pmi_range(self, res_id_begin, res_id_end):
 class AsymUnit(ihm.AsymUnit):
     """A single asymmetric unit in the system.
        This functions identically to the base ihm.AsymUnit class, but it
-       allows identifying residues by either the IHM numbering scheme
-       (seq_id, which is always contiguous starting at 1) or by the PMI
-       scheme (which is similar, but may not start at 1 if the sequence in
-       the FASTA file has one or more N-terminal gaps"""
+       allows identifying residues by either the PMI numbering scheme
+       (which is always contiguous starting at 1, covering the entire
+       sequence in the FASTA files, or the IHM scheme (seq_id, which also
+       starts at 1, but which only covers the modeled subset of the full
+       sequence, with non-modeled N-terminal or C-terminal residues
+       removed)."""
 
     def __init__(self, entity, *args, **keys):
         super().__init__(
@@ -1375,18 +1379,15 @@ def add_component_sequence(self, state, name, seq, asym_name=None,
         if asym_name is None:
             asym_name = name
 
-        def get_offset(seq):
-            # Count length of gaps at start of sequence
-            for i in range(len(seq)):
-                if seq[i] != '-':
-                    return seq[i:], i
-        seq, offset = get_offset(seq)
         if name in self.sequence_dict:
             if self.sequence_dict[name] != seq:
                 raise ValueError("Sequence mismatch for component %s" % name)
         else:
             self.sequence_dict[name] = seq
-            self.entities.add(name, seq, offset, alphabet)
+            # Offset is always zero to start with; this may be modified
+            # later in finalize_build() if any non-modeled N-terminal
+            # residues are removed
+            self.entities.add(name, seq, 0, alphabet)
         if asym_name in self.asym_units:
             if self.asym_units[asym_name] is None:
                 # Set up a new asymmetric unit for this component
@@ -1396,6 +1397,12 @@ def get_offset(seq):
                 self.asym_units[asym_name] = asym
             state.modeled_assembly.append(self.asym_units[asym_name])
 
+    def finalize_build(self):
+        """Called immediately after the PMI system is built"""
+        for entity in self.system.entities:
+            _trim_unrep_termini(entity, self.system.asym_units,
+                                self.system.orphan_representations)
+
     def _add_restraint_model_fits(self):
         """Add fits to restraints for all known models"""
         for group, m in self.system._all_models():
@@ -1740,3 +1747,54 @@ def parse_file(self, filename):
                 elif line.startswith('# ncenters: '):
                     ret['number_of_gaussians'] = int(line[12:])
         return ret
+
+
+def _trim_unrep_termini(entity, asyms, representations):
+    """Trim Entity sequence to only cover represented residues.
+
+       PDB policy is for amino acid Entity sequences to be polymers (so
+       they should include any loops or other gaps in the midst of the
+       sequence) but for the termini to be trimmed of any not-modeled
+       residues. Here, we modify the Entity sequence to remove any parts
+       that are not included in any representation. This may change the
+       numbering if any N-terminal residues are removed, and thus the offset
+       between PMI and IHM numbering, as both count from 1."""
+    rep_range = None
+    for rep in representations:
+        for seg in rep:
+            if seg.asym_unit.entity is entity:
+                seg_range = seg.asym_unit.seq_id_range
+                if rep_range is None:
+                    rep_range = list(seg_range)
+                else:
+                    rep_range[0] = min(rep_range[0], seg_range[0])
+                    rep_range[1] = max(rep_range[1], seg_range[1])
+    # Do nothing if no representations or if everything is represented
+    if rep_range is None or rep_range == [1, len(entity.sequence)]:
+        return
+    # Update offset (= number of unrepresented N terminal entity residues)
+    # in entity and all asyms
+    pmi_offset = rep_range[0] - 1
+    entity.pmi_offset = pmi_offset
+    for asym in asyms:
+        if asym.entity is entity:
+            asym.auth_seq_id_map = entity.pmi_offset
+    # Modify entity sequence to only include represented residues
+    entity.sequence = entity.sequence[rep_range[0] - 1:rep_range[1]]
+
+    # If N terminal deletions, we must fix the numbering of all segments
+    # and starting models
+    if pmi_offset != 0:
+        for rep in representations:
+            for seg in rep:
+                if seg.asym_unit.entity is entity:
+                    seg_range = seg.asym_unit.seq_id_range
+                    seg.asym_unit.seq_id_range = (seg_range[0] - pmi_offset,
+                                                  seg_range[1] - pmi_offset)
+                    if seg.starting_model:
+                        model = seg.starting_model
+                        seg_range = model.asym_unit.seq_id_range
+                        model.asym_unit.seq_id_range = \
+                                (seg_range[0] - pmi_offset,
+                                 seg_range[1] - pmi_offset)
+                        model.offset = model.offset - pmi_offset
diff --git a/pyext/src/topology/__init__.py b/pyext/src/topology/__init__.py
index 9b591bd1..80c24b2f 100644
--- a/pyext/src/topology/__init__.py
+++ b/pyext/src/topology/__init__.py
@@ -201,6 +201,8 @@ def build(self, **kwargs):
             for state in self.states:
                 state.build(**kwargs)
             self.built = True
+            for po in self._protocol_output:
+                po.finalize_build()
         return self.hier
 
     def add_protocol_output(self, p):
diff --git a/test/test_mmcif.py b/test/test_mmcif.py
index 522947f9..46e88b46 100644
--- a/test/test_mmcif.py
+++ b/test/test_mmcif.py
@@ -661,11 +661,19 @@ def test_starting_model_dumper(self):
         nup84_2.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A')
         nup84_2.add_representation(resolutions=[1])
 
-        nup85 = st2.create_molecule("Nup85", "SELM", "B")
+        nup85 = st2.create_molecule("Nup85", "GGGGSELMGG", "B")
+        # Residues S, E should be residues 8, 9 in test.nup85.pdb; map them
+        # to FASTA sequence
         nup85.add_structure(self.get_input_file_name('test.nup85.pdb'), 'A',
-                            res_range=(8, 9), offset=-7)
-        nup85.add_representation(resolutions=[1])
+                            res_range=(8, 9), offset=-3)
+        nup85.add_representation(residues=nup85[4:8], resolutions=[1])
         _ = s.build()
+        # Since we didn't represent the first 4 or last 2 residues in nup85,
+        # the IHM sequence should just be "SELM", and the PDB offset should be
+        # modified to match (from -3 to -7)
+        self.assertEqual(''.join(x.code
+                                 for x in po.system.entities[1].sequence),
+                         'SELM')
 
         self.assign_entity_asym_ids(po.system)
 
@@ -1214,26 +1222,27 @@ class DummyRestraint:
         s.add_protocol_output(po)
         state = s.create_state()
         po_state = state._protocol_output[0][1]
-        nup84 = state.create_molecule("Nup84", "MELS", "A")
-        nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A')
-        nup84.add_representation(nup84.get_atomic_residues(), resolutions=[1])
-        nup84.add_representation(nup84.get_non_atomic_residues(),
-                                 resolutions=[10])
+        # First 10 residues are not represented, so PMI residue indexes
+        # range from 11 to 14, but IHM seq_ids from 1 to 4
+        nup84 = state.create_molecule("Nup84", "P" * 10 + "MELS", "A")
+        nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A',
+                            offset=10)
+        nup84.add_representation(nup84[10:], resolutions=[1])
         hier = s.build()
 
         r = DummyRestraint()
         r.dataset = DummyDataset()
         r.dataset._id = 42
         xl_group = po.get_cross_link_group(r)
-        ex_xl = po.add_experimental_cross_link(1, 'Nup84',
-                                               2, 'Nup84', xl_group)
-        _ = po.add_experimental_cross_link(1, 'Nup84',
-                                           3, 'Nup84', xl_group)
+        ex_xl = po.add_experimental_cross_link(11, 'Nup84',
+                                               12, 'Nup84', xl_group)
+        _ = po.add_experimental_cross_link(11, 'Nup84',
+                                           13, 'Nup84', xl_group)
         # Duplicates should be ignored
-        po.add_experimental_cross_link(1, 'Nup84', 3, 'Nup84', xl_group)
+        po.add_experimental_cross_link(11, 'Nup84', 13, 'Nup84', xl_group)
         # Non-modeled component should be ignored
-        nm_ex_xl = po.add_experimental_cross_link(1, 'Nup85',
-                                                  2, 'Nup84', xl_group)
+        nm_ex_xl = po.add_experimental_cross_link(11, 'Nup85',
+                                                  12, 'Nup84', xl_group)
         self.assertEqual(nm_ex_xl, None)
         sel = IMP.atom.Selection(hier, molecule='Nup84',
                                  resolution=1)
diff --git a/test/test_mmcif_pmi2.py b/test/test_mmcif_pmi2.py
index e6fd573f..42a20b3f 100644
--- a/test/test_mmcif_pmi2.py
+++ b/test/test_mmcif_pmi2.py
@@ -131,6 +131,9 @@ def test_entity(self):
         w = ihm.format.CifWriter(fh)
         d = ihm.dumper._EntityDumper()
         d.finalize(po.system)
+        self.assertEqual(''.join(x.code
+                                 for x in po.system.entities[0].sequence),
+                         'MELS')
         d.dump(po.system, w)
         out = fh.getvalue()
         self.assertEqual(out, """#
@@ -146,6 +149,24 @@ def test_entity(self):
 #
 """)
 
+    def test_entity_trimmed(self):
+        """Test that Entity sequence is trimmed to represented region"""
+        m = IMP.Model()
+        s = IMP.pmi.topology.System(m)
+        po = IMP.pmi.mmcif.ProtocolOutput()
+        s.add_protocol_output(po)
+        state = s.create_state()
+        nup84 = state.create_molecule("Nup84", "GGGMELSGGGG", "A")
+        # Only "MELS" is represented, so the output entity should be of
+        # length 4 and weigh 532, not the full PMI sequence
+        nup84.add_representation(residues=nup84[3:7], resolutions=[1])
+        hier = s.build()
+        self.assertEqual(''.join(x.code
+                                 for x in po.system.entities[0].sequence),
+                         'MELS')
+        self.assertAlmostEqual(po.system.entities[0].formula_weight,
+                               532.606, delta=1e-2)
+
     def test_model_representation(self):
         """Test ModelRepresentationDumper with PMI2-style init"""
         m = IMP.Model()
@@ -153,10 +174,12 @@ def test_model_representation(self):
         po = IMP.pmi.mmcif.ProtocolOutput()
         s.add_protocol_output(po)
         state = s.create_state()
-        nup84 = state.create_molecule("Nup84", "MELS", "A")
-        nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A')
-        nup84.add_representation(resolutions=[1])
+        nup84 = state.create_molecule("Nup84", "GGGGMELSGG", "A")
+        nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A',
+                            offset=4)
+        nup84.add_representation(resolutions=[1], residues=nup84[4:8])
         hier = s.build()
+        print([x.code for x in po.system.entities[0].sequence])
         fh = StringIO()
         w = ihm.format.CifWriter(fh)
         self.assign_entity_asym_ids(po.system)
@@ -196,5 +219,52 @@ def test_model_representation(self):
 #
 """)
 
+    def test_poly_seq_scheme_trimmed(self):
+        """Test that non-modeled termini are trimmed in scheme tables"""
+        m = IMP.Model()
+        s = IMP.pmi.topology.System(m)
+        po = IMP.pmi.mmcif.ProtocolOutput()
+        s.add_protocol_output(po)
+        state = s.create_state()
+        nup84_a = state.create_molecule("Nup84a", "PWYACMELSWAG", "A")
+        nup84_a.add_representation(residues=nup84_a[4:7], resolutions=[1])
+        nup84_b = state.create_molecule("Nup84b", "PWYACMELSWAG", "A")
+        nup84_b.add_representation(residues=nup84_b[2:5], resolutions=[1])
+        hier = s.build()
+        self.assertEqual(len(po.system.entities), 1)
+        self.assertEqual(po.system.entities[0].pmi_offset, 2)
+        fh = StringIO()
+        w = ihm.format.CifWriter(fh)
+        self.assign_entity_asym_ids(po.system)
+        self.assign_range_ids(po.system)
+        d = ihm.dumper._PolySeqSchemeDumper()
+        d.dump(po.system, w)
+        out = fh.getvalue()
+        self.assertEqual(out, """#
+loop_
+_pdbx_poly_seq_scheme.asym_id
+_pdbx_poly_seq_scheme.entity_id
+_pdbx_poly_seq_scheme.seq_id
+_pdbx_poly_seq_scheme.mon_id
+_pdbx_poly_seq_scheme.pdb_seq_num
+_pdbx_poly_seq_scheme.auth_seq_num
+_pdbx_poly_seq_scheme.pdb_mon_id
+_pdbx_poly_seq_scheme.auth_mon_id
+_pdbx_poly_seq_scheme.pdb_strand_id
+_pdbx_poly_seq_scheme.pdb_ins_code
+A 1 1 TYR 3 3 TYR TYR A .
+A 1 2 ALA 4 4 ALA ALA A .
+A 1 3 CYS 5 5 CYS CYS A .
+A 1 4 MET 6 6 MET MET A .
+A 1 5 GLU 7 7 GLU GLU A .
+B 1 1 TYR 3 3 TYR TYR B .
+B 1 2 ALA 4 4 ALA ALA B .
+B 1 3 CYS 5 5 CYS CYS B .
+B 1 4 MET 6 6 MET MET B .
+B 1 5 GLU 7 7 GLU GLU B .
+#
+""")
+
+
 if __name__ == '__main__':
     IMP.test.main()