From 6c5868c9af3e38245b524f8362e08c336901f360 Mon Sep 17 00:00:00 2001
From: Ben Webb <ben@salilab.org>
Date: Tue, 24 Oct 2023 12:21:29 -0700
Subject: [PATCH] Add support for mmCIF starting models

If an mmCIF file (rather than a PDB file) was used to provide
the structure for part of the system, create a corresponding
IHM starting model and extract template information from the
file, if it is provided.
---
 modules/mmcif/pyext/src/data.py         | 14 +++++----
 modules/mmcif/test/input/test.nup84.cif | 39 +++++++++++++++++++++++++
 modules/mmcif/test/test_dumper.py       | 24 ++++++++++-----
 3 files changed, 65 insertions(+), 12 deletions(-)
 create mode 100644 modules/mmcif/test/input/test.nup84.cif

diff --git a/modules/mmcif/pyext/src/data.py b/modules/mmcif/pyext/src/data.py
index 2b04d8c960..63ed2a98f4 100644
--- a/modules/mmcif/pyext/src/data.py
+++ b/modules/mmcif/pyext/src/data.py
@@ -417,12 +417,16 @@ def __hash__(self):
 
     def _set_sources_datasets(self, system, datasets):
         # Attempt to identify PDB file vs. comparative model
-        p = ihm.metadata.PDBParser()
+        if (hasattr(ihm.metadata, 'CIFParser')
+                and self.filename.endswith('.cif')):
+            p = ihm.metadata.CIFParser()
+        else:
+            p = ihm.metadata.PDBParser()
         r = p.parse_file(self.filename)
-        system.software.extend(r['software'])
+        system.software.extend(r.get('software', []))
         dataset = datasets.add(r['dataset'])
         # We only want the templates that model the starting model chain
-        templates = r['templates'].get(self.asym_id, [])
+        templates = r.get('templates', {}).get(self.asym_id, [])
         for t in templates:
             if t.alignment_file:
                 system.locations.append(t.alignment_file)
@@ -430,7 +434,7 @@ def _set_sources_datasets(self, system, datasets):
                 datasets.add(t.dataset)
         self.dataset = dataset
         self.templates = templates
-        self.metadata = r['metadata']
+        self.metadata = r.get('metadata', [])
 
     def _read_coords(self):
         """Read the coordinates for this starting model"""
@@ -438,7 +442,7 @@ def _read_coords(self):
         # todo: support reading other subsets of the atoms (e.g. CA/CB)
         slt = IMP.atom.ChainPDBSelector([self.asym_id]) \
             & IMP.atom.NonWaterNonHydrogenPDBSelector()
-        hier = IMP.atom.read_pdb(self.filename, m, slt)
+        hier = IMP.atom.read_pdb_or_mmcif(self.filename, m, slt)
         rng = self.asym_unit.seq_id_range
         sel = IMP.atom.Selection(
             hier, residue_indexes=list(range(rng[0] - self.offset,
diff --git a/modules/mmcif/test/input/test.nup84.cif b/modules/mmcif/test/input/test.nup84.cif
new file mode 100644
index 0000000000..6b09cb839d
--- /dev/null
+++ b/modules/mmcif/test/input/test.nup84.cif
@@ -0,0 +1,39 @@
+data_model
+#
+_exptl.method 'model, MODELLER Version 9.18 2017/02/10 22:21:34'
+#
+_modeller.version 9.18
+_modeller.alignment modeller_model.ali
+#
+loop_
+_modeller_template.id
+_modeller_template.name
+_modeller_template.template_begin
+_modeller_template.template_end
+_modeller_template.target_begin
+_modeller_template.target_end
+_modeller_template.pct_seq_id
+1 3jroC 33:C 424:C 33:A 424:A 100.0
+2 3f3fG 482:G 551:G 429:A 488:A 10.0
+#
+loop_
+_atom_site.group_PDB
+_atom_site.type_symbol
+_atom_site.label_atom_id
+_atom_site.label_alt_id
+_atom_site.label_comp_id
+_atom_site.label_asym_id
+_atom_site.auth_asym_id
+_atom_site.label_seq_id
+_atom_site.auth_seq_id
+_atom_site.pdbx_PDB_ins_code
+_atom_site.Cartn_x
+_atom_site.Cartn_y
+_atom_site.Cartn_z
+_atom_site.occupancy
+_atom_site.B_iso_or_equiv
+_atom_site.label_entity_id
+_atom_site.id
+_atom_site.pdbx_PDB_model_num
+ATOM C CA . MET A A 1 1 ? -8.986 11.688 -5.817 1.000 91.82 1 1 1
+ATOM C CA . GLU A A 2 2 ? -8.986 11.688 -5.817 1.000 91.82 1 2 1
diff --git a/modules/mmcif/test/test_dumper.py b/modules/mmcif/test/test_dumper.py
index c2a3ba9a35..f4f2989c31 100644
--- a/modules/mmcif/test/test_dumper.py
+++ b/modules/mmcif/test/test_dumper.py
@@ -282,14 +282,17 @@ def add_state(self, m, top, state_index, name):
         top.add_child(state)
         return state
 
-    def _make_residue_chain(self, name, chain_id, model):
+    def _make_residue_chain(self, name, chain_id, model, cif=False):
         if name == 'Nup84':
-            fname = 'test.nup84.pdb'
+            if cif:
+                fname = 'test.nup84.cif'
+            else:
+                fname = 'test.nup84.pdb'
             seq = 'ME'
         else:
             fname = 'test.nup85.pdb'
             seq = 'GE'
-        h = IMP.atom.read_pdb(self.get_input_file_name(fname), model)
+        h = IMP.atom.read_pdb_or_mmcif(self.get_input_file_name(fname), model)
         for hchain in IMP.atom.get_by_type(h, IMP.atom.CHAIN_TYPE):
             chain = IMP.atom.Chain(hchain)
             chain.set_sequence(seq)
@@ -308,20 +311,27 @@ def add_structure(self, p):
             p, IMP.algebra.Sphere3D(IMP.algebra.Vector3D(1, 2, 3), 4))
         IMP.atom.Mass.setup_particle(p, 1.0)
 
-    def test_starting_model_dumper(self):
-        """Test StartingModelDumper"""
+    def test_starting_model_dumper_pdb(self):
+        """Test StartingModelDumper with PDB starting models"""
+        self._internal_test_starting_model_dumper(cif=False)
+
+    def test_starting_model_dumper_cif(self):
+        """Test StartingModelDumper with mmCIF starting models"""
+        self._internal_test_starting_model_dumper(cif=True)
+
+    def _internal_test_starting_model_dumper(self, cif):
         m = IMP.Model()
 
         top = IMP.atom.Hierarchy.setup_particle(IMP.Particle(m))
         state1h = self.add_state(m, top, 0, "State1")
 
-        h1 = self._make_residue_chain('Nup84', 'A', m)
+        h1 = self._make_residue_chain('Nup84', 'A', m, cif=cif)
         state1h.add_child(h1)
 
         # Test multiple states: components that are the same in both states
         # (Nup84) should not be duplicated in the mmCIF output
         state2h = self.add_state(m, top, 0, "State2")
-        h1 = self._make_residue_chain('Nup84', 'A', m)
+        h1 = self._make_residue_chain('Nup84', 'A', m, cif=cif)
         state2h.add_child(h1)
         h2 = self._make_residue_chain('Nup85', 'B', m)
         state2h.add_child(h2)