improving gemmi-convert --sifts-num (work in progress)

project-gemmi · May 11, 2024 · 148f37b · 148f37b
1 parent 14a92d7
commit 148f37b
Show file tree

Hide file tree

Showing 3 changed files with 96 additions and 37 deletions.
diff --git a/docs/convert-help.txt b/docs/convert-help.txt
@@ -41,8 +41,8 @@ Any output options:
   --monomer=OLD:NEW       Change monomer name (CCD code) OLD to NEW.
   -s FILE                 Use sequence(s) from FILE in PIR or FASTA format. Each
                           chain is assigned the best matching sequence, if any.
-  --sifts-num             Use SIFTS-mapped position in UniProt sequence as
-                          sequence ID.
+  --sifts-num[=AC]        Set sequence ID to SIFTS-mapped UniProt positions. In
+                          chimeric chains use AC. Adds 5000 to other seqnums.
   -B MIN[:MAX]            Set isotropic B-factors to a single value or change
                           values out of given range to MIN/MAX.
   --anisou=yes|no|heavy   Add or remove ANISOU records.

diff --git a/docs/utils.rst b/docs/utils.rst
@@ -429,6 +429,22 @@ It is not obvious how to name the new chains that are added.
 We have two options: either new names are generated (`=new`) or
 the chain names are not changed but distinct segment IDs are added (`=dup`).
 
+The `--sifts-num` option changes sequence IDs to the corresponding sequence
+positions from UniProt. The mapping between PDB and UniProt is based on
+SIFTS (not DBREF) :ref:`annotations in mmCIF files <dbref>`.
+Currently, these annotations are present only in the PDB NextGen Archive
+and in PDBe "updated" files.
+Most PDB chains are mapped to a single UniProt entry.
+For chimeric chains that correspond to 2+ UniProt sequences,
+we default to selecting the sequence with more corresponding residues.
+This default can be overridden by explicitly specifying the preferred
+UniProtKB identifiers (usually just one AC, but it's possible to specify
+multiple comma-separated ACs, in the order of preference).
+To avoid clashes in sequence IDs, and also to indicate which IDs are based
+on UniProt mapping, the non-mapped sequence numbers are increased by
+an offset of 5000 (this default value is inspired by PDBrenum)
+or bigger if necessary (when UniProt positions are that large).
+
 tags
 ====
 

diff --git a/prog/convert.cpp b/prog/convert.cpp
@@ -121,8 +121,9 @@ const option::Descriptor Usage[] = {
   { SetSeq, 0, "s", "", Arg::Required,
     "  -s FILE  \tUse sequence(s) from FILE in PIR or FASTA format. Each chain"
     " is assigned the best matching sequence, if any." },
-  { SiftsNum, 0, "", "sifts-num", Arg::None,
-    "  --sifts-num  \tUse SIFTS-mapped position in UniProt sequence as sequence ID." },
+  { SiftsNum, 0, "", "sifts-num", Arg::Optional,
+    "  --sifts-num[=AC]  \tSet sequence ID to SIFTS-mapped UniProt positions."
+    " In chimeric chains use AC. Adds 5000 to other seqnums." },
   { Biso, 0, "B", "", Arg::Required,
     "  -B MIN[:MAX]  \tSet isotropic B-factors to a single value or change values "
       "out of given range to MIN/MAX." },
@@ -187,6 +188,68 @@ std::string read_whole_file(std::istream& stream) {
   return out;
 }
 
+std::uint8_t select_acc_index(const std::vector<std::string>& acc, // Entity::sifts_unp_acc
+                              const gemmi::ConstResidueSpan& polymer,
+                              const std::vector<std::string>& preferred_acs) {
+  if (acc.size() < 2)
+    return 0;
+  for (const std::string& pa : preferred_acs) {
+    auto it = std::find(acc.begin(), acc.end(), pa);
+    if (it != acc.end())
+      return std::uint8_t(it - acc.begin());
+  }
+  // return SIFTS mapping with most residues
+  std::vector<int> counts(acc.size(), 0);
+  for (const gemmi::Residue& res : polymer)
+    if (res.sifts_unp.res && res.sifts_unp.acc_index < acc.size())
+      ++counts[res.sifts_unp.acc_index];
+  auto max_el = std::max_element(counts.begin(), counts.end());
+  return std::uint8_t(max_el - counts.begin());
+}
+
+// Set seqid corresponding to one UniProt sequence.
+// Other residues have seqid changed to avoid conflicts.
+void to_sifts_num(gemmi::Structure& st, const std::vector<std::string>& preferred_acs) {
+  bool first_model = true;
+  std::map<gemmi::SeqId, gemmi::SeqId> seqid_map;
+  for (gemmi::Model& model: st.models)
+    for (gemmi::Chain& chain : model.chains) {
+      seqid_map.clear();
+      auto polymer = chain.get_polymer();
+      gemmi::Entity* ent = st.get_entity_of(polymer);
+      std::uint8_t acc_index = 0;
+      if (ent)
+        acc_index = select_acc_index(ent->sifts_unp_acc, polymer, preferred_acs);
+      std::uint16_t offset = 4950;
+      for (gemmi::Residue& res : chain.residues) {
+        if (res.sifts_unp.res && res.sifts_unp.acc_index == acc_index) {
+          if (!ent || res.entity_id != ent->name)
+            gemmi::fail("SIFTS ooops"); // XXX remove it after testing
+          gemmi::SeqId new_seqid(res.sifts_unp.num, ' ');
+          seqid_map.emplace(res.seqid, new_seqid);
+          res.seqid = new_seqid;
+          offset = std::max(offset, res.sifts_unp.num);
+        }
+      }
+      offset = (offset + 50) / 1000 * 1000;  // always >= 5000
+      for (gemmi::Residue& res : chain.residues)
+        if (!res.sifts_unp.res || res.sifts_unp.acc_index != acc_index)
+          res.seqid.num += offset;
+      if (first_model) {
+        gemmi::process_addresses(st, [&](gemmi::AtomAddress& aa) {
+            if (aa.chain_name == chain.name) {
+              auto it = seqid_map.find(aa.res_id.seqid);
+              if (it != seqid_map.end())
+                aa.res_id.seqid = it->second;
+              else
+                aa.res_id.seqid.num += offset;
+            }
+        });
+        first_model = false;
+      }
+    }
+}
+
 void convert(gemmi::Structure& st,
              const std::string& output, CoorFormat output_type,
              const std::vector<option::Option>& options) {
@@ -304,39 +367,19 @@ void convert(gemmi::Structure& st,
     }
   }
 
-  if (options[SiftsNum]) {
-    // Currently, we change seqid only for residues _pdbx_sifts_xref_db.unp_num
-    // set, and leave seqid for other residues.
-    // A more robust method would be to re-assign the whole polymer always when
-    // nup_num is set for any residue. This would mean:
-    // using insertion codes for insertions (up to 26 residues),
-    // renumbering ligands and waters if needed, and
-    // failing if we get duplicated seqid (multiple mappings for one chain).
-    bool changed = false;
-    for (gemmi::Model& model: st.models)
-      for (gemmi::Chain& chain : model.chains) {
-        const gemmi::Residue* prev_res = nullptr;
-        bool wrong_order = false;
-        for (gemmi::Residue& res : chain.residues) {
-          if (res.sifts_unp.res) {
-            res.seqid = gemmi::SeqId(res.sifts_unp.num, ' ');
-            if (prev_res && !(prev_res->seqid < res.seqid))
-              wrong_order = true;
-            changed = true;
-          }
-          prev_res = &res;
-        }
-        if (wrong_order) {
-          std::cerr << "WARNING: new sequence IDs are in wrong order in chain "
-                    << chain.name;
-          if (st.models.size() > 1)
-            std::cerr << " (model " << model.name << ')';
-          std::cerr << std::endl;
-        }
-      }
-    if (options[Verbose] && !changed)
-      std::cerr << "Option --sifts-num had no effect, "
-                   "it works only with _pdbx_sifts_xref_db.unp_num." << std::endl;
+  if (const option::Option* opt = options[SiftsNum]) {
+    if (std::all_of(st.entities.begin(), st.entities.end(),
+                    [](const gemmi::Entity& ent) { return ent.sifts_unp_acc.empty(); }))
+      gemmi::fail("--sifts-num failed: SIFTS annotation not found in the file.\n"
+                  "Check if tags such as _pdbx_sifts_xref_db.unp_num are present.");
+    std::vector<std::string> preferred_acs;
+    if (opt->arg) {
+      gemmi::split_str_into(opt->arg, ',', preferred_acs);
+      for (const std::string& ac : preferred_acs)
+        if (ac.size() < 6 || ac[0] < 'A' || ac[0] > 'Z' || ac[1] < '0' || ac[1] > '9')
+          gemmi::fail(ac + " is not in UniProtKB AC format, from: " + opt->name);
+    }
+    to_sifts_num(st, preferred_acs);
   }
 
   HowToNameCopiedChain how = HowToNameCopiedChain::AddNumber;