From 584c0287663e9e2954d5fe92fac2e4df8a1c1afb Mon Sep 17 00:00:00 2001 From: Ben Webb Date: Thu, 28 Sep 2023 12:07:37 -0700 Subject: [PATCH] PDBSelector now works with PDBRecord, not raw line Rather than passing a raw line of text from the PDB file to PDBSelector, pass a PDBRecord type. This wraps either the PDB text or the equivalent parsed data from the atom_site table in an mmCIF file, and provides methods to get atom/residue name, etc. This ensures that when working with mmCIF we are not restricted to PDB format limitations, such as single-character chain IDs or insertion codes. Closes #1083. --- modules/atom/include/internal/pdb.h | 39 +++++- modules/atom/include/pdb.h | 186 ++++++++++++++++++---------- modules/atom/pyext/swig.i-in | 1 + modules/atom/src/mmcif.cpp | 89 ++----------- modules/atom/src/pdb.cpp | 135 ++++++++++++++++++-- modules/atom/test/test_pdb.py | 4 +- 6 files changed, 301 insertions(+), 153 deletions(-) diff --git a/modules/atom/include/internal/pdb.h b/modules/atom/include/internal/pdb.h index e63e48bce5..78758f97bb 100644 --- a/modules/atom/include/internal/pdb.h +++ b/modules/atom/include/internal/pdb.h @@ -2,7 +2,7 @@ * \file internal/pdb.h * \brief A class with static functions for parsing PDB files * - * Copyright 2007-2022 IMP Inventors. All rights reserved. + * Copyright 2007-2023 IMP Inventors. All rights reserved. * */ @@ -14,6 +14,7 @@ #include #include +#include "ihm_format.h" #include @@ -111,6 +112,42 @@ IMPATOMEXPORT String atom_element(const String& pdb_line); IMPATOMEXPORT Vector connected_atoms( const String& pdb_line); +//! Handle a keyword in an mmCIF file +class CifKeyword { + struct ihm_keyword *k_; +public: + CifKeyword(struct ihm_category *c, std::string name) + : k_(ihm_keyword_new(c, name.c_str())) {} + + //! Get raw string value of the keyword (may be null) + const char *data() { return k_->data; } + + //! Get value as a string, or the empty string if it is missing + const char *as_str() { + if (k_->omitted || k_->unknown || !k_->in_file) { + return ""; + } else { + return k_->data; + } + } + + float as_float(float default_value=0.) { + if (k_->omitted || k_->unknown || !k_->in_file) { + return default_value; + } else { + return boost::lexical_cast(k_->data); + } + } + + int as_int(int default_value=0) { + if (k_->omitted || k_->unknown || !k_->in_file) { + return default_value; + } else { + return boost::lexical_cast(k_->data); + } + } +}; + //! write particles as ATOMs to PDB (assumes Particles are valid Atoms) IMPATOMEXPORT void write_pdb(const ParticlesTemp& ps, TextOutput out); diff --git a/modules/atom/include/pdb.h b/modules/atom/include/pdb.h index e4fb5bdca5..355a1114a7 100644 --- a/modules/atom/include/pdb.h +++ b/modules/atom/include/pdb.h @@ -2,7 +2,7 @@ * \file IMP/atom/pdb.h * \brief Functions to read PDBs * - * Copyright 2007-2022 IMP Inventors. All rights reserved. + * Copyright 2007-2023 IMP Inventors. All rights reserved. * */ @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,65 @@ IMPATOM_BEGIN_NAMESPACE +//! Represent a single ATOM/HETATM "line" in PDB or mmCIF format +class IMPATOMEXPORT PDBRecord : public Value { + const std::string *line_; + internal::CifKeyword *group_, *element_, *atom_name_, *alt_loc_id_, + *residue_name_, *auth_chain_, *chain_, *auth_seq_id_; + bool use_line_, use_keywords_; +public: + PDBRecord() : use_line_(false), use_keywords_(false) {} + +#ifndef SWIG + //! Point the record to a line of text from a PDB file + /** This uses a pointer to `line` so should not be used after line is freed */ + void set_line(const std::string &line); + + //! Point the record to a single atom_site loop row from an mmCIF file + /** This uses pointers to the CIF keywords, so should not be used after + the keywords are freed. */ + void set_keywords(internal::CifKeyword &group, + internal::CifKeyword &element, + internal::CifKeyword &atom_name, + internal::CifKeyword &alt_loc_id, + internal::CifKeyword &residue_name, + internal::CifKeyword &auth_chain, + internal::CifKeyword &chain, + internal::CifKeyword &auth_seq_id); +#endif + + //! Returns the alternative location indicator + /** This is a single character in PDB, but can be longer in mmCIF. */ + std::string get_alt_loc_indicator() const; + + //! Returns true if the record is an ATOM record + bool get_is_atom() const; + + //! Returns the atom type as a string with no leading or trailing whitespace + std::string get_trimmed_atom_name() const; + + //! Returns the atom type in a PDB-style padded string + /** The atom type is at least a 4 character long field. + The first character is space in many cases, but not always. */ + std::string get_padded_atom_name() const; + + //! Returns the residue type + std::string get_residue_name() const; + + //! Returns the chain ID + /** This is a single character in PDB format, but can be longer in mmCIF. + In mmCIF, the author-provided chain ID is provided if available; + otherwise, the mmCIF asym_id is returned. */ + std::string get_chain_id() const; + + //! Returns the element as a string + std::string get_element() const; + + IMP_SHOWABLE_INLINE(PDBRecord, { out << "PDBRecord"; }); +}; +IMP_VALUES(PDBRecord, PDBRecords); + + //! Select which atoms to read from a PDB file /** Selector is a general purpose class used to select records from a PDB file. Using descendants of this class one may implement arbitrary @@ -44,7 +104,8 @@ class IMPATOMEXPORT PDBSelector : public IMP::Object { public: PDBSelector(std::string name) : Object(name) {} //! Return true if the line should be processed - virtual bool get_is_selected(const std::string &pdb_line) const = 0; + virtual bool get_is_selected(const PDBRecord &record) const = 0; + virtual ~PDBSelector(); }; @@ -56,9 +117,9 @@ class NonAlternativePDBSelector : public PDBSelector { NonAlternativePDBSelector(std::string name = "NonAlternativePDBSelector%1%") : PDBSelector(name) {} - bool get_is_selected(const std::string &pdb_line) const override { - return (internal::atom_alt_loc_indicator(pdb_line) == ' ' || - internal::atom_alt_loc_indicator(pdb_line) == 'A'); + bool get_is_selected(const PDBRecord &record) const override { + std::string alt_loc = record.get_alt_loc_indicator(); + return (alt_loc == " " || alt_loc == "" || alt_loc == "A"); } IMP_OBJECT_METHODS(NonAlternativePDBSelector); }; @@ -69,9 +130,10 @@ class ATOMPDBSelector : public NonAlternativePDBSelector { ATOMPDBSelector(std::string name = "ATOMPDBSelector%1%") : NonAlternativePDBSelector(name) {} - bool get_is_selected(const std::string &pdb_line) const override { - return (NonAlternativePDBSelector::get_is_selected(pdb_line) && - internal::is_ATOM_rec(pdb_line)); + bool get_is_selected(const PDBRecord &record) const override + { + return (NonAlternativePDBSelector::get_is_selected(record) && + record.get_is_atom()); } IMP_OBJECT_METHODS(ATOMPDBSelector) }; @@ -82,9 +144,9 @@ class CAlphaPDBSelector : public NonAlternativePDBSelector { CAlphaPDBSelector(std::string name = "CAlphaPDBSelector%1%") : NonAlternativePDBSelector(name) {} - bool get_is_selected(const std::string &pdb_line) const override { - if (!NonAlternativePDBSelector::get_is_selected(pdb_line)) return false; - const std::string type = internal::atom_type(pdb_line); + bool get_is_selected(const PDBRecord &record) const override { + if (!NonAlternativePDBSelector::get_is_selected(record)) return false; + const std::string type = record.get_padded_atom_name(); return (type[1] == 'C' && type[2] == 'A' && type[3] == ' '); } IMP_OBJECT_METHODS(CAlphaPDBSelector) @@ -96,9 +158,9 @@ class CBetaPDBSelector : public NonAlternativePDBSelector { CBetaPDBSelector(std::string name = "CBetaPDBSelector%1%") : NonAlternativePDBSelector(name) {} - bool get_is_selected(const std::string &pdb_line) const override { - if (!NonAlternativePDBSelector::get_is_selected(pdb_line)) return false; - const std::string type = internal::atom_type(pdb_line); + bool get_is_selected(const PDBRecord &record) const override { + if (!NonAlternativePDBSelector::get_is_selected(record)) return false; + const std::string type = record.get_padded_atom_name(); return (type[1] == 'C' && type[2] == 'B' && type[3] == ' '); } IMP_OBJECT_METHODS(CBetaPDBSelector) @@ -118,9 +180,8 @@ class AtomTypePDBSelector : public PDBSelector { std::sort(atom_types_.begin(), atom_types_.end()); } - bool get_is_selected(const std::string &pdb_line) const override { - std::string type = internal::atom_type(pdb_line); - boost::trim(type); + bool get_is_selected(const PDBRecord &record) const override { + std::string type = record.get_trimmed_atom_name(); return std::binary_search(atom_types_.begin(), atom_types_.end(), type); } IMP_OBJECT_METHODS(AtomTypePDBSelector) @@ -140,9 +201,8 @@ class ResidueTypePDBSelector : public PDBSelector { std::sort(residue_types_.begin(), residue_types_.end()); } - bool get_is_selected(const std::string &pdb_line) const override { - std::string type = internal::atom_residue_name(pdb_line); - boost::trim(type); + bool get_is_selected(const PDBRecord &record) const override { + std::string type = record.get_residue_name(); return std::binary_search(residue_types_.begin(), residue_types_.end(), type); } @@ -155,9 +215,9 @@ class CPDBSelector : public NonAlternativePDBSelector { CPDBSelector(std::string name = "CPDBSelector%1%") : NonAlternativePDBSelector(name) {} - bool get_is_selected(const std::string &pdb_line) const override { - if (!NonAlternativePDBSelector::get_is_selected(pdb_line)) return false; - const std::string type = internal::atom_type(pdb_line); + bool get_is_selected(const PDBRecord &record) const override { + if (!NonAlternativePDBSelector::get_is_selected(record)) return false; + const std::string type = record.get_padded_atom_name(); return (type[1] == 'C' && type[2] == ' ' && type[3] == ' '); } IMP_OBJECT_METHODS(CPDBSelector) @@ -169,9 +229,9 @@ class NPDBSelector : public NonAlternativePDBSelector { NPDBSelector(std::string name = "NPDBSelector%1%") : NonAlternativePDBSelector(name) {} - bool get_is_selected(const std::string &pdb_line) const override { - if (!NonAlternativePDBSelector::get_is_selected(pdb_line)) return false; - const std::string type = internal::atom_type(pdb_line); + bool get_is_selected(const PDBRecord &record) const override { + if (!NonAlternativePDBSelector::get_is_selected(record)) return false; + const std::string type = record.get_padded_atom_name(); return (type[1] == 'N' && type[2] == ' ' && type[3] == ' '); } IMP_OBJECT_METHODS(NPDBSelector) @@ -182,8 +242,8 @@ class AllPDBSelector : public PDBSelector { public: AllPDBSelector(std::string name = "AllPDBSelector%1%") : PDBSelector(name) {} - bool get_is_selected(const std::string &pdb_line) const override { - return (true || pdb_line.empty()); + bool get_is_selected(const PDBRecord &record) const override { + return true; } IMP_OBJECT_METHODS(AllPDBSelector); }; @@ -195,12 +255,13 @@ class AllPDBSelector : public PDBSelector { */ class ChainPDBSelector : public NonAlternativePDBSelector { public: - bool get_is_selected(const std::string &pdb_line) const override { - if (!NonAlternativePDBSelector::get_is_selected(pdb_line)) { + bool get_is_selected(const PDBRecord &record) const override { + if (!NonAlternativePDBSelector::get_is_selected(record)) { return false; } + char cid = record.get_chain_id()[0]; for (int i = 0; i < (int)chains_.length(); i++) { - if (internal::atom_chain_id(pdb_line) == chains_[i]) return true; + if (cid == chains_[i]) return true; } return false; } @@ -222,28 +283,27 @@ class WaterPDBSelector : public NonAlternativePDBSelector { WaterPDBSelector(std::string name = "WaterPDBSelector%1%") : NonAlternativePDBSelector(name) {} - bool get_is_selected(const std::string &pdb_line) const override { - if (!NonAlternativePDBSelector::get_is_selected(pdb_line)) { + bool get_is_selected(const PDBRecord &record) const override { + if (!NonAlternativePDBSelector::get_is_selected(record)) { return false; } - const std::string res_name = internal::atom_residue_name(pdb_line); - return ((res_name[0] == 'H' && res_name[1] == 'O' && res_name[2] == 'H') || - (res_name[0] == 'D' && res_name[1] == 'O' && res_name[2] == 'D')); + std::string res_name = record.get_residue_name(); + return (res_name == "HOH" || res_name == "DOD"); } IMP_OBJECT_METHODS(WaterPDBSelector) }; //! Select all hydrogen ATOM and HETATM records class IMPATOMEXPORT HydrogenPDBSelector : public NonAlternativePDBSelector { - bool is_hydrogen(std::string pdb_line) const; + bool is_hydrogen(const PDBRecord &record) const; public: HydrogenPDBSelector(std::string name = "HydrogenPDBSelector%1%") : NonAlternativePDBSelector(name) {} - bool get_is_selected(const std::string &pdb_line) const override { - if (!NonAlternativePDBSelector::get_is_selected(pdb_line)) return false; - return is_hydrogen(pdb_line); + bool get_is_selected(const PDBRecord &record) const override { + if (!NonAlternativePDBSelector::get_is_selected(record)) return false; + return is_hydrogen(record); } IMP_OBJECT_METHODS(HydrogenPDBSelector) }; @@ -253,11 +313,11 @@ class NonWaterNonHydrogenPDBSelector : public NonAlternativePDBSelector { IMP::PointerMember ws_, hs_; public: - bool get_is_selected(const std::string &pdb_line) const override { - if (!NonAlternativePDBSelector::get_is_selected(pdb_line)) { + bool get_is_selected(const PDBRecord &record) const override { + if (!NonAlternativePDBSelector::get_is_selected(record)) { return false; } - return (!ws_->get_is_selected(pdb_line) && !hs_->get_is_selected(pdb_line)); + return (!ws_->get_is_selected(record) && !hs_->get_is_selected(record)); } IMP_OBJECT_METHODS(NonWaterNonHydrogenPDBSelector); NonWaterNonHydrogenPDBSelector(std::string name) @@ -275,11 +335,11 @@ class NonHydrogenPDBSelector : public NonAlternativePDBSelector { IMP::PointerMember hs_; public: - bool get_is_selected(const std::string &pdb_line) const override { - if (!NonAlternativePDBSelector::get_is_selected(pdb_line)) { + bool get_is_selected(const PDBRecord &record) const override { + if (!NonAlternativePDBSelector::get_is_selected(record)) { return false; } - return (!hs_->get_is_selected(pdb_line)); + return (!hs_->get_is_selected(record)); } IMP_OBJECT_METHODS(NonHydrogenPDBSelector); NonHydrogenPDBSelector(std::string name) @@ -295,11 +355,11 @@ class NonWaterPDBSelector : public NonAlternativePDBSelector { IMP::PointerMember ws_; public: - bool get_is_selected(const std::string &pdb_line) const override { - if (!NonAlternativePDBSelector::get_is_selected(pdb_line)) { + bool get_is_selected(const PDBRecord &record) const override { + if (!NonAlternativePDBSelector::get_is_selected(record)) { return false; } - return (!ws_->get_is_selected(pdb_line)); + return (!ws_->get_is_selected(record)); } IMP_OBJECT_METHODS(NonWaterPDBSelector); NonWaterPDBSelector(std::string name) @@ -315,10 +375,10 @@ class BackbonePDBSelector : public NonWaterNonHydrogenPDBSelector { BackbonePDBSelector(std::string name = "BackbonePDBSelector%1%") : NonWaterNonHydrogenPDBSelector(name) {} - bool get_is_selected(const std::string &pdb_line) const override { - if (!NonWaterNonHydrogenPDBSelector::get_is_selected(pdb_line)) + bool get_is_selected(const PDBRecord &record) const override { + if (!NonWaterNonHydrogenPDBSelector::get_is_selected(record)) return false; - const std::string type = internal::atom_type(pdb_line); + const std::string type = record.get_padded_atom_name(); return ((type[1] == 'N' && type[2] == ' ' && type[3] == ' ') || (type[1] == 'C' && type[2] == 'A' && type[3] == ' ') || (type[1] == 'C' && type[2] == ' ' && type[3] == ' ') || @@ -333,9 +393,9 @@ class PPDBSelector : public NonAlternativePDBSelector { PPDBSelector(std::string name = "PPDBSelector%1%") : NonAlternativePDBSelector(name) {} - bool get_is_selected(const std::string &pdb_line) const override { - if (!NonAlternativePDBSelector::get_is_selected(pdb_line)) return false; - const std::string type = internal::atom_type(pdb_line); + bool get_is_selected(const PDBRecord &record) const override { + if (!NonAlternativePDBSelector::get_is_selected(record)) return false; + const std::string type = record.get_padded_atom_name(); return (type[1] == 'P' && type[2] == ' ' && type[3] == ' '); } IMP_OBJECT_METHODS(PPDBSelector) @@ -356,8 +416,8 @@ class AndPDBSelector : public PDBSelector { const IMP::PointerMember a_, b_; public: - bool get_is_selected(const std::string &pdb_line) const override { - return a_->get_is_selected(pdb_line) && b_->get_is_selected(pdb_line); + bool get_is_selected(const PDBRecord &record) const override { + return a_->get_is_selected(record) && b_->get_is_selected(record); } IMP_OBJECT_METHODS(AndPDBSelector); AndPDBSelector(PDBSelector *a, PDBSelector *b) @@ -379,8 +439,8 @@ class OrPDBSelector : public PDBSelector { const IMP::PointerMember a_, b_; public: - bool get_is_selected(const std::string &pdb_line) const override { - return a_->get_is_selected(pdb_line) || b_->get_is_selected(pdb_line); + bool get_is_selected(const PDBRecord &record) const override { + return a_->get_is_selected(record) || b_->get_is_selected(record); } IMP_OBJECT_METHODS(OrPDBSelector); OrPDBSelector(PDBSelector *a, PDBSelector *b) @@ -403,8 +463,8 @@ class XorPDBSelector : public PDBSelector { const IMP::PointerMember a_, b_; public: - bool get_is_selected(const std::string &pdb_line) const override { - return a_->get_is_selected(pdb_line) != b_->get_is_selected(pdb_line); + bool get_is_selected(const PDBRecord &record) const override { + return a_->get_is_selected(record) != b_->get_is_selected(record); } IMP_OBJECT_METHODS(XorPDBSelector); XorPDBSelector(PDBSelector *a, PDBSelector *b) @@ -426,8 +486,8 @@ class NotPDBSelector : public PDBSelector { const IMP::PointerMember a_; public: - bool get_is_selected(const std::string &pdb_line) const override { - return !a_->get_is_selected(pdb_line); + bool get_is_selected(const PDBRecord &record) const override { + return !a_->get_is_selected(record); } IMP_OBJECT_METHODS(NotPDBSelector); NotPDBSelector(PDBSelector *a) : PDBSelector("NotPDBSelector%1%"), a_(a) {} diff --git a/modules/atom/pyext/swig.i-in b/modules/atom/pyext/swig.i-in index 4cfcc77061..d8d02d220f 100644 --- a/modules/atom/pyext/swig.i-in +++ b/modules/atom/pyext/swig.i-in @@ -5,6 +5,7 @@ namespace atom { } IMP_SWIG_GRAPH(IMP::atom, HierarchyTree, HierarchyTree, IMP::atom::Hierarchy); +IMP_SWIG_VALUE(IMP::atom, PDBRecord, PDBRecords); IMP_SWIG_BASE_OBJECT(IMP::atom, PDBSelector, PDBSelectors); IMP_SWIG_DECORATOR(IMP::atom, Angle, Angles); IMP_SWIG_DECORATOR(IMP::atom, Atom, Atoms); diff --git a/modules/atom/src/mmcif.cpp b/modules/atom/src/mmcif.cpp index 2bc90c18b5..3304c26b37 100644 --- a/modules/atom/src/mmcif.cpp +++ b/modules/atom/src/mmcif.cpp @@ -2,7 +2,7 @@ * \file mmcif.cpp * \brief Functions to read PDBs in mmCIF format * - * Copyright 2007-2022 IMP Inventors. All rights reserved. + * Copyright 2007-2023 IMP Inventors. All rights reserved. * */ @@ -36,49 +36,15 @@ class Category { c_(ihm_category_new(reader, name, callback, NULL, NULL, this, NULL)) {} }; - -class Keyword { - struct ihm_keyword *k_; -public: - Keyword(struct ihm_category *c, std::string name) - : k_(ihm_keyword_new(c, name.c_str())) {} - - const char *data() { return k_->data; } - - const char *as_str() { - if (k_->omitted || k_->unknown || !k_->in_file) { - return ""; - } else { - return k_->data; - } - } - - float as_float(float default_value=0.) { - if (k_->omitted || k_->unknown || !k_->in_file) { - return default_value; - } else { - return boost::lexical_cast(k_->data); - } - } - - int as_int(int default_value=0) { - if (k_->omitted || k_->unknown || !k_->in_file) { - return default_value; - } else { - return boost::lexical_cast(k_->data); - } - } -}; - - class AtomSiteCategory : public Category { std::string name_, filename_; Model *model_; IMP::PointerMember selector_; bool read_all_models_, honor_model_num_; - Keyword atom_name_, residue_name_, chain_, auth_chain_, element_, seq_id_, - group_, id_, occupancy_, temp_factor_, ins_code_, x_, y_, z_, - model_num_, auth_seq_id_, alt_loc_id_; + internal::CifKeyword atom_name_, residue_name_, chain_, auth_chain_, + element_, seq_id_, group_, id_, occupancy_, + temp_factor_, ins_code_, x_, y_, z_, + model_num_, auth_seq_id_, alt_loc_id_; Particle *cp_, *rp_, *root_p_; Hierarchies *hiers_; std::string curr_chain_; @@ -91,7 +57,7 @@ class AtomSiteCategory : public Category { std::string hetatm_; std::map, Particle *> chain_map_; std::map root_map_; - std::string pdb_line_; + PDBRecord pdb_record_; static void callback(struct ihm_reader *, void *data, struct ihm_error **) { ((AtomSiteCategory *)data)->handle(); @@ -126,6 +92,8 @@ class AtomSiteCategory : public Category { alt_loc_id_(c_, "label_alt_id"), cp_(nullptr), rp_(nullptr), root_p_(nullptr), hiers_(hiers) { + pdb_record_.set_keywords(group_, element_, atom_name_, alt_loc_id_, + residue_name_, auth_chain_, chain_, auth_seq_id_); curr_chain_ = ""; curr_seq_id_ = 0; curr_auth_seq_id_ = 0; @@ -191,46 +159,7 @@ class AtomSiteCategory : public Category { // Return true iff the current atom passes the PDBSelector check bool get_is_selected() { - // PDBSelectors currently take a PDB atom line string, so hack one. - // Note that this limits us to what can currently be stored in PDB, - // e.g. single character chain IDs and alternate locations - - // First blank any previous line - pdb_line_.assign(80, ' '); - // ATOM/HETATM indicator - replace(pdb_line_, internal::atom_entry_type_field_, 6, group_.as_str()); - // Atom name; need to pad appropriately (left align those with 2-character - // element names to distinguish calcium with C-alpha; otherwise pad with - // a space) - if (strlen(atom_name_.as_str()) >= 4 || strlen(element_.as_str()) == 2) { - replace(pdb_line_, internal::atom_type_field_, 4, atom_name_.as_str()); - } else { - replace(pdb_line_, internal::atom_type_field_ + 1, 3, - atom_name_.as_str()); - } - // Alternate location indicator - replace(pdb_line_, internal::atom_alt_loc_field_, 1, alt_loc_id_.as_str()); - // Residue name - replace(pdb_line_, internal::atom_res_name_field_, 3, - residue_name_.as_str()); - // Chain ID; use author-provided ID if available - if (strlen(auth_chain_.as_str()) > 0) { - replace(pdb_line_, internal::atom_chain_id_field_, 1, - auth_chain_.as_str()); - } else { - replace(pdb_line_, internal::atom_chain_id_field_, 1, chain_.as_str()); - } - // Insertion code - // Extract this from the author-provided residue number - const char *start = auth_seq_id_.as_str(); - char *endptr; - strtol(start, &endptr, 10); // ignore return value - // Insertion code is first non-digit (if any, otherwise blank) - pdb_line_[internal::atom_res_insertion_field_] = *endptr || ' '; - // Element - replace(pdb_line_, internal::atom_element_field_, 2, element_.as_str()); - - return selector_->get_is_selected(pdb_line_); + return selector_->get_is_selected(pdb_record_); } void handle() { diff --git a/modules/atom/src/pdb.cpp b/modules/atom/src/pdb.cpp index 30b256f2bc..0d6551973a 100644 --- a/modules/atom/src/pdb.cpp +++ b/modules/atom/src/pdb.cpp @@ -1,7 +1,7 @@ /** * \file PDBParser.h \brief A class for reading PDB files * - * Copyright 2007-2022 IMP Inventors. All rights reserved. + * Copyright 2007-2023 IMP Inventors. All rights reserved. * */ #include @@ -33,12 +33,130 @@ IMPATOM_BEGIN_NAMESPACE -bool HydrogenPDBSelector::is_hydrogen(std::string pdb_line) const { - if (!NonAlternativePDBSelector::get_is_selected(pdb_line)) { +void PDBRecord::set_line(const std::string &line) { + line_ = &line; + use_keywords_ = false; + use_line_ = true; +} + +void PDBRecord::set_keywords(internal::CifKeyword &group, + internal::CifKeyword &element, + internal::CifKeyword &atom_name, + internal::CifKeyword &alt_loc_id, + internal::CifKeyword &residue_name, + internal::CifKeyword &auth_chain, + internal::CifKeyword &chain, + internal::CifKeyword &auth_seq_id) { + group_ = &group; + element_ = &element; + atom_name_ = &atom_name; + alt_loc_id_ = &alt_loc_id; + residue_name_ = &residue_name; + auth_chain_ = &auth_chain; + chain_ = &chain; + auth_seq_id_ = &auth_seq_id; + use_keywords_ = true; + use_line_ = false; +} + +std::string PDBRecord::get_alt_loc_indicator() const { + IMP_INTERNAL_CHECK(use_line_ || use_keywords_, + "The PDB record contains neither PDB nor mmCIF information"); + if (use_line_) { + char alt_loc = internal::atom_alt_loc_indicator(*line_); + return std::string(1, alt_loc); + } else { + return alt_loc_id_->as_str(); + } +} + +bool PDBRecord::get_is_atom() const { + IMP_INTERNAL_CHECK(use_line_ || use_keywords_, + "The PDB record contains neither PDB nor mmCIF information"); + if (use_line_) { + return internal::is_ATOM_rec(*line_); + } else { + return strcmp(group_->as_str(), "ATOM"); + } +} + +std::string PDBRecord::get_trimmed_atom_name() const { + IMP_INTERNAL_CHECK(use_line_ || use_keywords_, + "The PDB record contains neither PDB nor mmCIF information"); + if (use_line_) { + std::string ret = internal::atom_type(*line_); + boost::trim(ret); + return ret; + } else { + return atom_name_->as_str(); + } +} + +std::string PDBRecord::get_padded_atom_name() const { + IMP_INTERNAL_CHECK(use_line_ || use_keywords_, + "The PDB record contains neither PDB nor mmCIF information"); + if (use_line_) { + return internal::atom_type(*line_); + } else { + std::string ret; + // left align atom names with 2-character element names to distinguish + // calcium from C-alpha; otherwise pad with a space + if (strlen(atom_name_->as_str()) < 4 && strlen(element_->as_str()) < 2) { + ret.append(" "); + } + ret.append(atom_name_->as_str()); + if (ret.size() < 4) { + ret.resize(4, ' '); + } + return ret; + } +} + +std::string PDBRecord::get_residue_name() const { + IMP_INTERNAL_CHECK(use_line_ || use_keywords_, + "The PDB record contains neither PDB nor mmCIF information"); + if (use_line_) { + std::string ret = internal::atom_residue_name(*line_); + boost::trim(ret); + return ret; + } else { + return residue_name_->as_str(); + } +} + +std::string PDBRecord::get_chain_id() const { + IMP_INTERNAL_CHECK(use_line_ || use_keywords_, + "The PDB record contains neither PDB nor mmCIF information"); + if (use_line_) { + char cid = internal::atom_chain_id(*line_); + return std::string(1, cid); + } else { + // Use author-provided ID if available + if (strlen(auth_chain_->as_str()) > 0) { + return auth_chain_->as_str(); + } else { + return chain_->as_str(); + } + } +} + +std::string PDBRecord::get_element() const { + IMP_INTERNAL_CHECK(use_line_ || use_keywords_, + "The PDB record contains neither PDB nor mmCIF information"); + if (use_line_) { + std::string ret = internal::atom_element(*line_); + boost::trim(ret); + return ret; + } else { + return element_->as_str(); + } +} + +bool HydrogenPDBSelector::is_hydrogen(const PDBRecord &record) const { + if (!NonAlternativePDBSelector::get_is_selected(record)) { return false; } - std::string elem = internal::atom_element(pdb_line); - boost::trim(elem); + std::string elem = record.get_element(); // determine if the line is hydrogen atom as follows: // 1. if the record has element field (columns 76-77), // check that it is indeed H. Note that it may be missing @@ -54,7 +172,7 @@ bool HydrogenPDBSelector::is_hydrogen(std::string pdb_line) const { // 3. if no hydrogen is found in the element record, // try atom type field. // some NMR structures have 'D' for labeled hydrogens - std::string atom_name = internal::atom_type(pdb_line); + std::string atom_name = record.get_padded_atom_name(); return ( // " HXX" or " DXX" or "1HXX" ... ((atom_name[0] == ' ' || isdigit(atom_name[0])) && (atom_name[1] == 'H' || atom_name[1] == 'D')) || @@ -171,6 +289,9 @@ Hierarchies read_pdb(std::istream& in, std::string name, std::string filename, bool has_atom = false; std::string line; + PDBRecord pdb_record; + pdb_record.set_line(line); + while (!in.eof()) { getline(in, line); if (in.eof()) break; @@ -196,7 +317,7 @@ Hierarchies read_pdb(std::istream& in, std::string name, std::string filename, // the // Particle to the Model if (internal::is_ATOM_rec(line) || internal::is_HETATM_rec(line)) { - if (!selector->get_is_selected(line)) { + if (!selector->get_is_selected(pdb_record)) { IMP_LOG_VERBOSE("Selector rejected line " << line << std::endl); continue; } diff --git a/modules/atom/test/test_pdb.py b/modules/atom/test/test_pdb.py index af80e46a97..ed1091e879 100644 --- a/modules/atom/test/test_pdb.py +++ b/modules/atom/test/test_pdb.py @@ -188,8 +188,8 @@ class my_selector(IMP.atom.PDBSelector): def __init__(self): IMP.atom.PDBSelector.__init__(self, "my selector") - def get_is_selected(self, ln): - return ln.startswith("ATOM") + def get_is_selected(self, record): + return record.get_is_atom() m = IMP.Model() with self.open_input_file("hydrogen.pdb") as fh: