Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BinaryCIF Import Implementation #353

Merged
merged 18 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
430 changes: 397 additions & 33 deletions layer2/CifFile.cpp

Large diffs are not rendered by default.

206 changes: 163 additions & 43 deletions layer2/CifFile.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,17 @@
#include <map>
#include <memory>
#include <vector>
#include <string>
#include <variant>

// for pymol::default_free
#include "MemoryDebug.h"

/**
 * Combines several callables (typically lambdas) into one callable type.
 * The call operator of every base is pulled in, so ordinary overload
 * resolution selects the matching one, e.g. when used with std::visit.
 */
template <typename... Callables>
struct overloaded : Callables... {
  using Callables::operator()...;
};

/// Deduction guide so that `overloaded{f, g}` deduces its template arguments.
template <typename... Callables>
overloaded(Callables...) -> overloaded<Callables...>;

namespace pymol {
namespace _cif_detail {

Expand Down Expand Up @@ -44,6 +51,11 @@ template <typename T> T raw_to_typed(const char*);
class cif_data;
class cif_loop;
class cif_array;
// Forward declarations of the two data block storage types.
namespace cif_detail {
struct cif_str_data;
struct bcif_data;
} // namespace cif_detail

/// Storage of one data block: either `cif_str_data` or `bcif_data`.
using CIFData = std::variant<cif_detail::cif_str_data, cif_detail::bcif_data>;

/**
* Class for reading CIF files.
Expand All @@ -57,7 +69,7 @@ class cif_array;
*
* Iterate over data blocks:
* @verbatim
for (auto& block : cf.datablocks()) {
for (auto& [code, block] : cf.datablocks()) {
// data_<code>
const char* block_code = block.code();

Expand All @@ -81,7 +93,7 @@ class cif_array;
*/
class cif_file {
std::vector<char*> m_tokens;
std::vector<cif_data> m_datablocks;
std::map<std::string, cif_data> m_datablocks;
std::unique_ptr<char, pymol::default_free> m_contents;

/**
Expand All @@ -98,6 +110,14 @@ class cif_file {
/// Parse CIF string
bool parse_string(const char*);

/**
* Parse BinaryCIF blob
* @param bytes BinaryCIF blob
* @param size Blob size
* @return NOTE(review): presumably true on success and false on failure;
* the definition is not visible from this header - confirm in CifFile.cpp
* @post datablocks() is valid
*/
bool parse_bcif(const char* bytes, std::size_t size);

protected:
/// Report a parsing error
virtual void error(const char*);
Expand All @@ -114,54 +134,112 @@ class cif_file {
cif_file(const char* filename, const char* contents = nullptr);

/// Data blocks
const std::vector<cif_data>& datablocks() const { return m_datablocks; }
const std::map<std::string, cif_data>& datablocks() const { return m_datablocks; }
};

/**
* View on a CIF data array. The viewed data is owned by the cif_file
*/
class cif_array {
friend class cif_file;

private:
enum { NOT_IN_LOOP = -1 };
using CifArrayElement = std::variant<std::int8_t, std::int16_t, std::int32_t,
std::uint8_t, std::uint16_t, std::uint32_t, float, double, std::string>;

// column index, -1 if not in loop
short col;
namespace cif_detail {
struct cif_str_array {
enum { NOT_IN_LOOP = -1 };

// pointer to either loop or single value
union {
const cif_loop * loop;
const char * value;
} pointer;
// column index, -1 if not in loop
short col;

// Raw data value or nullptr for unknown/inapplicable and `pos >= size()`
const char* get_value_raw(unsigned pos = 0) const;
// pointer to either loop or single value
union {
const cif_loop * loop;
const char * value;
} pointer;

// point this array to a loop (only for parsing)
void set_loop(const cif_loop * loop, short col_) {
col = col_;
pointer.loop = loop;
};
// Raw data value or NULL for unknown/inapplicable and `pos >= size()`
const char* get_value_raw(unsigned pos = 0) const;

// point this array to a loop (only for parsing)
void set_loop(const cif_loop * loop, short col_) {
col = col_;
pointer.loop = loop;
};

// point this array to a single value (only for parsing)
void set_value(const char * value) {
col = NOT_IN_LOOP;
pointer.value = value;
// point this array to a single value (only for parsing)
void set_value(const char * value) {
col = NOT_IN_LOOP;
pointer.value = value;
};
};
/// Array alternative that owns its elements as a vector of CifArrayElement.
struct bcif_array {
  std::vector<CifArrayElement> m_arr = {};
};

/**
* Returns a typed value from a CIF data element.
* If the element is missing or inapplicable, return `d`.
* @param var CIF data element
* @param d default value
* @return typed value
*/
template <typename T> T var_to_typed(const CifArrayElement& var, const T& d)
{
if constexpr (std::is_same_v<T, const char*>) {
auto& str = std::get<std::string>(var);
return !str.empty() ? str.c_str() : d;
} else {
if (auto ptr = std::get_if<std::string>(&var); ptr && ptr->empty()) {
return d;
}
if constexpr (!std::is_same_v<T, std::string>) {
return std::visit(overloaded{[](const std::string& s) -> T {
return _cif_detail::raw_to_typed<T>(
s.c_str());
},
[](const auto& v) -> T { return v; }},
var);
}
}
return d;
}
}

/**
* View on a CIF data array. The viewed data is owned by the cif_file
*/
class cif_array {
friend class cif_file;

private:
mutable std::string m_internal_str_cache;
std::variant<cif_detail::cif_str_array, cif_detail::bcif_array> m_array;

public:
// constructor
cif_array() = default;

// constructor (only needed for EMPTY_ARRAY)
cif_array(std::nullptr_t) { set_value(nullptr); }
cif_array(std::nullptr_t) {
if (auto arr = std::get_if<cif_detail::cif_str_array>(&m_array)) {
arr->set_value(nullptr);
} else if (auto arr = std::get_if<cif_detail::bcif_array>(&m_array)) {
arr->m_arr.clear();
}
}

cif_array(std::vector<CifArrayElement>&& arr) {
m_array = cif_detail::bcif_array{std::move(arr)};
}

/// Number of elements in this array (= number of rows in loop)
unsigned size() const;

/// True if value in ['.', '?']
bool is_missing(unsigned pos = 0) const { return !get_value_raw(pos); }
bool is_missing(unsigned pos = 0) const {
if (auto arr = std::get_if<cif_detail::cif_str_array>(&m_array)) {
return !arr->get_value_raw(pos);
} else {
return false;
}
}

/// True if all values in ['.', '?']
bool is_missing_all() const;
Expand All @@ -172,8 +250,16 @@ class cif_array {
* @param d default value for unknown/inapplicable elements
*/
template <typename T> T as(unsigned pos = 0, T d = T()) const {
const char* s = get_value_raw(pos);
return s ? _cif_detail::raw_to_typed<T>(s) : d;
if (auto arr = std::get_if<cif_detail::cif_str_array>(&m_array)) {
const char* s = arr->get_value_raw(pos);
return s ? _cif_detail::raw_to_typed<T>(s) : d;
} else if (auto arr = std::get_if<cif_detail::bcif_array>(&m_array)) {
if (pos >= arr->m_arr.size())
return d;
auto& var = arr->m_arr[pos];
return cif_detail::var_to_typed<T>(var, d);
}
return d;
}

/**
Expand All @@ -184,7 +270,25 @@ class cif_array {
* @param d default value for unknown/inapplicable elements
*/
const char* as_s(unsigned pos = 0, const char* d = "") const {
return as(pos, d);
if (std::get_if<cif_detail::cif_str_array>(&m_array)) {
return as(pos, d);
} else if (auto arr = std::get_if<cif_detail::bcif_array>(&m_array)) {
if (pos >= arr->m_arr.size())
return d;
if (auto str_ptr = std::get_if<std::string>(&arr->m_arr[pos])) {
return str_ptr->c_str();
}
m_internal_str_cache = std::visit([](auto&& arg) -> std::string {
if constexpr (std::is_same_v<std::decay_t<decltype(arg)>,
std::string>) {
return arg;
} else {
return std::to_string(arg);
}
}, arr->m_arr[pos]);
return m_internal_str_cache.c_str();
}
return d;
}

/// Alias for as<int>()
Expand All @@ -210,17 +314,33 @@ class cif_array {
/**
* CIF data block. The viewed data is owned by the cif_file.
*/
class cif_data {
friend class cif_file;

// data_<code>
const char* m_code = nullptr;
// Storage types that can be held by the CIFData variant.
namespace cif_detail {
// Data block storage whose block code is a `const char*` and whose
// dictionaries are keyed by zstring_view.
struct cif_str_data {
// data_<code>
const char* m_code = nullptr;

// Data arrays keyed by zstring_view.
std::map<_cif_detail::zstring_view, cif_array> m_dict;
// NOTE(review): second dictionary keyed by std::string; its purpose is not
// visible from this header - confirm against CifFile.cpp
std::map<std::string, cif_array> m_dict_str;
// Nested cif_str_data blocks (save frames), keyed by zstring_view.
std::map<_cif_detail::zstring_view, cif_detail::cif_str_data> m_saveframes;

// only needed for freeing
std::vector<std::unique_ptr<cif_loop>> m_loops;
};

// Nested string-keyed maps ending in vectors of CifArrayElement.
// NOTE(review): judging by the alias names the nesting is
// data block -> category -> column -> values; confirm against the parser.
using ColumnMap = std::map<std::string, std::vector<CifArrayElement>>;
using CategoryMap = std::map<std::string, ColumnMap>;
using DataBlockMap = std::map<std::string, CategoryMap>;
// Data block storage that owns its block code as a std::string.
struct bcif_data {
std::string m_code;
// Two-level string-keyed lookup of arrays.
// NOTE(review): presumably category name -> column name -> array; confirm.
std::map<std::string, std::map<std::string, cif_array>> m_dict;
};
}

std::map<_cif_detail::zstring_view, cif_array> m_dict;
std::map<_cif_detail::zstring_view, cif_data> m_saveframes;
class cif_data {
friend class cif_file;

// only needed for freeing
std::vector<std::unique_ptr<cif_loop>> m_loops;
CIFData m_data;

// generic default value
static const cif_array* empty_array();
Expand All @@ -234,7 +354,7 @@ class cif_data {
cif_data& operator=(cif_data&&) = default;

/// Block code (never nullptr)
const char* code() const { return m_code ? m_code : ""; }
const char* code() const;

// Get a pointer to array or nullptr if not found
const cif_array* get_arr(const char* key) const;
Expand All @@ -253,7 +373,7 @@ class cif_data {
}

/// Get a pointer to a save frame or nullptr if not found
const cif_data* get_saveframe(const char* code) const;
const cif_detail::cif_str_data* get_saveframe(const char* code) const;
};

} // namespace pymol
Expand Down
47 changes: 44 additions & 3 deletions layer2/CifMoleculeReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ static bond_dict_t * get_global_components_bond_dict(PyMOLGlobals * G) {
return nullptr;
}

for (const auto& datablock : cif.datablocks()) {
for (const auto& [code, datablock] : cif.datablocks()) {
read_chem_comp_bond_dict(&datablock, bond_dict);
}
}
Expand Down Expand Up @@ -2264,7 +2264,7 @@ pymol::Result<ObjectMolecule*> ObjectMoleculeReadCifStr(PyMOLGlobals * G, Object
return pymol::make_error("Parsing CIF file failed: ", cif->m_error_msg);
}

for (const auto& datablock : cif->datablocks()) {
for (const auto& [code, datablock] : cif->datablocks()) {
ObjectMolecule * obj = ObjectMoleculeReadCifData(G, &datablock, discrete, quiet);

if (!obj) {
Expand Down Expand Up @@ -2330,7 +2330,7 @@ const bond_dict_t::mapped_type * bond_dict_t::get(PyMOLGlobals * G, const char *
return nullptr;
}

for (auto& item : cif.datablocks())
for (auto& [code, item] : cif.datablocks())
read_chem_comp_bond_dict(&item, *this);
}
}
Expand All @@ -2352,4 +2352,45 @@ const bond_dict_t::mapped_type * bond_dict_t::get(PyMOLGlobals * G, const char *
return nullptr;
}


///////////////////////////////////////

/**
 * Read a molecule from an in-memory BinaryCIF blob.
 *
 * @param G PyMOL globals, used for feedback output
 * @param I must be nullptr; loading into an existing object is rejected
 * @param bytes BinaryCIF blob
 * @param size Blob size
 * @param frame not used by this function
 * @param discrete forwarded to ObjectMoleculeReadCifData
 * @param quiet forwarded to ObjectMoleculeReadCifData
 * @param multiplex values > 0 are rejected with an error
 * @param zoom not used by this function
 * @return The object read from the first data block that yields one, if the
 * file has exactly one data block or `multiplex == 0`. An error if `I` is
 * set, `multiplex > 0`, or parsing fails. Otherwise nullptr (also in builds
 * without msgpack-c support).
 */
pymol::Result<ObjectMolecule*> ObjectMoleculeReadBCif(PyMOLGlobals* G,
    ObjectMolecule* I, const char* bytes, std::size_t size, int frame,
    int discrete, int quiet, int multiplex, int zoom)
{
#ifdef _PYMOL_NO_MSGPACKC
  PRINTFB(G, FB_ObjectMolecule, FB_Errors)
    " Error: This build has no BinaryCIF support.\n"
    " Please install/enable msgpack-c.\n"
  ENDFB(G);
  return nullptr;
#endif

  if (I) {
    return pymol::Error("loading BCIF into existing object not supported, "
                        "please use 'create' to append to an existing object.");
  }

  if (multiplex > 0) {
    return pymol::Error("loading BCIF with multiplex=1 not supported, please "
                        "use 'split_states' after loading the object.");
  }

  auto cif = std::make_unique<pymol::cif_file>();

  // Do not treat a blob that failed to parse like an empty file.
  if (!cif->parse_bcif(bytes, size)) {
    return pymol::Error("Parsing BCIF file failed.");
  }

  for (const auto& [code, datablock] : cif->datablocks()) {
    auto obj = ObjectMoleculeReadCifData(G, &datablock, discrete, quiet);
    if (!obj) {
      PRINTFB(G, FB_ObjectMolecule, FB_Warnings)
        " mmCIF-Warning: no coordinates found in data_%s\n", datablock.code() ENDFB(G);
      continue;
    }

    if (cif->datablocks().size() == 1 || multiplex == 0)
      return obj;

    // NOTE(review): reached when the file has several data blocks and
    // multiplex < 0. `obj` is then neither returned nor released here, which
    // looks like a leak - confirm the intended handling (release the object,
    // or manage it as a separate object).
  }

  return nullptr;
}

// vi:sw=2:ts=2:expandtab
3 changes: 3 additions & 0 deletions layer2/ObjectMolecule.h
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,9 @@ ObjectMolecule *ObjectMoleculeReadMmtfStr(PyMOLGlobals * G, ObjectMolecule * I,
const char *st, int st_len, int frame, int discrete, int quiet, int multiplex, int zoom);
pymol::Result<ObjectMolecule*> ObjectMoleculeReadCifStr(PyMOLGlobals * G, ObjectMolecule * I,
const char *st, int frame, int discrete, int quiet, int multiplex, int zoom);
/**
 * Read a molecule from an in-memory BinaryCIF blob.
 * @param I must be nullptr; a non-null object is rejected with an error
 * @param bytes BinaryCIF blob
 * @param size Blob size
 * @param multiplex values > 0 are rejected with an error
 * @note `frame` and `zoom` are not used by the definition.
 * @return the loaded object, nullptr if none was loaded, or an error
 */
pymol::Result<ObjectMolecule*> ObjectMoleculeReadBCif(PyMOLGlobals* G,
ObjectMolecule* I, const char* bytes, std::size_t size, int frame,
int discrete, int quiet, int multiplex, int zoom);

std::unique_ptr<int[]> LoadTrajSeleHelper(
const ObjectMolecule* obj, CoordSet* cs, const char* selection);
Expand Down
Loading