Skip to content

Commit

Permalink
new subcommand: set
Browse files Browse the repository at this point in the history
work in progress

meant as partial replacement for PDBSET
  • Loading branch information
wojdyr committed Sep 2, 2024
1 parent 389226b commit c109960
Show file tree
Hide file tree
Showing 12 changed files with 264 additions and 33 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ add_executable(gemmi_prog
prog/main.cpp prog/map.cpp prog/map2sf.cpp
prog/mapcoef.cpp prog/mask.cpp prog/merge.cpp
prog/mondiff.cpp prog/monlib_opt.cpp prog/mtz.cpp prog/mtz2cif.cpp
prog/reindex.cpp prog/residues.cpp prog/rmsz.cpp
prog/reindex.cpp prog/residues.cpp prog/rmsz.cpp prog/set.cpp
prog/sf2map.cpp prog/sfcalc.cpp prog/sg.cpp prog/tags.cpp
prog/validate.cpp prog/validate_mon.cpp prog/wcn.cpp
prog/xds2mtz.cpp
Expand Down
3 changes: 2 additions & 1 deletion docs/gemmi-help.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ Usage: gemmi [--version] [--help] <command> [<args>]
Commands:
align sequence alignment (global, pairwise, affine gap penalty)
blobs list unmodelled electron density blobs
cif2mtz convert structure factor mmCIF to MTZ
cif2json translate (mm)CIF to (mm)JSON
cif2mtz convert structure factor mmCIF to MTZ
cifdiff compare tags in two (mm)CIF files
contact searches for contacts (neighbouring atoms)
contents info about content of a coordinate file (pdb, mmCIF, ...)
Expand All @@ -32,6 +32,7 @@ Commands:
reindex reindex MTZ file
residues list residues from a coordinate file
rmsz validate geometry using monomer library
set modify coordinate files (think pdbset)
sf2map transform map coefficients (from MTZ or mmCIF) to map
sfcalc calculate structure factors from a model
sg info about space groups
Expand Down
13 changes: 13 additions & 0 deletions docs/set-help.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
$ gemmi set -h
Usage:
gemmi set [options] INPUT_FILE OUTPUT_FILE

Modify atom attributes in a coordinate file.

Options:
-h, --help Print usage and exit.
-V, --version Print version and exit.
-v, --verbose Verbose output.
-B MIN[:MAX] Set isotropic B-factors to a single value or clamp it to
MIN:MAX
-O MIN[:MAX] Set occupancy to a single value or clamp it to MIN:MAX
13 changes: 13 additions & 0 deletions docs/utils.rst
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,19 @@ on UniProt mapping, the non-mapped sequence numbers are increased by
an offset of 5000 (this default value is inspired by PDBrenum)
or bigger if necessary (when UniProt positions are that large).

set
===

Modifies atom positions, isotropic B-factors and/or occupancies
in a PDB or mmCIF file. It serves as a partial replacement for CCP4 PDBSET.

Unlike most other gemmi tools, it doesn't parse the entire input file.
Instead, it reads only what is necessary to locate the relevant numbers
and replaces them, leaving the rest of the file unchanged.

.. literalinclude:: set-help.txt
:language: console

tags
====

Expand Down
10 changes: 10 additions & 0 deletions include/gemmi/pdb.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@ GEMMI_DLL std::vector<Op> read_remark_290(const std::vector<std::string>& raw_re

namespace impl {

// Compare the first 4 letters of s, ignoring case, with uppercase record.
// Both args must have at least 3+1 chars. ' ' and NUL are equivalent in s.
inline bool is_record_type(const char* s, const char* record) {
return ialpha4_id(s) == ialpha4_id(record);
}
// for record "TER": "TER ", TER\n, TER\r, TER\t match, TERE, TER1 don't
inline bool is_record_type3(const char* s, const char* record) {
return (ialpha4_id(s) & ~0xf) == ialpha4_id(record);
}

struct GEMMI_DLL PdbReader {
PdbReader(const PdbReadOptions& options_) : options(options_) {
if (options.max_line_length <= 0 || options.max_line_length > 120)
Expand Down
15 changes: 4 additions & 11 deletions prog/convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,18 +319,11 @@ void convert(gemmi::Structure& st,
standardize_crystal_frame(st);

if (options[Biso]) {
const char* start = options[Biso].arg;
char* endptr = nullptr;
float value1 = std::strtof(start, &endptr);
float value2 = value1;
if (endptr != start && *endptr == ':') {
start = endptr + 1;
value2 = std::strtof(start, &endptr);
}
if (endptr != start && *endptr == '\0')
assign_b_iso(st, value1, value2);
else
float value1, value2;
bool ok = parse_number_or_range(options[Biso].arg, &value1, &value2);
if (!ok)
gemmi::fail("argument for -B should be a number or number:number");
assign_b_iso(st, value1, value2);
}

for (const option::Option* opt = options[Anisou]; opt; opt = opt->next()) {
Expand Down
12 changes: 7 additions & 5 deletions prog/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@

void print_version(const char* program_name, bool verbose); // in options.h

int align_main(int argc, char** argv);
int blobs_main(int argc, char** argv);
int cif2mtz_main(int argc, char** argv);
int cif2json_main(int argc, char** argv);
int cif2mtz_main(int argc, char** argv);
int cifdiff_main(int argc, char** argv);
int contact_main(int argc, char** argv);
int contents_main(int argc, char** argv);
Expand All @@ -19,17 +20,17 @@ int fprime_main(int argc, char** argv);
int grep_main(int argc, char** argv);
int h_main(int argc, char** argv);
int json2cif_main(int argc, char** argv);
int map_main(int argc, char** argv);
int map2sf_main(int argc, char** argv);
int map_main(int argc, char** argv);
int mask_main(int argc, char** argv);
int merge_main(int argc, char** argv);
int mondiff_main(int argc, char** argv);
int mtz_main(int argc, char** argv);
int mtz2cif_main(int argc, char** argv);
int mtz_main(int argc, char** argv);
int reindex_main(int argc, char** argv);
int residues_main(int argc, char** argv);
int rmsz_main(int argc, char** argv);
int align_main(int argc, char** argv);
int set_main(int argc, char** argv);
int sf2map_main(int argc, char** argv);
int sfcalc_main(int argc, char** argv);
int sg_main(int argc, char** argv);
Expand All @@ -52,8 +53,8 @@ struct SubCmd {
SubCmd subcommands[] = {
CMD(align, "sequence alignment (global, pairwise, affine gap penalty)"),
CMD(blobs, "list unmodelled electron density blobs"),
CMD(cif2mtz, "convert structure factor mmCIF to MTZ"),
CMD(cif2json, "translate (mm)CIF to (mm)JSON"),
CMD(cif2mtz, "convert structure factor mmCIF to MTZ"),
CMD(cifdiff, "compare tags in two (mm)CIF files"),
CMD(contact, "searches for contacts (neighbouring atoms)"),
CMD(contents, "info about content of a coordinate file (pdb, mmCIF, ...)"),
Expand All @@ -74,6 +75,7 @@ SubCmd subcommands[] = {
CMD(reindex, "reindex MTZ file"),
CMD(residues, "list residues from a coordinate file"),
CMD(rmsz, "validate geometry using monomer library"),
CMD(set, "modify coordinate files (think pdbset)"),
CMD(sf2map, "transform map coefficients (from MTZ or mmCIF) to map"),
CMD(sfcalc, "calculate structure factors from a model"),
CMD(sg, "info about space groups"),
Expand Down
19 changes: 19 additions & 0 deletions prog/options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,25 @@ std::vector<double> parse_blank_separated_numbers(const char* arg) {
return results;
}

template<typename T, typename Func>
bool parse_number_or_range_(const Func& strto, const char* start, T* value1, T* value2) {
char* endptr = nullptr;
*value1 = strto(start, &endptr);
*value2 = *value1;
if (endptr != start && *endptr == ':') {
start = endptr + 1;
*value2 = strto(start, &endptr);
}
return endptr != start && *endptr == '\0';
}

bool parse_number_or_range(const char* start, float* value1, float* value2) {
return parse_number_or_range_(&std::strtof, start, value1, value2);
}
bool parse_number_or_range(const char* start, double* value1, double* value2) {
return parse_number_or_range_(&std::strtod, start, value1, value2);
}

option::ArgStatus Arg::Required(const option::Option& option, bool msg) {
if (option.arg != nullptr)
return option::ARG_OK;
Expand Down
2 changes: 2 additions & 0 deletions prog/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ extern const option::Descriptor CommonUsage[];

std::vector<int> parse_comma_separated_ints(const char* arg);
std::vector<double> parse_blank_separated_numbers(const char* arg);
bool parse_number_or_range(const char* start, float* value1, float* value2);
bool parse_number_or_range(const char* start, double* value1, double* value2);

struct Arg: public option::Arg {
static option::ArgStatus Required(const option::Option& option, bool msg);
Expand Down
187 changes: 187 additions & 0 deletions prog/set.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
// Copyright Global Phasing Ltd.

#include <stdio.h> // for printf, fprintf, stderr
#include <cstring> // for memchr, memcpy
#include <algorithm> // for min, max
#include <exception>
#include "gemmi/atof.hpp" // for fast_from_chars
//#include "gemmi/calculate.hpp" // for calculate_center_of_mass
#include "gemmi/pdb.hpp" // for impl::is_record_type
#include "gemmi/read_cif.hpp" // for read_into_buffer_gz
#include <gemmi/mmread.hpp> // for coor_format_from_content
#include <gemmi/sprintf.hpp> // for snprintf_z

#define GEMMI_PROG set
#include "options.h"

namespace {

enum OptionIndex { Bfactor=4, Occupancy, Noise, };

const option::Descriptor Usage[] = {
{ NoOp, 0, "", "", Arg::None,
"Usage:"
"\n " EXE_NAME " [options] INPUT_FILE OUTPUT_FILE"
"\n\nModify atom attributes in a coordinate file."
"\n\nOptions:" },
CommonUsage[Help],
CommonUsage[Version],
CommonUsage[Verbose],
{ Bfactor, 0, "B", "bfactor", Arg::Required,
" -B MIN[:MAX] \tSet isotropic B-factors to a single value or clamp it to MIN:MAX" },
{ Occupancy, 0, "O", "occ", Arg::Required,
" -O MIN[:MAX] \tSet occupancy to a single value or clamp it to MIN:MAX" },
{ 0, 0, 0, 0, 0, 0 }
};

namespace pegtl = tao::pegtl;
namespace rules = gemmi::cif::rules;

struct Context {
int target_column = -1;
int current_column = -1;
int loop_width = -1;
std::string target_tag;
std::size_t copy_pos = 0;
const gemmi::CharArray* arr;
std::vector<char> output;

void append(const char* a, const char* b) { output.insert(output.end(), a, b); }
};

template<typename Rule> struct Search : pegtl::nothing<Rule> {};

template<> struct Search<rules::str_loop> {
template<typename Input> static void apply(const Input&, Context& ctx) {
ctx.current_column = 0;
ctx.target_column = -1;
}
};
template<> struct Search<rules::loop_tag> {
template<typename Input> static void apply(const Input& in, Context& ctx) {
if (in.string() == ctx.target_tag)
ctx.target_column = ctx.current_column;
++ctx.current_column;
}
};
template<> struct Search<rules::loop_value> {
template<typename Input> static void apply(const Input& in, Context& ctx) {
if (ctx.target_column >= 0) {
if (ctx.loop_width < 0) {
ctx.loop_width = ctx.current_column;
ctx.current_column = 0;
} else {
++ctx.current_column;
if (ctx.current_column == ctx.loop_width)
ctx.current_column = 0;
}
if (ctx.current_column == ctx.target_column) {
auto pos = in.iterator().byte;
ctx.append(ctx.arr->data() + ctx.copy_pos, ctx.arr->data() + pos);
std::string s = "{" + in.string() + "}"; // TODO
ctx.append(s.data(), s.data() + s.size());
ctx.copy_pos = pos + in.size();
}
}
}
};

struct EditBuffer {
gemmi::CoorFormat format = gemmi::CoorFormat::Unknown;
std::string path; // for error messages
gemmi::CharArray arr;

template<typename Func>
void modify_values(std::pair<int,int> pdb_columns, const std::string& tag, const Func& func) {
if (format == gemmi::CoorFormat::Pdb)
modify_values_in_pdb(pdb_columns, func);
else
modify_values_in_mmcif(tag, func);
}

template<typename Func>
void modify_values_in_pdb(std::pair<int,int> cols, const Func& func) {
char* line = arr.data();
const char* buf_end = arr.data() + arr.size();
for (;;) {
char* eol = (char*) std::memchr(line, '\n', size_t(buf_end - line));
auto len = (eol ? eol : buf_end) - line;
if (len >= cols.second &&
(gemmi::impl::is_record_type(line, "ATOM") ||
gemmi::impl::is_record_type(line, "HETATM"))) {
double d = 0.;
char* start = line + cols.first - 1;
auto result = gemmi::fast_from_chars(start, line + cols.second, d);
if (result.ec != std::errc())
gemmi::fail("failed to parse a number in line:\n", line);
d = func(d);
char tmp[7] = {};
gemmi::snprintf_z(tmp, 7, "%6.2f", d);
std::memcpy(start, tmp, 6);
}
if (!eol)
break;
line = eol + 1;
}
}

template<typename Func>
void modify_values_in_mmcif(const std::string& tag, const Func& func) {
(void) func; // TODO
Context ctx;
ctx.target_tag = tag;
ctx.arr = &arr;
ctx.output.reserve(arr.size() * 11 / 10);
pegtl::memory_input<> in(arr.data(), arr.size(), path);
pegtl::parse<rules::file, Search, gemmi::cif::Errors>(in, ctx);
ctx.append(arr.data() + ctx.copy_pos, arr.data() + arr.size());
arr.resize(ctx.output.size());
std::memcpy(arr.data(), ctx.output.data(), ctx.output.size());
ctx.output.clear();
}
};

} // anonymous namespace

int GEMMI_MAIN(int argc, char **argv) {
OptParser p(EXE_NAME);
p.simple_parse(argc, argv, Usage);
p.require_positional_args(2);
int verbosity = p.options[Verbose].count();
EditBuffer eb;
eb.path = p.coordinate_input_file(0);
const char* output = p.nonOption(1);
try {
eb.arr = gemmi::read_into_buffer_gz(eb.path);
eb.format = gemmi::coor_format_from_content(eb.arr.data(), eb.arr.data() + eb.arr.size());
if (p.options[Bfactor]) {
double b_min, b_max;
bool ok = parse_number_or_range(p.options[Bfactor].arg, &b_min, &b_max);
if (!ok)
gemmi::fail("argument for -B should be a number or number:number");
eb.modify_values({61, 66}, "_atom_site.B_iso_or_equiv", [b_min, b_max](double x) {
return std::min(std::max(x, b_min), b_max);
});
}
if (p.options[Occupancy]) {
double occ_min, occ_max;
bool ok = parse_number_or_range(p.options[Occupancy].arg, &occ_min, &occ_max);
if (!ok)
gemmi::fail("argument for -O should be a number or number:number");
eb.modify_values({55, 60}, "_atom_site.occupancy", [occ_min, occ_max](double x) {
return std::min(std::max(x, occ_min), occ_max);
});
}
gemmi::fileptr_t f = gemmi::file_open(output, "wb");
if (std::fwrite(eb.arr.data(), eb.arr.size(), 1, f.get()) != 1) {
perror("Writing file failed");
return 1;
}
} catch (std::exception& e) {
fprintf(stderr, "ERROR: %s\n", e.what());
return 1;
}
if (verbosity > 0)
printf("Done.\n");
return 0;
}
10 changes: 0 additions & 10 deletions src/pdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,16 +52,6 @@ template<int N> int read_base36(const char* p) {
return std::strtol(zstr, nullptr, 36);
}

// Compare the first 4 letters of s, ignoring case, with uppercase record.
// Both args must have at least 3+1 chars. ' ' and NUL are equivalent in s.
bool is_record_type(const char* s, const char* record) {
return ialpha4_id(s) == ialpha4_id(record);
}
// for record "TER": "TER ", TER\n, TER\r, TER\t match, TERE, TER1 don't
bool is_record_type3(const char* s, const char* record) {
return (ialpha4_id(s) & ~0xf) == ialpha4_id(record);
}

// The standard charge format is 2+, but some files have +2.
signed char read_charge(char digit, char sign) {
if (sign == ' ' && digit == ' ') // by far the most common case
Expand Down
Loading

0 comments on commit c109960

Please sign in to comment.