Skip to content

Commit

Permalink
work in progress: gemmi-ecalc
Browse files Browse the repository at this point in the history
  • Loading branch information
wojdyr committed Sep 5, 2023
1 parent 830741c commit 8e24ce1
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 30 deletions.
23 changes: 23 additions & 0 deletions include/gemmi/binner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,20 @@ struct Binner {
return setup_from_1_d2(nbins, method, std::move(inv_d2), nullptr);
}

/// the same as setup(), but returns mid values of 1/d^2.
template<typename DataProxy>
std::vector<double> setup_mid(int nbins, Method method, const DataProxy& proxy,
const UnitCell* cell_=nullptr) {
setup(2 * nbins, method, proxy, cell_);
std::vector<double> mids(nbins);
for (int i = 0; i < nbins; ++i) {
mids[i] = limits[2*i];
limits[i] = limits[2*i+1];
}
limits.resize(nbins);
return mids;
}

void ensure_limits_are_set() const {
if (limits.empty())
fail("Binner not set up");
Expand Down Expand Up @@ -136,6 +150,15 @@ struct Binner {
return nums;
}

std::vector<int> get_bins_from_1_d2(const std::vector<double>& inv_d2) const {
ensure_limits_are_set();
int hint = 0;
std::vector<int> nums(inv_d2.size());
for (size_t i = 0; i < inv_d2.size(); ++i)
nums[i] = get_bin_from_1_d2_hinted(inv_d2[i], hint);
return nums;
}

double dmin_of_bin(int n) const {
return 1. / std::sqrt(limits.at(n));
}
Expand Down
7 changes: 7 additions & 0 deletions include/gemmi/stats.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@

namespace gemmi {

struct Mean {
int n = 0;
double sum = 0.;
void add_point(double x) { ++n; sum += x; }
double get_mean() const { return sum / n; }
};

// popular single-pass algorithm for calculating variance and mean
struct Variance {
int n = 0;
Expand Down
129 changes: 99 additions & 30 deletions prog/ecalc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,23 @@

#include <cstdio> // for printf, fprintf
#include <cstdlib> // for atof
#include "gemmi/binner.hpp"
#include "gemmi/mtz.hpp"
#include "gemmi/binner.hpp" // for Binner
#include "gemmi/mtz.hpp" // for Mtz
#include "gemmi/stats.hpp" // for Mean

#define GEMMI_PROG ecalc
#include "options.h"

namespace {

enum OptionIndex {
LabelF=4, LabelE, NoSigma
LabelF=4, LabelE, NoSigma, Method, BinSize
};

struct EcalcArg: public Arg {
static option::ArgStatus MethodChoice(const option::Option& option, bool msg) {
return Arg::Choice(option, msg, {"ma", "2", "3", "ec"});
}
};

const option::Descriptor Usage[] = {
Expand All @@ -27,11 +34,21 @@ const option::Descriptor Usage[] = {
" --E=LABEL \tLabel of the output column (default: E)." },
{ NoSigma, 0, "", "no-sigma", Arg::Required,
" --no-sigma \tDo not use sigma columns (SIGF->SIGE)." },
{ Method, 0, "", "method", EcalcArg::MethodChoice,
" --method=METHOD \tMethod of calculating <F^2>, one of:\n\t"
"ma - moving average,\n\t"
"2 - resolution bins equispaced in 1/d^2,\n\t"
"3 - resolution bins in 1/d^3 (default),\n\t"
"ec - bins with equal count of reflections." },
{ BinSize, 0, "n", "binsize", Arg::Int,
" -n, --binsize=N \tNumber of reflections per bin or in moving window\n\t"
"(default: 200)." },
{ NoOp, 0, "", "", Arg::None,
"\nColumn for SIGF is always the next column after F (the type must be Q)."
"\nIf INPUT.mtz has column E, it is replaced in OUTPUT.mtz."
"\nOtherwise, a new column is appended."
"\nThe name for SIGE, if it is added as a column, is 'SIG' + label of E." },
"\nThe name for SIGE, if it is added as a column, is 'SIG' + label of E."
"\nTo print the list of resolution shells add -vv." },
{ 0, 0, 0, 0, 0, 0 }
};

Expand All @@ -47,8 +64,8 @@ int GEMMI_MAIN(int argc, char **argv) {
const char* f_label = p.options[LabelF] ? p.options[LabelF].arg : "F";
const char* e_label = p.options[LabelE] ? p.options[LabelE].arg : "E";
bool use_sigma = !p.options[NoSigma];
bool verbose = p.options[Verbose];
if (verbose)
int verbose = p.options[Verbose].count();
if (verbose > 0)
std::fprintf(stderr, "Reading %s ...\n", input);
try {
Mtz mtz;
Expand All @@ -66,7 +83,7 @@ int GEMMI_MAIN(int argc, char **argv) {
gemmi::fail("Column ", e_label, " not followed by sigma column. Use --no-sigma.\n");
e_idx = (int)e->idx;
}
if (verbose)
if (verbose > 0)
std::fprintf(stderr, "%s column %s ...\n",
(e_idx >= 0 ? "Replacing existing" : "Adding"), e_label);
std::vector<std::string> trailing_cols(use_sigma ? 1 : 0);
Expand All @@ -86,34 +103,86 @@ int GEMMI_MAIN(int argc, char **argv) {
for (size_t n = 0; n < mtz.data.size(); n += mtz.columns.size())
if (!std::isnan(mtz.data[n + fcol_idx]))
++f_count;
int nbins = gemmi::iround(f_count / 200.);
if (verbose)
int binsize = 200;
if (p.options[BinSize])
binsize = std::atoi(p.options[BinSize].arg);
int nbins = gemmi::iround((double)f_count / binsize);
if (verbose > 0)
std::fprintf(stderr, "%d reflections, %d Fs, %d shells\n",
mtz.nreflections, f_count, nbins);
binner.setup(nbins, gemmi::Binner::Method::Dstar2, gemmi::MtzDataProxy{mtz});
std::vector<int> bin_index = binner.get_bins(gemmi::MtzDataProxy{mtz});
gemmi::GroupOps gops = mtz.spacegroup->operations();
std::vector<double> denom(binner.size(), 0.);
std::vector<int> count(denom.size(), 0);
if (verbose)
bool use_moving_average = false;
auto method = gemmi::Binner::Method::Dstar3;
if (p.options[Method])
switch (p.options[Method].arg[0]) {
case '2': method = gemmi::Binner::Method::Dstar3; break;
case 'e': method = gemmi::Binner::Method::EqualCount; break;
case 'm': use_moving_average = true; break;
}
if (verbose > 0)
std::fprintf(stderr, "Calculating E ...\n");
auto bin_num = bin_index.begin();
for (size_t n = 0; n < mtz.data.size(); n += mtz.columns.size(), bin_num++) {
gemmi::Miller hkl = mtz.get_hkl(n);
double f = mtz.data[n + fcol_idx];
if (std::isnan(f))
continue;
double f2 = f * f / gops.epsilon_factor(hkl);
mtz.data[n + ecol.idx] = std::sqrt(float(f2));
denom[*bin_num] += f2;
count[*bin_num]++;
gemmi::GroupOps gops = mtz.spacegroup->operations();
std::vector<double> computed_values(mtz.nreflections, NAN);
if (use_moving_average) {
//TODO
} else {
std::vector<double> mids = binner.setup_mid(nbins, method, gemmi::MtzDataProxy{mtz});
std::vector<double> inv_d2(mtz.nreflections);
for (size_t i = 0, n = 0; n < mtz.data.size(); n += mtz.columns.size(), ++i)
inv_d2[i] = mtz.cell.calculate_1_d2(mtz.get_hkl(n));
std::vector<int> bin_index = binner.get_bins_from_1_d2(inv_d2);
std::vector<gemmi::Mean> denom(binner.size());
for (size_t i = 0, n = 0; n < mtz.data.size(); n += mtz.columns.size(), i++) {
gemmi::Miller hkl = mtz.get_hkl(n);
double f = mtz.data[n + fcol_idx];
if (!std::isnan(f)) {
double inv_epsilon = 1.0 / gops.epsilon_factor(hkl);
double f2 = f * f * inv_epsilon;
computed_values[i] = std::sqrt(inv_epsilon);
denom[bin_index[i]].add_point(f2);
}
}
if (verbose > 1) {
// print shell statistics
std::vector<int> refl_counts(binner.size());
printf(" shell\t #F\t d\t <F^2>\t #refl\t mid d\n");
for (int idx : bin_index)
++refl_counts[idx];
for (size_t i = 0; i < binner.size(); ++i) {
double d = 1 / std::sqrt(binner.limits[i]);
double mid_d = 1 / std::sqrt(mids[i]);
printf("%6zu\t%6d\t%6.2f\t%7.1f\t%6d\t%6.2f\n",
i+1, denom[i].n, d, denom[i].get_mean(), refl_counts[i], mid_d);
}
printf("\n");
}
// re-use Mean::sum for RMS
for (gemmi::Mean& m : denom)
m.sum = std::sqrt(m.get_mean());
for (size_t i = 0; i < computed_values.size(); ++i) {
double x = inv_d2[i];
int bin = bin_index[i];
double rms = denom[bin].sum;
if (x > mids.front() && x < mids.back()) {
// linear interpolation in 1/d^2
if (x > mids[bin])
++bin;
double x0 = mids[bin - 1];
double x1 = mids[bin];
double y0 = denom[bin - 1].sum;
double y1 = denom[bin].sum;
assert(x0 <= x && x <= x1);
rms = y0 + (x - x0) * (y1 - y0) / (x1 - x0);
}
computed_values[i] /= rms;
}
}
for (size_t i = 0; i < denom.size(); ++i)
denom[i] = std::sqrt(denom[i] / count[i]);
for (size_t i = 0, n = 0; n < mtz.data.size(); n += mtz.columns.size(), ++i)
mtz.data[n + ecol.idx] /= (float) denom[bin_index[i]];
if (verbose)
if (verbose > 0)
std::fprintf(stderr, "Writing %s ...\n", output);
for (size_t i = 0, n = 0; n < mtz.data.size(); n += mtz.columns.size(), ++i) {
mtz.data[n + ecol.idx] *= float(computed_values[i]);
if (use_sigma)
mtz.data[n + ecol.idx + 1] *= float(computed_values[i]);
}
mtz.write_to_file(output);
} catch (std::runtime_error& e) {
std::fprintf(stderr, "ERROR: %s\n", e.what());
Expand Down

0 comments on commit 8e24ce1

Please sign in to comment.