Skip to content

Commit

Permalink
move calculation of E (normalized F) to the library
Browse files Browse the repository at this point in the history
add python bindings

modify Binner::setup()
  • Loading branch information
wojdyr committed Sep 6, 2023
1 parent 2c1c18e commit ccec255
Show file tree
Hide file tree
Showing 8 changed files with 178 additions and 109 deletions.
24 changes: 24 additions & 0 deletions docs/ecalc-help.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
$ gemmi ecalc -h
Usage:
gemmi ecalc [options] INPUT.mtz OUTPUT.mtz

Calculates E (normalised structure amplitudes) from F.
-h, --help Print usage and exit.
-V, --version Print version and exit.
-v, --verbose Verbose output.
--F=LABEL Label of the input column (default: F).
--E=LABEL Label of the output column (default: E).
--no-sigma Do not use sigma columns (SIGF->SIGE).
--method=METHOD Method of calculating <F^2>, one of:
ma - moving average (not implemented yet),
2 - resolution bins equispaced in 1/d^2,
3 - resolution bins in 1/d^3 (default),
ec - bins with equal count of reflections.
--binsize=N Number of reflections per bin or in moving window
(default: 200).
--maxbins=N Maximum number of bins (default: 100).

Column for SIGF is always the next column after F (the type must be Q).
If INPUT.mtz has column E, it is replaced in OUTPUT.mtz.
Otherwise, a new column is appended.
The name for SIGE, if it is added as a column, is 'SIG' + label of E.
3 changes: 3 additions & 0 deletions docs/headers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ gemmi/dirwalk.hpp
in an alphabetical order. It wraps the tinydir library (as we cannot
depend on C++17 <filesystem> yet).

gemmi/ecalc.hpp
Normalization of amplitudes F->E ("Karle" approach, similar to CCP4 ECALC).

gemmi/eig3.hpp
Eigen decomposition code for symmetric 3x3 matrices.

Expand Down
9 changes: 9 additions & 0 deletions docs/utils.rst
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,15 @@ Merge intensities from multi-record reflection file.
.. literalinclude:: merge-help.txt
:language: console

ecalc
=====

Calculates normalized amplitudes E from amplitudes F.
Uses "Karle" approach, similar to CCP4 ECALC.

.. literalinclude:: ecalc-help.txt
:language: console

.. _sfcalc:

sfcalc
Expand Down
38 changes: 19 additions & 19 deletions include/gemmi/binner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,27 +80,26 @@ struct Binner {
}

template<typename DataProxy>
int setup(int nbins, Method method, const DataProxy& proxy,
const UnitCell* cell_=nullptr) {
void setup(int nbins, Method method, const DataProxy& proxy,
const UnitCell* cell_=nullptr, bool with_mids=false, size_t col_idx=0) {
if (col_idx >= proxy.stride())
fail("wrong col_idx in Binner::setup()");
cell = cell_ ? *cell_ : proxy.unit_cell();
std::vector<double> inv_d2(proxy.size() / proxy.stride());
for (size_t i = 0, offset = 0; i < inv_d2.size(); ++i, offset += proxy.stride())
inv_d2[i] = cell.calculate_1_d2(proxy.get_hkl(offset));
return setup_from_1_d2(nbins, method, std::move(inv_d2), nullptr);
}

/// the same as setup(), but returns mid values of 1/d^2.
template<typename DataProxy>
std::vector<double> setup_mid(int nbins, Method method, const DataProxy& proxy,
const UnitCell* cell_=nullptr) {
setup(2 * nbins, method, proxy, cell_);
std::vector<double> mids(nbins);
for (int i = 0; i < nbins; ++i) {
mids[i] = limits[2*i];
limits[i] = limits[2*i+1];
std::vector<double> inv_d2;
inv_d2.reserve(proxy.size() / proxy.stride());
for (size_t offset = 0; offset < proxy.size(); offset += proxy.stride())
if (col_idx == 0 || !std::isnan(proxy.get_num(offset + col_idx)))
inv_d2.push_back(cell.calculate_1_d2(proxy.get_hkl(offset)));
setup_from_1_d2((with_mids ? 2 * nbins : nbins),
method, std::move(inv_d2), nullptr);
if (with_mids) {
mids.resize(nbins);
for (int i = 0; i < nbins; ++i) {
mids[i] = limits[2*i];
limits[i] = limits[2*i+1];
}
limits.resize(nbins);
}
limits.resize(nbins);
return mids;
}

void ensure_limits_are_set() const {
Expand Down Expand Up @@ -172,6 +171,7 @@ struct Binner {
double min_1_d2;
double max_1_d2;
std::vector<double> limits; // upper limit of each bin
std::vector<double> mids; // the middle of each bin
};

inline Correlation combine_two_correlations(const Correlation& a, const Correlation& b) {
Expand Down
97 changes: 97 additions & 0 deletions include/gemmi/ecalc.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
// Copyright 2023 Global Phasing Ltd.
//
// Normalization of amplitudes F->E ("Karle" approach, similar to CCP4 ECALC).

#ifndef GEMMI_ECALC_HPP_
#define GEMMI_ECALC_HPP_

#include <cassert>
#include "binner.hpp"

namespace gemmi {

template<typename DataProxy>
std::vector<double> calculate_amplitude_normalizers(const DataProxy& data, int fcol_idx,
const Binner& binner) {
struct CountAndSum {
int n = 0;
double sum = 0.;
};
int nreflections = data.size() / data.stride();
std::vector<double> multipliers(nreflections, NAN);
if (data.spacegroup() == nullptr)
gemmi::fail("unknown space group in the data file");
GroupOps gops = data.spacegroup()->operations();
std::vector<double> inv_d2(multipliers.size());
for (size_t i = 0, n = 0; n < data.size(); n += data.stride(), ++i)
inv_d2[i] = data.unit_cell().calculate_1_d2(data.get_hkl(n));
std::vector<int> bin_index = binner.get_bins_from_1_d2(inv_d2);
std::vector<CountAndSum> stats(binner.size());
for (size_t i = 0, n = 0; n < data.size(); n += data.stride(), i++) {
Miller hkl = data.get_hkl(n);
double f = data.get_num(n + fcol_idx);
if (!std::isnan(f)) {
int epsilon = gops.epsilon_factor(hkl);
double inv_epsilon = 1.0 / epsilon;
double f2 = f * f * inv_epsilon;
multipliers[i] = std::sqrt(inv_epsilon);
CountAndSum& cs = stats[bin_index[i]];
cs.n++;
cs.sum += f2;
}
}

// simple smoothing with kernel [0.75 1 0.75]
std::vector<double> smoothed(stats.size());
{
const double k = 0.75;
smoothed[0] = (stats[0].sum + k * stats[1].sum) / (stats[0].n + k * stats[1].n);
size_t n = stats.size() - 1;
for (size_t i = 1; i < n; ++i)
smoothed[i] = (stats[i].sum + k * (stats[i-1].sum + stats[i+1].sum))
/ (stats[i].n + k * (stats[i-1].n + stats[i+1].n));
smoothed[n] = (stats[n].sum + k * stats[n-1].sum) / (stats[n].n + k * stats[n-1].n);
}

#if 0
{
// print shell statistics
std::vector<int> refl_counts(binner.size());
printf(" shell\t #F\t d\t <F^2>\tsmoothd\t #refl\t mid d\n");
for (int idx : bin_index)
++refl_counts[idx];
for (size_t i = 0; i < binner.size(); ++i) {
double d = 1 / std::sqrt(binner.limits[i]);
double mid_d = 1 / std::sqrt(binner.mids[i]);
double avg_f2 = stats[i].sum / stats[i].n;
printf("%6zu\t%6d\t%7.3f\t%7.0f\t%7.0f\t%6d\t%7.3f\n",
i+1, stats[i].n, d, avg_f2, smoothed[i], refl_counts[i], mid_d);
}
printf("\n");
}
#endif

for (double& x : smoothed)
x = std::sqrt(x);
for (size_t i = 0; i < multipliers.size(); ++i) {
double x = inv_d2[i];
int bin = bin_index[i];
double rms = smoothed[bin];
if (x > binner.mids.front() && x < binner.mids.back()) {
// linear interpolation in 1/d^2
if (x > binner.mids[bin])
++bin;
double x0 = binner.mids[bin - 1];
double x1 = binner.mids[bin];
double y0 = smoothed[bin - 1];
double y1 = smoothed[bin];
assert(x0 <= x && x <= x1);
rms = y0 + (x - x0) * (y1 - y0) / (x1 - x0);
}
multipliers[i] /= rms;
}
return multipliers;
}

} // namespace gemmi
#endif
7 changes: 0 additions & 7 deletions include/gemmi/stats.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,6 @@

namespace gemmi {

struct Mean {
int n = 0;
double sum = 0.;
void add_point(double x) { ++n; sum += x; }
double get_mean() const { return sum / n; }
};

// popular single-pass algorithm for calculating variance and mean
struct Variance {
int n = 0;
Expand Down
93 changes: 13 additions & 80 deletions prog/ecalc.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
// Copyright 2023 Global Phasing Ltd.

#include <cstdio> // for printf, fprintf
#include <cstdlib> // for atof
#include "gemmi/binner.hpp" // for Binner
#include "gemmi/mtz.hpp" // for Mtz
#include "gemmi/stats.hpp" // for Mean
#include "gemmi/ecalc.hpp" // for Mean

#define GEMMI_PROG ecalc
#include "options.h"
Expand All @@ -17,7 +16,7 @@ enum OptionIndex {

struct EcalcArg: public Arg {
static option::ArgStatus MethodChoice(const option::Option& option, bool msg) {
return Arg::Choice(option, msg, {"ma", "2", "3", "ec"});
return Arg::Choice(option, msg, {/*"ma",*/ "2", "3", "ec"});
}
};

Expand Down Expand Up @@ -49,8 +48,7 @@ const option::Descriptor Usage[] = {
"\nColumn for SIGF is always the next column after F (the type must be Q)."
"\nIf INPUT.mtz has column E, it is replaced in OUTPUT.mtz."
"\nOtherwise, a new column is appended."
"\nThe name for SIGE, if it is added as a column, is 'SIG' + label of E."
"\nTo print the list of resolution shells add -vv." },
"\nThe name for SIGE, if it is added as a column, is 'SIG' + label of E." },
{ 0, 0, 0, 0, 0, 0 }
};

Expand All @@ -72,8 +70,6 @@ int GEMMI_MAIN(int argc, char **argv) {
try {
Mtz mtz;
mtz.read_file_gz(input);
if (mtz.spacegroup == nullptr)
gemmi::fail("Unknown space group of the input data: ", input);
const Mtz::Column* fcol = &mtz.get_column_with_label(f_label);
if (use_sigma && !fcol->get_next_column_if_type('Q'))
gemmi::fail("Column ", f_label, " not followed by sigma column. Use --no-sigma.\n");
Expand All @@ -100,7 +96,6 @@ int GEMMI_MAIN(int argc, char **argv) {
}
if (!mtz.has_data()) // shouldn't happen
gemmi::fail("no data");
gemmi::Binner binner;
int f_count = 0;
for (size_t n = 0; n < mtz.data.size(); n += mtz.columns.size())
if (!std::isnan(mtz.data[n + fcol_idx]))
Expand All @@ -114,90 +109,28 @@ int GEMMI_MAIN(int argc, char **argv) {
mtz.nreflections, f_count, nbins);
if (nbins < 3)
gemmi::fail("not enough resolution bins");
bool use_moving_average = false;
//bool use_moving_average = false;
auto method = gemmi::Binner::Method::Dstar3;
if (p.options[Method])
switch (p.options[Method].arg[0]) {
case '2': method = gemmi::Binner::Method::Dstar3; break;
case '2': method = gemmi::Binner::Method::Dstar2; break;
case 'e': method = gemmi::Binner::Method::EqualCount; break;
case 'm': use_moving_average = true; break;
//case 'm': use_moving_average = true; break;
}
if (verbose > 0)
std::fprintf(stderr, "Calculating E ...\n");
gemmi::GroupOps gops = mtz.spacegroup->operations();
std::vector<double> computed_values(mtz.nreflections, NAN);
if (use_moving_average) {
//TODO
} else {
std::vector<double> mids = binner.setup_mid(nbins, method, gemmi::MtzDataProxy{mtz});
std::vector<double> inv_d2(mtz.nreflections);
for (size_t i = 0, n = 0; n < mtz.data.size(); n += mtz.columns.size(), ++i)
inv_d2[i] = mtz.cell.calculate_1_d2(mtz.get_hkl(n));
std::vector<int> bin_index = binner.get_bins_from_1_d2(inv_d2);
std::vector<gemmi::Mean> denom(binner.size());
for (size_t i = 0, n = 0; n < mtz.data.size(); n += mtz.columns.size(), i++) {
gemmi::Miller hkl = mtz.get_hkl(n);
double f = mtz.data[n + fcol_idx];
if (!std::isnan(f)) {
double inv_epsilon = 1.0 / gops.epsilon_factor(hkl);
double f2 = f * f * inv_epsilon;
computed_values[i] = std::sqrt(inv_epsilon);
denom[bin_index[i]].add_point(f2);
}
}

// simple smoothing with kernel [0.75 1 0.75]
std::vector<double> smoothed(denom.size());
{
const double k = 0.75;
smoothed[0] = (denom[0].sum + k * denom[1].sum) / (denom[0].n + k * denom[1].n);
size_t n = denom.size() - 1;
for (size_t i = 1; i < n; ++i)
smoothed[i] = (denom[i].sum + k * (denom[i-1].sum + denom[i+1].sum))
/ (denom[i].n + k * (denom[i-1].n + denom[i+1].n));
smoothed[n] = (denom[n].sum + k * denom[n-1].sum) / (denom[n].n + k * denom[n-1].n);
}
gemmi::Binner binner;
gemmi::MtzDataProxy data_proxy{mtz};
binner.setup(nbins, method, data_proxy, nullptr, /*with_mids=*/true, fcol_idx);
std::vector<double> multipliers
= gemmi::calculate_amplitude_normalizers(data_proxy, fcol_idx, binner);

if (verbose > 1) {
// print shell statistics
std::vector<int> refl_counts(binner.size());
printf(" shell\t #F\t d\t <F^2>\tsmoothd\t #refl\t mid d\n");
for (int idx : bin_index)
++refl_counts[idx];
for (size_t i = 0; i < binner.size(); ++i) {
double d = 1 / std::sqrt(binner.limits[i]);
double mid_d = 1 / std::sqrt(mids[i]);
printf("%6zu\t%6d\t%6.2f\t%7.0f\t%7.0f\t%6d\t%6.2f\n",
i+1, denom[i].n, d, denom[i].get_mean(), smoothed[i], refl_counts[i], mid_d);
}
printf("\n");
}
for (double& x : smoothed)
x = std::sqrt(x);
for (size_t i = 0; i < computed_values.size(); ++i) {
double x = inv_d2[i];
int bin = bin_index[i];
double rms = smoothed[bin];
if (x > mids.front() && x < mids.back()) {
// linear interpolation in 1/d^2
if (x > mids[bin])
++bin;
double x0 = mids[bin - 1];
double x1 = mids[bin];
double y0 = smoothed[bin - 1];
double y1 = smoothed[bin];
assert(x0 <= x && x <= x1);
rms = y0 + (x - x0) * (y1 - y0) / (x1 - x0);
}
computed_values[i] /= rms;
}
}
if (verbose > 0)
std::fprintf(stderr, "Writing %s ...\n", output);
for (size_t i = 0, n = 0; n < mtz.data.size(); n += mtz.columns.size(), ++i) {
mtz.data[n + ecol.idx] *= float(computed_values[i]);
mtz.data[n + ecol.idx] *= float(multipliers[i]);
if (use_sigma)
mtz.data[n + ecol.idx + 1] *= float(computed_values[i]);
mtz.data[n + ecol.idx + 1] *= float(multipliers[i]);
}
mtz.write_to_file(output);
} catch (std::runtime_error& e) {
Expand Down
Loading

0 comments on commit ccec255

Please sign in to comment.